From 2f0f267ea0720ec6adbe9cf7386450425fac8258 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 5 Mar 2015 10:49:19 +1030 Subject: cpumask: remove deprecated functions. Using these functions with offstack cpus is unsafe. They use all NR_CPUS bits, unstead of nr_cpumask_bits. In particular, lustre (in staging) used cpus_ and that caused a bug. Reported-by: Oleg Drokin Signed-off-by: Rusty Russell --- lib/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index 87da53bb1fef..47d262b3251e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -397,10 +397,6 @@ config CPUMASK_OFFSTACK them on the stack. This is a bit more expensive, but avoids stack overflow. -config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS - bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS - depends on BROKEN - config CPU_RMAP bool depends on SMP -- cgit v1.2.3 From cdfdef75e795fb5ab76c66f3329e509f3ab8b9b5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 5 Mar 2015 10:49:19 +1030 Subject: cpumask: only allocate nr_cpumask_bits. Now we'll find out the hard way if anyone has CPUMASK_OFFSTACK and is returning these or assigning them. Signed-off-by: Rusty Russell --- include/linux/cpumask.h | 5 ++--- lib/cpumask.c | 7 ------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 646fadee5caf..4ad2d3c8e21f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -11,6 +11,7 @@ #include #include +/* Don't assign or return these: may not be this big! */ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; /** @@ -609,9 +610,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) */ static inline size_t cpumask_size(void) { - /* FIXME: Once all cpumask assignments are eliminated, this - * can be nr_cpumask_bits */ - return BITS_TO_LONGS(NR_CPUS) * sizeof(long); + return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } /* diff --git a/lib/cpumask.c b/lib/cpumask.c index b6513a9f2892..ba379d12bb57 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -89,13 +89,6 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) dump_stack(); } #endif - /* FIXME: Bandaid to save us from old primitives which go to NR_CPUS. */ - if (*mask) { - unsigned char *ptr = (unsigned char *)cpumask_bits(*mask); - unsigned int tail; - tail = BITS_TO_LONGS(NR_CPUS - nr_cpumask_bits) * sizeof(long); - memset(ptr + cpumask_size() - tail, 0, tail); - } return *mask != NULL; } -- cgit v1.2.3 From 34644524bce91883d5051a7eaf3ec5464ed149bf Mon Sep 17 00:00:00 2001 From: Abhilash Kesavan Date: Fri, 6 Feb 2015 19:15:27 +0530 Subject: lib: devres: add a helper function for ioremap_wc Implement a resource managed writecombine ioremap function. 
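For illustration only, a driver could use the new helper roughly as follows; the platform device, resource index and error codes in this sketch are assumptions, not part of the patch:

    static int foo_probe(struct platform_device *pdev)
    {
            struct resource *res;
            void __iomem *base;

            /* hypothetical MMIO region suited to write-combining, e.g. a framebuffer */
            res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
            if (!res)
                    return -ENODEV;

            /* mapping is released automatically when the driver detaches */
            base = devm_ioremap_wc(&pdev->dev, res->start, resource_size(res));
            if (!base)
                    return -ENOMEM;

            return 0;
    }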
Signed-off-by: Abhilash Kesavan Acked-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/devres.txt | 1 + include/linux/io.h | 2 ++ lib/devres.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'lib') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 6d1e8eeb5990..7fe7fd263aba 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -276,6 +276,7 @@ IOMAP devm_ioport_unmap() devm_ioremap() devm_ioremap_nocache() + devm_ioremap_wc() devm_ioremap_resource() : checks resource, requests memory region, ioremaps devm_iounmap() pcim_iomap() diff --git a/include/linux/io.h b/include/linux/io.h index fa02e55e5a2e..42b33f03d1df 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -64,6 +64,8 @@ void __iomem *devm_ioremap(struct device *dev, resource_size_t offset, resource_size_t size); void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset, resource_size_t size); +void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset, + resource_size_t size); void devm_iounmap(struct device *dev, void __iomem *addr); int check_signature(const volatile void __iomem *io_addr, const unsigned char *signature, int length); diff --git a/lib/devres.c b/lib/devres.c index 0f1dd2e9d2c1..fbe2aac522e6 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -71,6 +71,34 @@ void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset, } EXPORT_SYMBOL(devm_ioremap_nocache); +/** + * devm_ioremap_wc - Managed ioremap_wc() + * @dev: Generic device to remap IO address for + * @offset: BUS offset to map + * @size: Size of map + * + * Managed ioremap_wc(). Map is automatically unmapped on driver detach. + */ +void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = ioremap_wc(offset, size); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_ioremap_wc); + /** * devm_iounmap - Managed iounmap() * @dev: Generic device to unmap for -- cgit v1.2.3 From b9f28d863594c429e1df35a0474d2663ca28b307 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 5 Mar 2015 18:47:01 -0800 Subject: sd, mmc, virtio_blk, string_helpers: fix block size units The current string_get_size() overflows when the device size goes over 2^64 bytes because the string helper routine computes the suffix from the size in bytes. However, the entirety of Linux thinks in terms of blocks, not bytes, so this will artificially induce an overflow on very large devices. Fix this by making the function string_get_size() take blocks and the block size instead of bytes. This should allow us to keep working until the current SCSI standard overflows. Also fix virtio_blk and mmc (both of which were also artificially multiplying by the block size to pass a byte side to string_get_size()). The mathematics of this is pretty simple: we're taking a product of size in blocks (S) and block size (B) and trying to re-express this in exponential form: S*B = R*N^E (where N, the exponent is either 1000 or 1024) and R < N. Mathematically, S = RS*N^ES and B=RB*N^EB, so if RS*RB < N it's easy to see that S*B = RS*RB*N^(ES+EB). 
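As a worked illustration (numbers chosen for exposition, not taken from the patch): with N = 1000, a device of S = 10^9 blocks = 1*1000^3 (RS = 1, ES = 3) using B = 512-byte blocks = 512*1000^0 (RB = 512, EB = 0) gives RS*RB = 512 < N, so S*B = 512*1000^(3+0) and the size is reported as 512 GB without the full byte count ever being computed.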
However, if RS*BS > N, we can see that this can be re-expressed as RS*BS = R*N (where R = RS*BS/N < N) so the whole exponent becomes R*N^(ES+EB+1) [jejb: fix incorrect 32 bit do_div spotted by kbuild test robot ] Acked-by: Ulf Hansson Reviewed-by: Andrew Morton Signed-off-by: James Bottomley --- drivers/block/virtio_blk.c | 9 +++--- drivers/mmc/card/block.c | 4 +-- drivers/scsi/sd.c | 8 ++--- include/linux/string_helpers.h | 2 +- lib/string_helpers.c | 68 +++++++++++++++++++++++++++++------------- 5 files changed, 60 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 655e570b9b31..5ea2f0bbbc7c 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -342,7 +342,7 @@ static void virtblk_config_changed_work(struct work_struct *work) struct request_queue *q = vblk->disk->queue; char cap_str_2[10], cap_str_10[10]; char *envp[] = { "RESIZE=1", NULL }; - u64 capacity, size; + u64 capacity; /* Host must always specify the capacity. */ virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); @@ -354,9 +354,10 @@ static void virtblk_config_changed_work(struct work_struct *work) capacity = (sector_t)-1; } - size = capacity * queue_logical_block_size(q); - string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); - string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); + string_get_size(capacity, queue_logical_block_size(q), + STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); + string_get_size(capacity, queue_logical_block_size(q), + STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); dev_notice(&vdev->dev, "new size: %llu %d-byte logical blocks (%s/%s)\n", diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c69afb5e264e..2fc426926574 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2230,7 +2230,7 @@ static int mmc_blk_alloc_part(struct mmc_card *card, part_md->part_type = part_type; list_add(&part_md->part, &md->part); - string_get_size((u64)get_capacity(part_md->disk) << 9, STRING_UNITS_2, + string_get_size((u64)get_capacity(part_md->disk), 512, STRING_UNITS_2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s partition %u %s\n", part_md->disk->disk_name, mmc_card_id(card), @@ -2436,7 +2436,7 @@ static int mmc_blk_probe(struct device *dev) if (IS_ERR(md)) return PTR_ERR(md); - string_get_size((u64)get_capacity(md->disk) << 9, STRING_UNITS_2, + string_get_size((u64)get_capacity(md->disk), 512, STRING_UNITS_2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s %s %s\n", md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 9e0c63e57aff..dcc42446f58a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2211,11 +2211,11 @@ got_data: { char cap_str_2[10], cap_str_10[10]; - u64 sz = (u64)sdkp->capacity << ilog2(sector_size); - string_get_size(sz, STRING_UNITS_2, cap_str_2, - sizeof(cap_str_2)); - string_get_size(sz, STRING_UNITS_10, cap_str_10, + string_get_size(sdkp->capacity, sector_size, + STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); + string_get_size(sdkp->capacity, sector_size, + STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); if (sdkp->first_scan || old_capacity != sdkp->capacity) { diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 657571817260..263328063730 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,7 +10,7 @@ enum string_size_units { STRING_UNITS_2, /* use binary powers of 2^10 */ }; -void 
string_get_size(u64 size, enum string_size_units units, +void string_get_size(u64 size, u64 blk_size, enum string_size_units units, char *buf, int len); #define UNESCAPE_SPACE 0x01 diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 8f8c4417f228..4a913ec3acf9 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -4,6 +4,7 @@ * Copyright 31 August 2008 James Bottomley * Copyright (C) 2013, Intel Corporation */ +#include #include #include #include @@ -14,7 +15,8 @@ /** * string_get_size - get the size in the specified units - * @size: The size to be converted + * @size: The size to be converted in blocks + * @blk_size: Size of the block (use 1 for size in bytes) * @units: units to use (powers of 1000 or 1024) * @buf: buffer to format to * @len: length of buffer @@ -24,14 +26,14 @@ * at least 9 bytes and will always be zero terminated. * */ -void string_get_size(u64 size, const enum string_size_units units, +void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len) { static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB" + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" }; static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB" + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, @@ -42,31 +44,57 @@ void string_get_size(u64 size, const enum string_size_units units, [STRING_UNITS_2] = 1024, }; int i, j; - u32 remainder = 0, sf_cap; + u32 remainder = 0, sf_cap, exp; char tmp[8]; + const char *unit; tmp[0] = '\0'; i = 0; - if (size >= divisor[units]) { - while (size >= divisor[units]) { - remainder = do_div(size, divisor[units]); - i++; - } + if (!size) + goto out; - sf_cap = size; - for (j = 0; sf_cap*10 < 1000; j++) - sf_cap *= 10; + while (blk_size >= divisor[units]) { + remainder = do_div(blk_size, divisor[units]); + i++; + } - if (j) { - remainder *= 1000; - remainder /= divisor[units]; - snprintf(tmp, sizeof(tmp), ".%03u", remainder); - tmp[j+1] = '\0'; - } + exp = divisor[units] / (u32)blk_size; + if (size >= exp) { + remainder = do_div(size, divisor[units]); + remainder *= blk_size; + i++; + } else { + remainder *= size; + } + + size *= blk_size; + size += remainder / divisor[units]; + remainder %= divisor[units]; + + while (size >= divisor[units]) { + remainder = do_div(size, divisor[units]); + i++; } + sf_cap = size; + for (j = 0; sf_cap*10 < 1000; j++) + sf_cap *= 10; + + if (j) { + remainder *= 1000; + remainder /= divisor[units]; + snprintf(tmp, sizeof(tmp), ".%03u", remainder); + tmp[j+1] = '\0'; + } + + out: + if (i >= ARRAY_SIZE(units_2)) + unit = "UNK"; + else + unit = units_str[units][i]; + snprintf(buf, len, "%u%s %s", (u32)size, - tmp, units_str[units][i]); + tmp, unit); } EXPORT_SYMBOL(string_get_size); -- cgit v1.2.3 From 10b88a4b17d31a7409494b179dcb76e7ab2fcaea Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 12 Mar 2015 20:02:35 -0400 Subject: sparc: Break up monolithic iommu table/lock into finer graularity pools and lock Investigation of multithreaded iperf experiments on an ethernet interface show the iommu->lock as the hottest lock identified by lockstat, with something of the order of 21M contentions out of 27M acquisitions, and an average wait time of 26 us for the lock. This is not efficient. 
A more scalable design is to follow the ppc model, where the iommu_table has multiple pools, each stretching over a segment of the map, and with a separate lock for each pool. This model allows for better parallelization of the iommu map search. This patch adds the iommu range alloc/free function infrastructure. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/linux/iommu-common.h | 55 +++++++++++ lib/Makefile | 2 +- lib/iommu-common.c | 220 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 include/linux/iommu-common.h create mode 100644 lib/iommu-common.c (limited to 'lib') diff --git a/include/linux/iommu-common.h b/include/linux/iommu-common.h new file mode 100644 index 000000000000..6be5c863f329 --- /dev/null +++ b/include/linux/iommu-common.h @@ -0,0 +1,55 @@ +#ifndef _LINUX_IOMMU_COMMON_H +#define _LINUX_IOMMU_COMMON_H + +#include +#include +#include + +#define IOMMU_POOL_HASHBITS 4 +#define IOMMU_NR_POOLS (1 << IOMMU_POOL_HASHBITS) + +struct iommu_pool { + unsigned long start; + unsigned long end; + unsigned long hint; + spinlock_t lock; +}; + +struct iommu_table; + +struct iommu_tbl_ops { + unsigned long (*cookie_to_index)(u64, void *); + void (*demap)(void *, unsigned long, unsigned long); + void (*reset)(struct iommu_table *); +}; + +struct iommu_table { + unsigned long page_table_map_base; + unsigned long page_table_shift; + unsigned long nr_pools; + const struct iommu_tbl_ops *iommu_tbl_ops; + unsigned long poolsize; + struct iommu_pool arena_pool[IOMMU_NR_POOLS]; + u32 flags; +#define IOMMU_HAS_LARGE_POOL 0x00000001 + struct iommu_pool large_pool; + unsigned long *map; +}; + +extern void iommu_tbl_pool_init(struct iommu_table *iommu, + unsigned long num_entries, + u32 page_table_shift, + const struct iommu_tbl_ops *iommu_tbl_ops, + bool large_pool, u32 npools); + +extern unsigned long iommu_tbl_range_alloc(struct device *dev, + struct iommu_table *iommu, + unsigned long npages, + unsigned long *handle, + unsigned int pool_hash); + +extern void iommu_tbl_range_free(struct iommu_table *iommu, + u64 dma_addr, unsigned long npages, + bool do_demap, void *demap_arg); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 58f74d2dd396..60c22e65b793 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -106,7 +106,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o +obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o diff --git a/lib/iommu-common.c b/lib/iommu-common.c new file mode 100644 index 000000000000..7583f9b7846b --- /dev/null +++ b/lib/iommu-common.c @@ -0,0 +1,220 @@ +/* + * IOMMU mmap management and range allocation functions. + * Based almost entirely upon the powerpc iommu allocator. + */ + +#include +#include +#include +#include +#include +#include + +#define IOMMU_LARGE_ALLOC 15 + +/* + * Initialize iommu_pool entries for the iommu_table. `num_entries' + * is the number of table entries. If `large_pool' is set to true, + * the top 1/4 of the table will be set aside for pool allocations + * of more than IOMMU_LARGE_ALLOC pages. 
+ */ +extern void iommu_tbl_pool_init(struct iommu_table *iommu, + unsigned long num_entries, + u32 page_table_shift, + const struct iommu_tbl_ops *iommu_tbl_ops, + bool large_pool, u32 npools) +{ + unsigned int start, i; + struct iommu_pool *p = &(iommu->large_pool); + + if (npools == 0) + iommu->nr_pools = IOMMU_NR_POOLS; + else + iommu->nr_pools = npools; + BUG_ON(npools > IOMMU_NR_POOLS); + + iommu->page_table_shift = page_table_shift; + iommu->iommu_tbl_ops = iommu_tbl_ops; + start = 0; + if (large_pool) + iommu->flags |= IOMMU_HAS_LARGE_POOL; + + if (!large_pool) + iommu->poolsize = num_entries/iommu->nr_pools; + else + iommu->poolsize = (num_entries * 3 / 4)/iommu->nr_pools; + for (i = 0; i < iommu->nr_pools; i++) { + spin_lock_init(&(iommu->arena_pool[i].lock)); + iommu->arena_pool[i].start = start; + iommu->arena_pool[i].hint = start; + start += iommu->poolsize; /* start for next pool */ + iommu->arena_pool[i].end = start - 1; + } + if (!large_pool) + return; + /* initialize large_pool */ + spin_lock_init(&(p->lock)); + p->start = start; + p->hint = p->start; + p->end = num_entries; +} +EXPORT_SYMBOL(iommu_tbl_pool_init); + +unsigned long iommu_tbl_range_alloc(struct device *dev, + struct iommu_table *iommu, + unsigned long npages, + unsigned long *handle, + unsigned int pool_hash) +{ + unsigned long n, end, start, limit, boundary_size; + struct iommu_pool *arena; + int pass = 0; + unsigned int pool_nr; + unsigned int npools = iommu->nr_pools; + unsigned long flags; + bool large_pool = ((iommu->flags & IOMMU_HAS_LARGE_POOL) != 0); + bool largealloc = (large_pool && npages > IOMMU_LARGE_ALLOC); + unsigned long shift; + + /* Sanity check */ + if (unlikely(npages == 0)) { + printk_ratelimited("npages == 0\n"); + return DMA_ERROR_CODE; + } + + if (largealloc) { + arena = &(iommu->large_pool); + spin_lock_irqsave(&arena->lock, flags); + pool_nr = 0; /* to keep compiler happy */ + } else { + /* pick out pool_nr */ + pool_nr = pool_hash & (npools - 1); + arena = &(iommu->arena_pool[pool_nr]); + + /* find first available unlocked pool */ + while (!spin_trylock_irqsave(&(arena->lock), flags)) { + pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); + arena = &(iommu->arena_pool[pool_nr]); + } + } + + again: + if (pass == 0 && handle && *handle && + (*handle >= arena->start) && (*handle < arena->end)) + start = *handle; + else + start = arena->hint; + + limit = arena->end; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the beginning and flush. + */ + if (start >= limit) { + start = arena->start; + if (iommu->iommu_tbl_ops->reset != NULL) + iommu->iommu_tbl_ops->reset(iommu); + } + + if (dev) + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + 1 << iommu->page_table_shift); + else + boundary_size = ALIGN(1UL << 32, 1 << iommu->page_table_shift); + + shift = iommu->page_table_map_base >> iommu->page_table_shift; + boundary_size = boundary_size >> iommu->page_table_shift; + /* + * if the iommu has a non-trivial cookie <-> index mapping, we set + * things up so that iommu_is_span_boundary() merely checks if the + * (index + npages) < num_tsb_entries + */ + if (iommu->iommu_tbl_ops->cookie_to_index != NULL) { + shift = 0; + boundary_size = iommu->poolsize * iommu->nr_pools; + } + n = iommu_area_alloc(iommu->map, limit, start, npages, shift, + boundary_size, 0); + if (n == -1) { + if (likely(pass == 0)) { + /* First failure, rescan from the beginning. 
*/ + arena->hint = arena->start; + if (iommu->iommu_tbl_ops->reset != NULL) + iommu->iommu_tbl_ops->reset(iommu); + pass++; + goto again; + } else if (!largealloc && pass <= iommu->nr_pools) { + spin_unlock(&(arena->lock)); + pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); + arena = &(iommu->arena_pool[pool_nr]); + while (!spin_trylock(&(arena->lock))) { + pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); + arena = &(iommu->arena_pool[pool_nr]); + } + arena->hint = arena->start; + pass++; + goto again; + } else { + /* give up */ + spin_unlock_irqrestore(&(arena->lock), flags); + return DMA_ERROR_CODE; + } + } + + end = n + npages; + + arena->hint = end; + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + spin_unlock_irqrestore(&(arena->lock), flags); + + return n; +} +EXPORT_SYMBOL(iommu_tbl_range_alloc); + +static struct iommu_pool *get_pool(struct iommu_table *tbl, + unsigned long entry) +{ + struct iommu_pool *p; + unsigned long largepool_start = tbl->large_pool.start; + bool large_pool = ((tbl->flags & IOMMU_HAS_LARGE_POOL) != 0); + + /* The large pool is the last pool at the top of the table */ + if (large_pool && entry >= largepool_start) { + p = &tbl->large_pool; + } else { + unsigned int pool_nr = entry / tbl->poolsize; + + BUG_ON(pool_nr >= tbl->nr_pools); + p = &tbl->arena_pool[pool_nr]; + } + return p; +} + +void iommu_tbl_range_free(struct iommu_table *iommu, u64 dma_addr, + unsigned long npages, bool do_demap, void *demap_arg) +{ + unsigned long entry; + struct iommu_pool *pool; + unsigned long flags; + unsigned long shift = iommu->page_table_shift; + + if (iommu->iommu_tbl_ops->cookie_to_index != NULL) { + entry = (*iommu->iommu_tbl_ops->cookie_to_index)(dma_addr, + demap_arg); + } else { + entry = (dma_addr - iommu->page_table_map_base) >> shift; + } + pool = get_pool(iommu, entry); + + spin_lock_irqsave(&(pool->lock), flags); + if (do_demap && iommu->iommu_tbl_ops->demap != NULL) + (*iommu->iommu_tbl_ops->demap)(demap_arg, entry, npages); + + bitmap_clear(iommu->map, entry, npages); + spin_unlock_irqrestore(&(pool->lock), flags); +} +EXPORT_SYMBOL(iommu_tbl_range_free); -- cgit v1.2.3 From 2c57a0e233d72f8c2e2404560dcf0188ac3cf5d7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:43:13 -0700 Subject: lib: find_*_bit reimplementation This patchset does rework to find_bit function family to achieve better performance, and decrease size of text. All rework is done in patch 1. Patches 2 and 3 are about code moving and renaming. It was boot-tested on x86_64 and MIPS (big-endian) machines. Performance tests were ran on userspace with code like this: /* addr[] is filled from /dev/urandom */ start = clock(); while (ret < nbits) ret = find_next_bit(addr, nbits, ret + 1); end = clock(); printf("%ld\t", (unsigned long) end - start); On Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz measurements are: (for find_next_bit, nbits is 8M, for find_first_bit - 80K) find_next_bit: find_first_bit: new current new current 26932 43151 14777 14925 26947 43182 14521 15423 26507 43824 15053 14705 27329 43759 14473 14777 26895 43367 14847 15023 26990 43693 15103 15163 26775 43299 15067 15232 27282 42752 14544 15121 27504 43088 14644 14858 26761 43856 14699 15193 26692 43075 14781 14681 27137 42969 14451 15061 ... ... find_next_bit performance gain is 35-40%; find_first_bit - no measurable difference. On ARM machine, there is arch-specific implementation for find_bit. Thanks a lot to George Spelvin and Rasmus Villemoes for hints and helpful discussions. 
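The timing loop quoted earlier can be turned into a self-contained userspace harness along these lines. This is only a sketch: the bitmap size, the rand()-based fill (the author used /dev/urandom) and the naive stand-in find_next_bit are assumptions; in the real measurements the kernel implementation under test was compiled into the program instead.

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define BITS_PER_LONG   (8 * sizeof(unsigned long))
    #define NBITS           (8UL << 20)     /* 8M bits, as in the find_next_bit numbers above */
    #define NWORDS          (NBITS / BITS_PER_LONG)

    /* naive stand-in; replace with the old or new kernel implementation under test */
    static unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                                       unsigned long offset)
    {
            for (; offset < size; offset++)
                    if (addr[offset / BITS_PER_LONG] & (1UL << (offset % BITS_PER_LONG)))
                            return offset;
            return size;
    }

    int main(void)
    {
            static unsigned long addr[NWORDS];
            unsigned char *p = (unsigned char *)addr;
            unsigned long ret = 0;
            clock_t start, end;
            size_t i;

            for (i = 0; i < sizeof(addr); i++)  /* the original fills addr[] from /dev/urandom */
                    p[i] = rand();

            start = clock();
            while (ret < NBITS)
                    ret = find_next_bit(addr, NBITS, ret + 1);
            end = clock();

            printf("%ld\n", (long)(end - start));
            return 0;
    }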
This patch (of 3): New implementations takes less space in source file (see diffstat) and in object. For me it's 710 vs 453 bytes of text. It also shows better performance. find_last_bit description fixed due to obvious typo. [akpm@linux-foundation.org: include linux/bitmap.h, per Rasmus] Signed-off-by: Yury Norov Reviewed-by: Rasmus Villemoes Reviewed-by: George Spelvin Cc: Alexey Klimov Cc: David S. Miller Cc: Daniel Borkmann Cc: Hannes Frederic Sowa Cc: Lai Jiangshan Cc: Mark Salter Cc: AKASHI Takahiro Cc: Thomas Graf Cc: Valentin Rothberg Cc: Chris Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 4 +- lib/find_last_bit.c | 36 +++---- lib/find_next_bit.c | 267 ++++++++++++++----------------------------------- 3 files changed, 91 insertions(+), 216 deletions(-) (limited to 'lib') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5d858e02997f..297f5bda4fdf 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word) /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at - * @size: The maximum size to search + * @size: The number of bits to search * - * Returns the bit number of the first set bit, or size. + * Returns the bit number of the last set bit, or size. */ extern unsigned long find_last_bit(const unsigned long *addr, unsigned long size); diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c index 91ca09fbf6f9..3e3be40c6a6e 100644 --- a/lib/find_last_bit.c +++ b/lib/find_last_bit.c @@ -4,6 +4,9 @@ * Written by Rusty Russell * (Inspired by David Howell's find_next_bit implementation) * + * Rewritten by Yury Norov to decrease + * size and improve performance, 2015. + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -11,37 +14,26 @@ */ #include +#include #include -#include -#include +#include #ifndef find_last_bit unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { - unsigned long words; - unsigned long tmp; - - /* Start at final word. */ - words = size / BITS_PER_LONG; + if (size) { + unsigned long val = BITMAP_LAST_WORD_MASK(size); + unsigned long idx = (size-1) / BITS_PER_LONG; - /* Partial final word? */ - if (size & (BITS_PER_LONG-1)) { - tmp = (addr[words] & (~0UL >> (BITS_PER_LONG - - (size & (BITS_PER_LONG-1))))); - if (tmp) - goto found; - } + do { + val &= addr[idx]; + if (val) + return idx * BITS_PER_LONG + __fls(val); - while (words) { - tmp = addr[--words]; - if (tmp) { -found: - return words * BITS_PER_LONG + __fls(tmp); - } + val = ~0ul; + } while (idx--); } - - /* Not found */ return size; } EXPORT_SYMBOL(find_last_bit); diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c index 0cbfc0b4398f..cbea5ef843aa 100644 --- a/lib/find_next_bit.c +++ b/lib/find_next_bit.c @@ -3,6 +3,9 @@ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * + * Rewritten by Yury Norov to decrease + * size and improve performance, 2015. 
+ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -11,98 +14,58 @@ #include #include -#include -#include +#include -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) +#if !defined(find_next_bit) || !defined(find_next_zero_bit) -#ifndef find_next_bit /* - * Find the next set bit in a memory region. + * This is a common helper function for find_next_bit and + * find_next_zero_bit. The difference is the "invert" argument, which + * is XORed with each fetched word before searching it for one bits. */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) +static unsigned long _find_next_bit(const unsigned long *addr, + unsigned long nbits, unsigned long start, unsigned long invert) { - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; } - if (!size) - return result; - tmp = *p; -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); + return min(start + __ffs(tmp), nbits); } -EXPORT_SYMBOL(find_next_bit); #endif -#ifndef find_next_zero_bit +#ifndef find_next_bit /* - * This implementation of find_{first,next}_zero_bit was stolen from - * Linus' asm-alpha/bitops.h. + * Find the next set bit in a memory region. */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset, 0UL); +} +EXPORT_SYMBOL(find_next_bit); +#endif + +#ifndef find_next_zero_bit unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp |= ~0UL >> (BITS_PER_LONG - offset); - if (size < BITS_PER_LONG) - goto found_first; - if (~tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if (~(tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp |= ~0UL << size; - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + ffz(tmp); + return _find_next_bit(addr, size, offset, ~0UL); } EXPORT_SYMBOL(find_next_zero_bit); #endif @@ -113,24 +76,14 @@ EXPORT_SYMBOL(find_next_zero_bit); */ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { - const unsigned long *p = addr; - unsigned long result = 0; - unsigned long tmp; + unsigned long idx; - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx]) + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); } - if (!size) - return result; - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found: - return result + __ffs(tmp); + return size; } EXPORT_SYMBOL(find_first_bit); #endif @@ -141,24 +94,14 @@ EXPORT_SYMBOL(find_first_bit); */ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { - const unsigned long *p = addr; - unsigned long result = 0; - unsigned long tmp; + unsigned long idx; - while (size & ~(BITS_PER_LONG-1)) { - if (~(tmp = *(p++))) - goto found; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx] != ~0UL) + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size); } - if (!size) - return result; - tmp = (*p) | (~0UL << size); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found: - return result + ffz(tmp); + return size; } EXPORT_SYMBOL(find_first_zero_bit); #endif @@ -166,18 +109,6 @@ EXPORT_SYMBOL(find_first_zero_bit); #ifdef __BIG_ENDIAN /* include/linux/byteorder does not support "unsigned long" type */ -static inline unsigned long ext2_swabp(const unsigned long * x) -{ -#if BITS_PER_LONG == 64 - return (unsigned long) __swab64p((u64 *) x); -#elif BITS_PER_LONG == 32 - return (unsigned long) __swab32p((u32 *) x); -#else -#error BITS_PER_LONG not defined -#endif -} - -/* include/linux/byteorder doesn't support "unsigned long" type */ static inline unsigned long ext2_swab(const unsigned long y) { #if BITS_PER_LONG == 64 @@ -189,48 +120,38 @@ static inline unsigned long ext2_swab(const unsigned long y) #endif } -#ifndef find_next_zero_bit_le -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) +static unsigned long _find_next_bit_le(const unsigned long *addr, + unsigned long nbits, unsigned long start, unsigned long invert) { - const unsigned long *p = addr; - unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; - if (offset >= size) - return size; - p += BITOP_WORD(offset); - size -= result; - offset &= (BITS_PER_LONG - 1UL); - if (offset) { - tmp = ext2_swabp(p++); - tmp |= (~0UL >> (BITS_PER_LONG - offset)); - if (size < BITS_PER_LONG) - goto found_first; - if (~tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. 
*/ + tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); + start = round_down(start, BITS_PER_LONG); - while (size & ~(BITS_PER_LONG - 1)) { - if (~(tmp = *(p++))) - goto found_middle_swap; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; } - if (!size) - return result; - tmp = ext2_swabp(p); -found_first: - tmp |= ~0UL << size; - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. Skip ffz */ -found_middle: - return result + ffz(tmp); -found_middle_swap: - return result + ffz(ext2_swab(tmp)); + return min(start + __ffs(ext2_swab(tmp)), nbits); +} +#endif + +#ifndef find_next_zero_bit_le +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit_le(addr, size, offset, ~0UL); } EXPORT_SYMBOL(find_next_zero_bit_le); #endif @@ -239,45 +160,7 @@ EXPORT_SYMBOL(find_next_zero_bit_le); unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { - const unsigned long *p = addr; - unsigned long result = offset & ~(BITS_PER_LONG - 1); - unsigned long tmp; - - if (offset >= size) - return size; - p += BITOP_WORD(offset); - size -= result; - offset &= (BITS_PER_LONG - 1UL); - if (offset) { - tmp = ext2_swabp(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - - while (size & ~(BITS_PER_LONG - 1)) { - tmp = *(p++); - if (tmp) - goto found_middle_swap; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = ext2_swabp(p); -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); - -found_middle_swap: - return result + __ffs(ext2_swab(tmp)); + return _find_next_bit_le(addr, size, offset, 0UL); } EXPORT_SYMBOL(find_next_bit_le); #endif -- cgit v1.2.3 From 8f6f19dd5143aa59139deeb885a8ed5e2d937e21 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:43:16 -0700 Subject: lib: move find_last_bit to lib/find_next_bit.c Currently all 'find_*_bit' family is located in lib/find_next_bit.c, except 'find_last_bit', which is in lib/find_last_bit.c. It seems, there's no major benefit to have it separated. Signed-off-by: Yury Norov Reviewed-by: Rasmus Villemoes Reviewed-by: George Spelvin Cc: Alexey Klimov Cc: David S. 
Miller Cc: Daniel Borkmann Cc: Hannes Frederic Sowa Cc: Lai Jiangshan Cc: Mark Salter Cc: AKASHI Takahiro Cc: Thomas Graf Cc: Valentin Rothberg Cc: Chris Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 2 +- lib/find_next_bit.c | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 58f74d2dd396..05f8fa56a1bc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ - bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ + bsearch.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c index cbea5ef843aa..18072ea9c20e 100644 --- a/lib/find_next_bit.c +++ b/lib/find_next_bit.c @@ -1,8 +1,12 @@ -/* find_next_bit.c: fallback find next bit implementation +/* bit search implementation * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * + * Copyright (C) 2008 IBM Corporation + * 'find_last_bit' is written by Rusty Russell + * (Inspired by David Howell's find_next_bit implementation) + * * Rewritten by Yury Norov to decrease * size and improve performance, 2015. * @@ -13,6 +17,7 @@ */ #include +#include #include #include @@ -106,6 +111,26 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) EXPORT_SYMBOL(find_first_zero_bit); #endif +#ifndef find_last_bit +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (size) { + unsigned long val = BITMAP_LAST_WORD_MASK(size); + unsigned long idx = (size-1) / BITS_PER_LONG; + + do { + val &= addr[idx]; + if (val) + return idx * BITS_PER_LONG + __fls(val); + + val = ~0ul; + } while (idx--); + } + return size; +} +EXPORT_SYMBOL(find_last_bit); +#endif + #ifdef __BIG_ENDIAN /* include/linux/byteorder does not support "unsigned long" type */ -- cgit v1.2.3 From 840620a1596a90636a44d6a593db4041bb28d52e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:43:19 -0700 Subject: lib: rename lib/find_next_bit.c to lib/find_bit.c This file contains implementation for all find_*_bit{,_le} So giving it more generic name looks reasonable. Signed-off-by: Yury Norov Reviewed-by: Rasmus Villemoes Reviewed-by: George Spelvin Cc: Alexey Klimov Cc: David S. 
Miller Cc: Daniel Borkmann Cc: Hannes Frederic Sowa Cc: Lai Jiangshan Cc: Mark Salter Cc: AKASHI Takahiro Cc: Thomas Graf Cc: Valentin Rothberg Cc: Chris Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Makefile | 2 +- lib/find_bit.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/find_next_bit.c | 193 ---------------------------------------------------- 3 files changed, 194 insertions(+), 194 deletions(-) create mode 100644 lib/find_bit.c delete mode 100644 lib/find_next_bit.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 05f8fa56a1bc..da6116b21555 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ - bsearch.o find_next_bit.o llist.o memweight.o kfifo.o \ + bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o diff --git a/lib/find_bit.c b/lib/find_bit.c new file mode 100644 index 000000000000..18072ea9c20e --- /dev/null +++ b/lib/find_bit.c @@ -0,0 +1,193 @@ +/* bit search implementation + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * Copyright (C) 2008 IBM Corporation + * 'find_last_bit' is written by Rusty Russell + * (Inspired by David Howell's find_next_bit implementation) + * + * Rewritten by Yury Norov to decrease + * size and improve performance, 2015. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#if !defined(find_next_bit) || !defined(find_next_zero_bit) + +/* + * This is a common helper function for find_next_bit and + * find_next_zero_bit. The difference is the "invert" argument, which + * is XORed with each fetched word before searching it for one bits. + */ +static unsigned long _find_next_bit(const unsigned long *addr, + unsigned long nbits, unsigned long start, unsigned long invert) +{ + unsigned long tmp; + + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return min(start + __ffs(tmp), nbits); +} +#endif + +#ifndef find_next_bit +/* + * Find the next set bit in a memory region. + */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset, 0UL); +} +EXPORT_SYMBOL(find_next_bit); +#endif + +#ifndef find_next_zero_bit +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset, ~0UL); +} +EXPORT_SYMBOL(find_next_zero_bit); +#endif + +#ifndef find_first_bit +/* + * Find the first set bit in a memory region. 
+ */ +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + unsigned long idx; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx]) + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); + } + + return size; +} +EXPORT_SYMBOL(find_first_bit); +#endif + +#ifndef find_first_zero_bit +/* + * Find the first cleared bit in a memory region. + */ +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + unsigned long idx; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx] != ~0UL) + return min(idx * BITS_PER_LONG + ffz(addr[idx]), size); + } + + return size; +} +EXPORT_SYMBOL(find_first_zero_bit); +#endif + +#ifndef find_last_bit +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (size) { + unsigned long val = BITMAP_LAST_WORD_MASK(size); + unsigned long idx = (size-1) / BITS_PER_LONG; + + do { + val &= addr[idx]; + if (val) + return idx * BITS_PER_LONG + __fls(val); + + val = ~0ul; + } while (idx--); + } + return size; +} +EXPORT_SYMBOL(find_last_bit); +#endif + +#ifdef __BIG_ENDIAN + +/* include/linux/byteorder does not support "unsigned long" type */ +static inline unsigned long ext2_swab(const unsigned long y) +{ +#if BITS_PER_LONG == 64 + return (unsigned long) __swab64((u64) y); +#elif BITS_PER_LONG == 32 + return (unsigned long) __swab32((u32) y); +#else +#error BITS_PER_LONG not defined +#endif +} + +#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) +static unsigned long _find_next_bit_le(const unsigned long *addr, + unsigned long nbits, unsigned long start, unsigned long invert) +{ + unsigned long tmp; + + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return min(start + __ffs(ext2_swab(tmp)), nbits); +} +#endif + +#ifndef find_next_zero_bit_le +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit_le(addr, size, offset, ~0UL); +} +EXPORT_SYMBOL(find_next_zero_bit_le); +#endif + +#ifndef find_next_bit_le +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit_le(addr, size, offset, 0UL); +} +EXPORT_SYMBOL(find_next_bit_le); +#endif + +#endif /* __BIG_ENDIAN */ diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c deleted file mode 100644 index 18072ea9c20e..000000000000 --- a/lib/find_next_bit.c +++ /dev/null @@ -1,193 +0,0 @@ -/* bit search implementation - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * Copyright (C) 2008 IBM Corporation - * 'find_last_bit' is written by Rusty Russell - * (Inspired by David Howell's find_next_bit implementation) - * - * Rewritten by Yury Norov to decrease - * size and improve performance, 2015. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include -#include -#include -#include - -#if !defined(find_next_bit) || !defined(find_next_zero_bit) - -/* - * This is a common helper function for find_next_bit and - * find_next_zero_bit. The difference is the "invert" argument, which - * is XORed with each fetched word before searching it for one bits. - */ -static unsigned long _find_next_bit(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) -{ - unsigned long tmp; - - if (!nbits || start >= nbits) - return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - - /* Handle 1st word. */ - tmp &= BITMAP_FIRST_WORD_MASK(start); - start = round_down(start, BITS_PER_LONG); - - while (!tmp) { - start += BITS_PER_LONG; - if (start >= nbits) - return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - } - - return min(start + __ffs(tmp), nbits); -} -#endif - -#ifndef find_next_bit -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, size, offset, 0UL); -} -EXPORT_SYMBOL(find_next_bit); -#endif - -#ifndef find_next_zero_bit -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, size, offset, ~0UL); -} -EXPORT_SYMBOL(find_next_zero_bit); -#endif - -#ifndef find_first_bit -/* - * Find the first set bit in a memory region. - */ -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) -{ - unsigned long idx; - - for (idx = 0; idx * BITS_PER_LONG < size; idx++) { - if (addr[idx]) - return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); - } - - return size; -} -EXPORT_SYMBOL(find_first_bit); -#endif - -#ifndef find_first_zero_bit -/* - * Find the first cleared bit in a memory region. - */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) -{ - unsigned long idx; - - for (idx = 0; idx * BITS_PER_LONG < size; idx++) { - if (addr[idx] != ~0UL) - return min(idx * BITS_PER_LONG + ffz(addr[idx]), size); - } - - return size; -} -EXPORT_SYMBOL(find_first_zero_bit); -#endif - -#ifndef find_last_bit -unsigned long find_last_bit(const unsigned long *addr, unsigned long size) -{ - if (size) { - unsigned long val = BITMAP_LAST_WORD_MASK(size); - unsigned long idx = (size-1) / BITS_PER_LONG; - - do { - val &= addr[idx]; - if (val) - return idx * BITS_PER_LONG + __fls(val); - - val = ~0ul; - } while (idx--); - } - return size; -} -EXPORT_SYMBOL(find_last_bit); -#endif - -#ifdef __BIG_ENDIAN - -/* include/linux/byteorder does not support "unsigned long" type */ -static inline unsigned long ext2_swab(const unsigned long y) -{ -#if BITS_PER_LONG == 64 - return (unsigned long) __swab64((u64) y); -#elif BITS_PER_LONG == 32 - return (unsigned long) __swab32((u32) y); -#else -#error BITS_PER_LONG not defined -#endif -} - -#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) -static unsigned long _find_next_bit_le(const unsigned long *addr, - unsigned long nbits, unsigned long start, unsigned long invert) -{ - unsigned long tmp; - - if (!nbits || start >= nbits) - return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - - /* Handle 1st word. 
*/ - tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); - start = round_down(start, BITS_PER_LONG); - - while (!tmp) { - start += BITS_PER_LONG; - if (start >= nbits) - return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - } - - return min(start + __ffs(ext2_swab(tmp)), nbits); -} -#endif - -#ifndef find_next_zero_bit_le -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - return _find_next_bit_le(addr, size, offset, ~0UL); -} -EXPORT_SYMBOL(find_next_zero_bit_le); -#endif - -#ifndef find_next_bit_le -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - return _find_next_bit_le(addr, size, offset, 0UL); -} -EXPORT_SYMBOL(find_next_bit_le); -#endif - -#endif /* __BIG_ENDIAN */ -- cgit v1.2.3 From 7c43d9a30c527d9e06e2c55f82b56f28df43caed Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 16 Apr 2015 12:43:22 -0700 Subject: lib/vsprintf.c: even faster binary to decimal conversion The most expensive part of decimal conversion is the divisions by 10 (albeit done using reciprocal multiplication with appropriately chosen constants). I decided to see if one could eliminate around half of these multiplications by emitting two digits at a time, at the cost of a 200 byte lookup table, and it does indeed seem like there is something to be gained, especially on 64 bits. Microbenchmarking shows improvements ranging from -50% (for numbers uniformly distributed in [0, 2^64-1]) to -25% (for numbers heavily biased toward the smaller end, a more realistic distribution). On a larger scale, perf shows that top, one of the big consumers of /proc data, uses 0.5-1.0% fewer cpu cycles. I had to jump through some hoops to get the 32 bit code to compile and run on my 64 bit machine, so I'm not sure how relevant these numbers are, but just for comparison the microbenchmark showed improvements between -30% and -10%. The bloat-o-meter costs are around 150 bytes (the generated code is a little smaller, so it's not the full 200 bytes) on both 32 and 64 bit. I'm aware that extra cache misses won't show up in a microbenchmark as used above, but on the other hand decimal conversions often happen in bulk (for example in the case of top). I have of course tested that the new code generates the same output as the old, for both the first and last 1e10 numbers in [0,2^64-1] and 4e9 'random' numbers in-between. Test and verification code on github: https://github.com/Villemoes/dec. Signed-off-by: Rasmus Villemoes Tested-by: Jeff Epler Cc: "Peter Zijlstra (Intel)" Cc: Tejun Heo Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 246 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 128 insertions(+), 118 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3a1e0843f9a2..c93ec8a035b3 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -33,6 +33,7 @@ #include /* for PAGE_SIZE */ #include /* for dereference_function_descriptor() */ +#include /* cpu_to_le16 */ #include #include "kstrtox.h" @@ -122,142 +123,147 @@ int skip_atoi(const char **s) return i; } -/* Decimal conversion is by far the most typical, and is used - * for /proc and /sys data. This directly impacts e.g. top performance - * with many processes running. We optimize it for speed - * using ideas described at - * (with permission from the author, Douglas W. Jones). +/* + * Decimal conversion is by far the most typical, and is used for + * /proc and /sys data. 
This directly impacts e.g. top performance + * with many processes running. We optimize it for speed by emitting + * two characters at a time, using a 200 byte lookup table. This + * roughly halves the number of multiplications compared to computing + * the digits one at a time. Implementation strongly inspired by the + * previous version, which in turn used ideas described at + * (with permission + * from the author, Douglas W. Jones). + * + * It turns out there is precisely one 26 bit fixed-point + * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 + * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual + * range happens to be somewhat larger (x <= 1073741898), but that's + * irrelevant for our purpose. + * + * For dividing a number in the range [10^4, 10^6-1] by 100, we still + * need a 32x32->64 bit multiply, so we simply use the same constant. + * + * For dividing a number in the range [100, 10^4-1] by 100, there are + * several options. The simplest is (x * 0x147b) >> 19, which is valid + * for all x <= 43698. */ -#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 -/* Formats correctly any integer in [0, 999999999] */ +static const u16 decpair[100] = { +#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) + _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), + _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), + _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), + _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), + _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), + _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), + _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), + _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), + _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), + _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), +#undef _ +}; + +/* + * This will print a single '0' even if r == 0, since we would + * immediately jump to out_r where two 0s would be written and one of + * them then discarded. This is needed by ip4_string below. All other + * callers pass a non-zero value of r. +*/ static noinline_for_stack -char *put_dec_full9(char *buf, unsigned q) +char *put_dec_trunc8(char *buf, unsigned r) { - unsigned r; + unsigned q; - /* - * Possible ways to approx. 
divide by 10 - * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) - * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) - * (x * 0x6667) >> 18 x < 43699 - * (x * 0x3334) >> 17 x < 16389 - * (x * 0x199a) >> 16 x < 16389 - * (x * 0x0ccd) >> 15 x < 16389 - * (x * 0x0667) >> 14 x < 2739 - * (x * 0x0334) >> 13 x < 1029 - * (x * 0x019a) >> 12 x < 1029 - * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) - * (x * 0x0067) >> 10 x < 179 - * (x * 0x0034) >> 9 x < 69 same - * (x * 0x001a) >> 8 x < 69 same - * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) - * (x * 0x0007) >> 6 x < 19 - * See - */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 1 */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 2 */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 3 */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 4 */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 5 */ - /* Now value is under 10000, can avoid 64-bit multiply */ - q = (r * 0x199a) >> 16; - *buf++ = (r - 10 * q) + '0'; /* 6 */ - r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; /* 7 */ - q = (r * 0xcd) >> 11; - *buf++ = (r - 10 * q) + '0'; /* 8 */ - *buf++ = q + '0'; /* 9 */ + /* 1 <= r < 10^8 */ + if (r < 100) + goto out_r; + + /* 100 <= r < 10^8 */ + q = (r * (u64)0x28f5c29) >> 32; + *((u16 *)buf) = decpair[r - 100*q]; + buf += 2; + + /* 1 <= q < 10^6 */ + if (q < 100) + goto out_q; + + /* 100 <= q < 10^6 */ + r = (q * (u64)0x28f5c29) >> 32; + *((u16 *)buf) = decpair[q - 100*r]; + buf += 2; + + /* 1 <= r < 10^4 */ + if (r < 100) + goto out_r; + + /* 100 <= r < 10^4 */ + q = (r * 0x147b) >> 19; + *((u16 *)buf) = decpair[r - 100*q]; + buf += 2; +out_q: + /* 1 <= q < 100 */ + r = q; +out_r: + /* 1 <= r < 100 */ + *((u16 *)buf) = decpair[r]; + buf += 2; + if (buf[-1] == '0') + buf--; return buf; } -#endif -/* Similar to above but do not pad with zeros. - * Code can be easily arranged to print 9 digits too, but our callers - * always call put_dec_full9() instead when the number has 9 decimal digits. - */ +#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 static noinline_for_stack -char *put_dec_trunc8(char *buf, unsigned r) +char *put_dec_full8(char *buf, unsigned r) { unsigned q; - /* Copy of previous function's body with added early returns */ - while (r >= 10000) { - q = r + '0'; - r = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = q - 10*r; - } + /* 0 <= r < 10^8 */ + q = (r * (u64)0x28f5c29) >> 32; + *((u16 *)buf) = decpair[r - 100*q]; + buf += 2; - q = (r * 0x199a) >> 16; /* r <= 9999 */ - *buf++ = (r - 10 * q) + '0'; - if (q == 0) - return buf; - r = (q * 0xcd) >> 11; /* q <= 999 */ - *buf++ = (q - 10 * r) + '0'; - if (r == 0) - return buf; - q = (r * 0xcd) >> 11; /* r <= 99 */ - *buf++ = (r - 10 * q) + '0'; - if (q == 0) - return buf; - *buf++ = q + '0'; /* q <= 9 */ - return buf; -} + /* 0 <= q < 10^6 */ + r = (q * (u64)0x28f5c29) >> 32; + *((u16 *)buf) = decpair[q - 100*r]; + buf += 2; -/* There are two algorithms to print larger numbers. - * One is generic: divide by 1000000000 and repeatedly print - * groups of (up to) 9 digits. It's conceptually simple, - * but requires a (unsigned long long) / 1000000000 division. - * - * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, - * manipulates them cleverly and generates groups of 4 decimal digits. - * It so happens that it does NOT require long long division. 
- * - * If long is > 32 bits, division of 64-bit values is relatively easy, - * and we will use the first algorithm. - * If long long is > 64 bits (strange architecture with VERY large long long), - * second algorithm can't be used, and we again use the first one. - * - * Else (if long is 32 bits and long long is 64 bits) we use second one. - */ + /* 0 <= r < 10^4 */ + q = (r * 0x147b) >> 19; + *((u16 *)buf) = decpair[r - 100*q]; + buf += 2; -#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 - -/* First algorithm: generic */ + /* 0 <= q < 100 */ + *((u16 *)buf) = decpair[q]; + buf += 2; + return buf; +} -static +static noinline_for_stack char *put_dec(char *buf, unsigned long long n) { - if (n >= 100*1000*1000) { - while (n >= 1000*1000*1000) - buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); - if (n >= 100*1000*1000) - return put_dec_full9(buf, n); - } + if (n >= 100*1000*1000) + buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); + /* 1 <= n <= 1.6e11 */ + if (n >= 100*1000*1000) + buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); + /* 1 <= n < 1e8 */ return put_dec_trunc8(buf, n); } -#else +#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 -/* Second algorithm: valid only for 64-bit long longs */ - -/* See comment in put_dec_full9 for choice of constants */ -static noinline_for_stack -void put_dec_full4(char *buf, unsigned q) +static void +put_dec_full4(char *buf, unsigned r) { - unsigned r; - r = (q * 0xccd) >> 15; - buf[0] = (q - 10 * r) + '0'; - q = (r * 0xcd) >> 11; - buf[1] = (r - 10 * q) + '0'; - r = (q * 0xcd) >> 11; - buf[2] = (q - 10 * r) + '0'; - buf[3] = r + '0'; + unsigned q; + + /* 0 <= r < 10^4 */ + q = (r * 0x147b) >> 19; + *((u16 *)buf) = decpair[r - 100*q]; + buf += 2; + /* 0 <= q < 100 */ + *((u16 *)buf) = decpair[q]; } /* @@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q) * The approximation x/10000 == (x * 0x346DC5D7) >> 43 * holds for all x < 1,128,869,999. The largest value this * helper will ever be asked to convert is 1,125,520,955. - * (d1 in the put_dec code, assuming n is all-ones). + * (second call in the put_dec code, assuming n is all-ones). */ -static +static noinline_for_stack unsigned put_dec_helper4(char *buf, unsigned x) { uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; @@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n) d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ + /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 + = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); q = put_dec_helper4(buf, q); @@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n) */ int num_to_str(char *buf, int size, unsigned long long num) { - char tmp[sizeof(num) * 3]; + /* put_dec requires 2-byte alignment of the buffer. */ + char tmp[sizeof(num) * 3] __aligned(2); int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ @@ -384,7 +393,8 @@ static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { - char tmp[3 * sizeof(num)]; + /* put_dec requires 2-byte alignment of the buffer. 
*/ + char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); @@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) break; } for (i = 0; i < 4; i++) { - char temp[3]; /* hold each IP quad in reverse order */ + char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) -- cgit v1.2.3 From a7a2c02a40151811609ad8cd3bf5c4fc516fece5 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 16 Apr 2015 12:43:25 -0700 Subject: lib/dma-debug: fix bucket_find_contain() bucket_find_contain() will search the bucket list for a dma_debug_entry. When the entry isn't found it needs to search other buckets too, since only the start address of a dma range is hashed (which might be in a different bucket). A copy of the dma_debug_entry is used to get the previous hash bucket but when its list is searched the original dma_debug_entry is to be used not its modified copy. This fixes false "device driver tries to sync DMA memory it has not allocated" warnings. Signed-off-by: Sebastian Ott Cc: Florian Fainelli Cc: Horia Geanta Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 9722bd2dbc9b..ae4b65e17e64 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -361,7 +361,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, unsigned int range = 0; while (range <= max_range) { - entry = __hash_bucket_find(*bucket, &index, containing_match); + entry = __hash_bucket_find(*bucket, ref, containing_match); if (entry) return entry; -- cgit v1.2.3 From 675cf53c1deaffadc7b6a0b4afe6cdafec86bedb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 16 Apr 2015 12:43:42 -0700 Subject: lib/vsprintf.c: improve put_dec_trunc8 slightly I hadn't had enough coffee when I wrote this. Currently, the final increment of buf depends on the value loaded from the table, and causes gcc to emit a cmov immediately before the return. It is smarter to let it depend on r, since the increment can then be computed in parallel with the final load/store pair. It also shaves 16 bytes of .text. Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index c93ec8a035b3..da39c608a28c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -165,9 +165,9 @@ static const u16 decpair[100] = { /* * This will print a single '0' even if r == 0, since we would - * immediately jump to out_r where two 0s would be written and one of - * them then discarded. This is needed by ip4_string below. All other - * callers pass a non-zero value of r. + * immediately jump to out_r where two 0s would be written but only + * one of them accounted for in buf. This is needed by ip4_string + * below. All other callers pass a non-zero value of r. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) @@ -206,9 +206,7 @@ out_q: out_r: /* 1 <= r < 100 */ *((u16 *)buf) = decpair[r]; - buf += 2; - if (buf[-1] == '0') - buf--; + buf += r < 10 ? 
1 : 2; return buf; } -- cgit v1.2.3 From 2afe27c718b669b551895595873611ac39cc31e3 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 16 Apr 2015 12:44:00 -0700 Subject: lib/bitmap.c: bitmap_[empty,full]: remove code duplication bitmap_empty() has its own implementation. But it's clearly as simple as: find_first_bit(src, nbits) == nbits The same is true for 'bitmap_full'. Signed-off-by: Yury Norov Cc: George Spelvin Cc: Alexey Klimov Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 ++++---- lib/bitmap.c | 30 ------------------------------ 2 files changed, 4 insertions(+), 34 deletions(-) (limited to 'lib') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index be4fa5ddf36c..ea17cca9e685 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -283,16 +283,16 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); - else - return __bitmap_empty(src, nbits); + + return find_first_bit(src, nbits) == nbits; } static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); - else - return __bitmap_full(src, nbits); + + return find_first_zero_bit(src, nbits) == nbits; } static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) diff --git a/lib/bitmap.c b/lib/bitmap.c index d456f4c15a9f..64c0926f5dd8 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -42,36 +42,6 @@ * for the best explanations of this ordering. */ -int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) -{ - unsigned int k, lim = bits/BITS_PER_LONG; - for (k = 0; k < lim; ++k) - if (bitmap[k]) - return 0; - - if (bits % BITS_PER_LONG) - if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) - return 0; - - return 1; -} -EXPORT_SYMBOL(__bitmap_empty); - -int __bitmap_full(const unsigned long *bitmap, unsigned int bits) -{ - unsigned int k, lim = bits/BITS_PER_LONG; - for (k = 0; k < lim; ++k) - if (~bitmap[k]) - return 0; - - if (bits % BITS_PER_LONG) - if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) - return 0; - - return 1; -} -EXPORT_SYMBOL(__bitmap_full); - int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { -- cgit v1.2.3 From 534b483a86e6b96f1b5cc03bbe4b696f3daead6d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 16 Apr 2015 12:48:04 -0700 Subject: cpumask: don't perform while loop in cpumask_next_and() cpumask_next_and() is looking for cpumask_next() in src1 in a loop and tests if found cpu is also present in src2. remove that loop, perform cpumask_and() of src1 and src2 first and use that new mask to find cpumask_next(). Apart from removing while loop, ./bloat-o-meter on x86_64 shows add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-8 (-8) function old new delta cpumask_next_and 62 54 -8 Signed-off-by: Sergey Senozhatsky Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Amir Vadai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cpumask.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/cpumask.c b/lib/cpumask.c index b6513a9f2892..5ab1553fd076 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -37,10 +37,11 @@ EXPORT_SYMBOL(__next_cpu_nr); int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { - while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) - if (cpumask_test_cpu(n, src2p)) - break; - return n; + struct cpumask tmp; + + if (cpumask_and(&tmp, src1p, src2p)) + return cpumask_next(n, &tmp); + return nr_cpu_ids; } EXPORT_SYMBOL(cpumask_next_and); -- cgit v1.2.3 From 9e522c0d28d1418c2983ffbc3903f7bed3354180 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 16 Apr 2015 12:49:07 -0700 Subject: lib/Kconfig: fix up HAVE_ARCH_BITREVERSE help text Cc: Yalin Wang Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index 87da53bb1fef..f5440221d929 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -18,9 +18,8 @@ config HAVE_ARCH_BITREVERSE default n depends on BITREVERSE help - This option provides an config for the architecture which have instruction - can do bitreverse operation, we use the hardware instruction if the architecture - have this capability. + This option enables the use of hardware bit-reversal instructions on + architectures which support such operations. config RATIONAL bool -- cgit v1.2.3 From cb97201cb060d13da0b87fd1bf68208c7389c5b1 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 16 Apr 2015 22:28:04 -0400 Subject: iommu-common: Fix PARISC compile-time warnings Fixes warnings due to - no DMA_ERROR_CODE on PARISC, - sizeof (unsigned long) == 4 bytes on PARISC. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- lib/iommu-common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/iommu-common.c b/lib/iommu-common.c index 7583f9b7846b..fac4f35250c9 100644 --- a/lib/iommu-common.c +++ b/lib/iommu-common.c @@ -10,6 +10,10 @@ #include #include +#ifndef DMA_ERROR_CODE +#define DMA_ERROR_CODE (~(dma_addr_t)0x0) +#endif + #define IOMMU_LARGE_ALLOC 15 /* @@ -121,7 +125,7 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 1 << iommu->page_table_shift); else - boundary_size = ALIGN(1UL << 32, 1 << iommu->page_table_shift); + boundary_size = ALIGN(1ULL << 32, 1 << iommu->page_table_shift); shift = iommu->page_table_map_base >> iommu->page_table_shift; boundary_size = boundary_size >> iommu->page_table_shift; -- cgit v1.2.3 From c12f048ffdf3a5802239426dc290290929268dc9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 18 Apr 2015 12:31:25 -0700 Subject: sparc: Revert generic IOMMU allocator. I applied the wrong version of this patch series, V4 instead of V10, due to a patchwork bundling snafu. Signed-off-by: David S. 
Miller --- arch/sparc/include/asm/iommu_64.h | 7 +- arch/sparc/kernel/iommu.c | 188 +++++++++++++++++++++----------- arch/sparc/kernel/iommu_common.h | 8 ++ arch/sparc/kernel/ldc.c | 185 +++++++++++++++---------------- arch/sparc/kernel/pci_sun4v.c | 193 ++++++++++++++++---------------- include/linux/iommu-common.h | 55 ---------- lib/Makefile | 2 +- lib/iommu-common.c | 224 -------------------------------------- 8 files changed, 325 insertions(+), 537 deletions(-) delete mode 100644 include/linux/iommu-common.h delete mode 100644 lib/iommu-common.c (limited to 'lib') diff --git a/arch/sparc/include/asm/iommu_64.h b/arch/sparc/include/asm/iommu_64.h index e3cd4493d81d..2b9321ab064d 100644 --- a/arch/sparc/include/asm/iommu_64.h +++ b/arch/sparc/include/asm/iommu_64.h @@ -16,7 +16,6 @@ #define IOPTE_WRITE 0x0000000000000002UL #define IOMMU_NUM_CTXS 4096 -#include struct iommu_arena { unsigned long *map; @@ -25,10 +24,11 @@ struct iommu_arena { }; struct iommu { - struct iommu_table tbl; spinlock_t lock; - u32 dma_addr_mask; + struct iommu_arena arena; + void (*flush_all)(struct iommu *); iopte_t *page_table; + u32 page_table_map_base; unsigned long iommu_control; unsigned long iommu_tsbbase; unsigned long iommu_flush; @@ -40,6 +40,7 @@ struct iommu { unsigned long dummy_page_pa; unsigned long ctx_lowest_free; DECLARE_BITMAP(ctx_bitmap, IOMMU_NUM_CTXS); + u32 dma_addr_mask; }; struct strbuf { diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 9b16b341b6ae..bfa4d0c2df42 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -13,15 +13,11 @@ #include #include #include -#include -#include #ifdef CONFIG_PCI #include #endif -static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); - #include #include "iommu_common.h" @@ -49,9 +45,8 @@ static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); "i" (ASI_PHYS_BYPASS_EC_E)) /* Must be invoked under the IOMMU lock. */ -static void iommu_flushall(struct iommu_table *iommu_table) +static void iommu_flushall(struct iommu *iommu) { - struct iommu *iommu = container_of(iommu_table, struct iommu, tbl); if (iommu->iommu_flushinv) { iommu_write(iommu->iommu_flushinv, ~(u64)0); } else { @@ -92,22 +87,93 @@ static inline void iopte_make_dummy(struct iommu *iommu, iopte_t *iopte) iopte_val(*iopte) = val; } -static struct iommu_tbl_ops iommu_sparc_ops = { - .reset = iommu_flushall -}; - -static void setup_iommu_pool_hash(void) +/* Based almost entirely upon the ppc64 iommu allocator. If you use the 'handle' + * facility it must all be done in one pass while under the iommu lock. + * + * On sun4u platforms, we only flush the IOMMU once every time we've passed + * over the entire page table doing allocations. Therefore we only ever advance + * the hint and cannot backtrack it. 
+ */ +unsigned long iommu_range_alloc(struct device *dev, + struct iommu *iommu, + unsigned long npages, + unsigned long *handle) { - unsigned int i; - static bool do_once; + unsigned long n, end, start, limit, boundary_size; + struct iommu_arena *arena = &iommu->arena; + int pass = 0; - if (do_once) - return; - do_once = true; - for_each_possible_cpu(i) - per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); + /* This allocator was derived from x86_64's bit string search */ + + /* Sanity check */ + if (unlikely(npages == 0)) { + if (printk_ratelimit()) + WARN_ON(1); + return DMA_ERROR_CODE; + } + + if (handle && *handle) + start = *handle; + else + start = arena->hint; + + limit = arena->limit; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the beginning and flush. + */ + if (start >= limit) { + start = 0; + if (iommu->flush_all) + iommu->flush_all(iommu); + } + + again: + + if (dev) + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + 1 << IO_PAGE_SHIFT); + else + boundary_size = ALIGN(1UL << 32, 1 << IO_PAGE_SHIFT); + + n = iommu_area_alloc(arena->map, limit, start, npages, + iommu->page_table_map_base >> IO_PAGE_SHIFT, + boundary_size >> IO_PAGE_SHIFT, 0); + if (n == -1) { + if (likely(pass < 1)) { + /* First failure, rescan from the beginning. */ + start = 0; + if (iommu->flush_all) + iommu->flush_all(iommu); + pass++; + goto again; + } else { + /* Second failure, give up */ + return DMA_ERROR_CODE; + } + } + + end = n + npages; + + arena->hint = end; + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + + return n; } +void iommu_range_free(struct iommu *iommu, dma_addr_t dma_addr, unsigned long npages) +{ + struct iommu_arena *arena = &iommu->arena; + unsigned long entry; + + entry = (dma_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT; + + bitmap_clear(arena->map, entry, npages); +} int iommu_table_init(struct iommu *iommu, int tsbsize, u32 dma_offset, u32 dma_addr_mask, @@ -121,22 +187,22 @@ int iommu_table_init(struct iommu *iommu, int tsbsize, /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); iommu->ctx_lowest_free = 1; - iommu->tbl.page_table_map_base = dma_offset; + iommu->page_table_map_base = dma_offset; iommu->dma_addr_mask = dma_addr_mask; /* Allocate and initialize the free area map. */ sz = num_tsb_entries / 8; sz = (sz + 7UL) & ~7UL; - iommu->tbl.map = kmalloc_node(sz, GFP_KERNEL, numa_node); - if (!iommu->tbl.map) + iommu->arena.map = kmalloc_node(sz, GFP_KERNEL, numa_node); + if (!iommu->arena.map) { + printk(KERN_ERR "IOMMU: Error, kmalloc(arena.map) failed.\n"); return -ENOMEM; - memset(iommu->tbl.map, 0, sz); - if (tlb_type != hypervisor) - iommu_sparc_ops.reset = NULL; /* not needed on on sun4v */ + } + memset(iommu->arena.map, 0, sz); + iommu->arena.limit = num_tsb_entries; - setup_iommu_pool_hash(); - iommu_tbl_pool_init(&iommu->tbl, num_tsb_entries, IO_PAGE_SHIFT, - &iommu_sparc_ops, false, 1); + if (tlb_type != hypervisor) + iommu->flush_all = iommu_flushall; /* Allocate and initialize the dummy page which we * set inactive IO PTEs to point to. 
@@ -169,20 +235,18 @@ out_free_dummy_page: iommu->dummy_page = 0UL; out_free_map: - kfree(iommu->tbl.map); - iommu->tbl.map = NULL; + kfree(iommu->arena.map); + iommu->arena.map = NULL; return -ENOMEM; } -static inline iopte_t *alloc_npages(struct device *dev, - struct iommu *iommu, +static inline iopte_t *alloc_npages(struct device *dev, struct iommu *iommu, unsigned long npages) { unsigned long entry; - entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, NULL, - __this_cpu_read(iommu_pool_hash)); + entry = iommu_range_alloc(dev, iommu, npages, NULL); if (unlikely(entry == DMA_ERROR_CODE)) return NULL; @@ -220,7 +284,7 @@ static void *dma_4u_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp, struct dma_attrs *attrs) { - unsigned long order, first_page; + unsigned long flags, order, first_page; struct iommu *iommu; struct page *page; int npages, nid; @@ -242,14 +306,16 @@ static void *dma_4u_alloc_coherent(struct device *dev, size_t size, iommu = dev->archdata.iommu; + spin_lock_irqsave(&iommu->lock, flags); iopte = alloc_npages(dev, iommu, size >> IO_PAGE_SHIFT); + spin_unlock_irqrestore(&iommu->lock, flags); if (unlikely(iopte == NULL)) { free_pages(first_page, order); return NULL; } - *dma_addrp = (iommu->tbl.page_table_map_base + + *dma_addrp = (iommu->page_table_map_base + ((iopte - iommu->page_table) << IO_PAGE_SHIFT)); ret = (void *) first_page; npages = size >> IO_PAGE_SHIFT; @@ -270,12 +336,16 @@ static void dma_4u_free_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { struct iommu *iommu; - unsigned long order, npages; + unsigned long flags, order, npages; npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT; iommu = dev->archdata.iommu; - iommu_tbl_range_free(&iommu->tbl, dvma, npages, false, NULL); + spin_lock_irqsave(&iommu->lock, flags); + + iommu_range_free(iommu, dvma, npages); + + spin_unlock_irqrestore(&iommu->lock, flags); order = get_order(size); if (order < 10) @@ -305,8 +375,8 @@ static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page, npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK); npages >>= IO_PAGE_SHIFT; - base = alloc_npages(dev, iommu, npages); spin_lock_irqsave(&iommu->lock, flags); + base = alloc_npages(dev, iommu, npages); ctx = 0; if (iommu->iommu_ctxflush) ctx = iommu_alloc_ctx(iommu); @@ -315,7 +385,7 @@ static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page, if (unlikely(!base)) goto bad; - bus_addr = (iommu->tbl.page_table_map_base + + bus_addr = (iommu->page_table_map_base + ((base - iommu->page_table) << IO_PAGE_SHIFT)); ret = bus_addr | (oaddr & ~IO_PAGE_MASK); base_paddr = __pa(oaddr & IO_PAGE_MASK); @@ -426,7 +496,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK); npages >>= IO_PAGE_SHIFT; base = iommu->page_table + - ((bus_addr - iommu->tbl.page_table_map_base) >> IO_PAGE_SHIFT); + ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT); bus_addr &= IO_PAGE_MASK; spin_lock_irqsave(&iommu->lock, flags); @@ -445,11 +515,11 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, for (i = 0; i < npages; i++) iopte_make_dummy(iommu, base + i); + iommu_range_free(iommu, bus_addr, npages); + iommu_free_ctx(iommu, ctx); - spin_unlock_irqrestore(&iommu->lock, flags); - iommu_tbl_range_free(&iommu->tbl, bus_addr, npages, - false, NULL); + spin_unlock_irqrestore(&iommu->lock, flags); } static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, 
@@ -497,7 +567,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, max_seg_size = dma_get_max_seg_size(dev); seg_boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, IO_PAGE_SIZE) >> IO_PAGE_SHIFT; - base_shift = iommu->tbl.page_table_map_base >> IO_PAGE_SHIFT; + base_shift = iommu->page_table_map_base >> IO_PAGE_SHIFT; for_each_sg(sglist, s, nelems, i) { unsigned long paddr, npages, entry, out_entry = 0, slen; iopte_t *base; @@ -511,8 +581,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, /* Allocate iommu entries for that segment */ paddr = (unsigned long) SG_ENT_PHYS_ADDRESS(s); npages = iommu_num_pages(paddr, slen, IO_PAGE_SIZE); - entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, &handle, - __this_cpu_read(iommu_pool_hash)); + entry = iommu_range_alloc(dev, iommu, npages, &handle); /* Handle failure */ if (unlikely(entry == DMA_ERROR_CODE)) { @@ -525,7 +594,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, base = iommu->page_table + entry; /* Convert entry to a dma_addr_t */ - dma_addr = iommu->tbl.page_table_map_base + + dma_addr = iommu->page_table_map_base + (entry << IO_PAGE_SHIFT); dma_addr |= (s->offset & ~IO_PAGE_MASK); @@ -585,17 +654,15 @@ iommu_map_failed: vaddr = s->dma_address & IO_PAGE_MASK; npages = iommu_num_pages(s->dma_address, s->dma_length, IO_PAGE_SIZE); + iommu_range_free(iommu, vaddr, npages); - entry = (vaddr - iommu->tbl.page_table_map_base) + entry = (vaddr - iommu->page_table_map_base) >> IO_PAGE_SHIFT; base = iommu->page_table + entry; for (j = 0; j < npages; j++) iopte_make_dummy(iommu, base + j); - iommu_tbl_range_free(&iommu->tbl, vaddr, npages, - false, NULL); - s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; } @@ -610,19 +677,17 @@ iommu_map_failed: /* If contexts are being used, they are the same in all of the mappings * we make for a particular SG. 
*/ -static unsigned long fetch_sg_ctx(struct iommu *iommu, - struct scatterlist *sg) +static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg) { unsigned long ctx = 0; if (iommu->iommu_ctxflush) { iopte_t *base; u32 bus_addr; - struct iommu_table *tbl = &iommu->tbl; bus_addr = sg->dma_address & IO_PAGE_MASK; base = iommu->page_table + - ((bus_addr - tbl->page_table_map_base) >> IO_PAGE_SHIFT); + ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT); ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL; } @@ -658,8 +723,9 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, if (!len) break; npages = iommu_num_pages(dma_handle, len, IO_PAGE_SIZE); + iommu_range_free(iommu, dma_handle, npages); - entry = ((dma_handle - iommu->tbl.page_table_map_base) + entry = ((dma_handle - iommu->page_table_map_base) >> IO_PAGE_SHIFT); base = iommu->page_table + entry; @@ -671,8 +737,6 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, for (i = 0; i < npages; i++) iopte_make_dummy(iommu, base + i); - iommu_tbl_range_free(&iommu->tbl, dma_handle, npages, false, - NULL); sg = sg_next(sg); } @@ -706,10 +770,9 @@ static void dma_4u_sync_single_for_cpu(struct device *dev, if (iommu->iommu_ctxflush && strbuf->strbuf_ctxflush) { iopte_t *iopte; - struct iommu_table *tbl = &iommu->tbl; iopte = iommu->page_table + - ((bus_addr - tbl->page_table_map_base)>>IO_PAGE_SHIFT); + ((bus_addr - iommu->page_table_map_base)>>IO_PAGE_SHIFT); ctx = (iopte_val(*iopte) & IOPTE_CONTEXT) >> 47UL; } @@ -742,10 +805,9 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev, if (iommu->iommu_ctxflush && strbuf->strbuf_ctxflush) { iopte_t *iopte; - struct iommu_table *tbl = &iommu->tbl; - iopte = iommu->page_table + ((sglist[0].dma_address - - tbl->page_table_map_base) >> IO_PAGE_SHIFT); + iopte = iommu->page_table + + ((sglist[0].dma_address - iommu->page_table_map_base) >> IO_PAGE_SHIFT); ctx = (iopte_val(*iopte) & IOPTE_CONTEXT) >> 47UL; } diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h index f4be0d724fc6..1ec0de4156e7 100644 --- a/arch/sparc/kernel/iommu_common.h +++ b/arch/sparc/kernel/iommu_common.h @@ -48,4 +48,12 @@ static inline int is_span_boundary(unsigned long entry, return iommu_is_span_boundary(entry, nr, shift, boundary_size); } +unsigned long iommu_range_alloc(struct device *dev, + struct iommu *iommu, + unsigned long npages, + unsigned long *handle); +void iommu_range_free(struct iommu *iommu, + dma_addr_t dma_addr, + unsigned long npages); + #endif /* _IOMMU_COMMON_H */ diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c index d485697c37c0..274a9f59d95c 100644 --- a/arch/sparc/kernel/ldc.c +++ b/arch/sparc/kernel/ldc.c @@ -15,8 +15,6 @@ #include #include #include -#include -#include #include #include @@ -29,11 +27,6 @@ #define DRV_MODULE_VERSION "1.1" #define DRV_MODULE_RELDATE "July 22, 2008" -#define COOKIE_PGSZ_CODE 0xf000000000000000ULL -#define COOKIE_PGSZ_CODE_SHIFT 60ULL - -static DEFINE_PER_CPU(unsigned int, ldc_pool_hash); - static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; #define LDC_PACKET_SIZE 64 @@ -105,10 +98,10 @@ static const struct ldc_mode_ops stream_ops; int ldom_domaining_enabled; struct ldc_iommu { - /* Protects ldc_unmap. */ + /* Protects arena alloc/free. 
*/ spinlock_t lock; + struct iommu_arena arena; struct ldc_mtable_entry *page_table; - struct iommu_table iommu_table; }; struct ldc_channel { @@ -1005,85 +998,31 @@ static void free_queue(unsigned long num_entries, struct ldc_packet *q) free_pages((unsigned long)q, order); } -static unsigned long ldc_cookie_to_index(u64 cookie, void *arg) -{ - u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT; - /* struct ldc_iommu *ldc_iommu = (struct ldc_iommu *)arg; */ - - cookie &= ~COOKIE_PGSZ_CODE; - - return (cookie >> (13ULL + (szcode * 3ULL))); -} - -struct ldc_demap_arg { - struct ldc_iommu *ldc_iommu; - u64 cookie; - unsigned long id; -}; - -static void ldc_demap(void *arg, unsigned long entry, unsigned long npages) -{ - struct ldc_demap_arg *ldc_demap_arg = arg; - struct ldc_iommu *iommu = ldc_demap_arg->ldc_iommu; - unsigned long id = ldc_demap_arg->id; - u64 cookie = ldc_demap_arg->cookie; - struct ldc_mtable_entry *base; - unsigned long i, shift; - - shift = (cookie >> COOKIE_PGSZ_CODE_SHIFT) * 3; - base = iommu->page_table + entry; - for (i = 0; i < npages; i++) { - if (base->cookie) - sun4v_ldc_revoke(id, cookie + (i << shift), - base->cookie); - base->mte = 0; - } -} - /* XXX Make this configurable... XXX */ #define LDC_IOTABLE_SIZE (8 * 1024) -struct iommu_tbl_ops ldc_iommu_ops = { - .cookie_to_index = ldc_cookie_to_index, - .demap = ldc_demap, -}; - -static void setup_ldc_pool_hash(void) -{ - unsigned int i; - static bool do_once; - - if (do_once) - return; - do_once = true; - for_each_possible_cpu(i) - per_cpu(ldc_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); -} - - -static int ldc_iommu_init(const char *name, struct ldc_channel *lp) +static int ldc_iommu_init(struct ldc_channel *lp) { unsigned long sz, num_tsb_entries, tsbsize, order; - struct ldc_iommu *ldc_iommu = &lp->iommu; - struct iommu_table *iommu = &ldc_iommu->iommu_table; + struct ldc_iommu *iommu = &lp->iommu; struct ldc_mtable_entry *table; unsigned long hv_err; int err; num_tsb_entries = LDC_IOTABLE_SIZE; tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); - setup_ldc_pool_hash(); - spin_lock_init(&ldc_iommu->lock); + + spin_lock_init(&iommu->lock); sz = num_tsb_entries / 8; sz = (sz + 7UL) & ~7UL; - iommu->map = kzalloc(sz, GFP_KERNEL); - if (!iommu->map) { + iommu->arena.map = kzalloc(sz, GFP_KERNEL); + if (!iommu->arena.map) { printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz); return -ENOMEM; } - iommu_tbl_pool_init(iommu, num_tsb_entries, PAGE_SHIFT, - &ldc_iommu_ops, false, 1); + + iommu->arena.limit = num_tsb_entries; order = get_order(tsbsize); @@ -1098,7 +1037,7 @@ static int ldc_iommu_init(const char *name, struct ldc_channel *lp) memset(table, 0, PAGE_SIZE << order); - ldc_iommu->page_table = table; + iommu->page_table = table; hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table), num_tsb_entries); @@ -1110,32 +1049,31 @@ static int ldc_iommu_init(const char *name, struct ldc_channel *lp) out_free_table: free_pages((unsigned long) table, order); - ldc_iommu->page_table = NULL; + iommu->page_table = NULL; out_free_map: - kfree(iommu->map); - iommu->map = NULL; + kfree(iommu->arena.map); + iommu->arena.map = NULL; return err; } static void ldc_iommu_release(struct ldc_channel *lp) { - struct ldc_iommu *ldc_iommu = &lp->iommu; - struct iommu_table *iommu = &ldc_iommu->iommu_table; + struct ldc_iommu *iommu = &lp->iommu; unsigned long num_tsb_entries, tsbsize, order; (void) sun4v_ldc_set_map_table(lp->id, 0, 0); - num_tsb_entries = iommu->poolsize * iommu->nr_pools; + num_tsb_entries = 
iommu->arena.limit; tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry); order = get_order(tsbsize); - free_pages((unsigned long) ldc_iommu->page_table, order); - ldc_iommu->page_table = NULL; + free_pages((unsigned long) iommu->page_table, order); + iommu->page_table = NULL; - kfree(iommu->map); - iommu->map = NULL; + kfree(iommu->arena.map); + iommu->arena.map = NULL; } struct ldc_channel *ldc_alloc(unsigned long id, @@ -1202,7 +1140,7 @@ struct ldc_channel *ldc_alloc(unsigned long id, lp->id = id; - err = ldc_iommu_init(name, lp); + err = ldc_iommu_init(lp); if (err) goto out_free_ldc; @@ -1947,6 +1885,40 @@ int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size) } EXPORT_SYMBOL(ldc_read); +static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages) +{ + struct iommu_arena *arena = &iommu->arena; + unsigned long n, start, end, limit; + int pass; + + limit = arena->limit; + start = arena->hint; + pass = 0; + +again: + n = bitmap_find_next_zero_area(arena->map, limit, start, npages, 0); + end = n + npages; + if (unlikely(end >= limit)) { + if (likely(pass < 1)) { + limit = start; + start = 0; + pass++; + goto again; + } else { + /* Scanned the whole thing, give up. */ + return -1; + } + } + bitmap_set(arena->map, n, npages); + + arena->hint = end; + + return n; +} + +#define COOKIE_PGSZ_CODE 0xf000000000000000ULL +#define COOKIE_PGSZ_CODE_SHIFT 60ULL + static u64 pagesize_code(void) { switch (PAGE_SIZE) { @@ -1973,14 +1945,23 @@ static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset) page_offset); } +static u64 cookie_to_index(u64 cookie, unsigned long *shift) +{ + u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT; + + cookie &= ~COOKIE_PGSZ_CODE; + + *shift = szcode * 3; + + return (cookie >> (13ULL + (szcode * 3ULL))); +} static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu, unsigned long npages) { long entry; - entry = iommu_tbl_range_alloc(NULL, &iommu->iommu_table, npages, - NULL, __this_cpu_read(ldc_pool_hash)); + entry = arena_alloc(iommu, npages); if (unlikely(entry < 0)) return NULL; @@ -2109,7 +2090,7 @@ int ldc_map_sg(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, int ncookies, unsigned int map_perm) { - unsigned long i, npages; + unsigned long i, npages, flags; struct ldc_mtable_entry *base; struct cookie_state state; struct ldc_iommu *iommu; @@ -2128,7 +2109,9 @@ int ldc_map_sg(struct ldc_channel *lp, iommu = &lp->iommu; + spin_lock_irqsave(&iommu->lock, flags); base = alloc_npages(iommu, npages); + spin_unlock_irqrestore(&iommu->lock, flags); if (!base) return -ENOMEM; @@ -2153,7 +2136,7 @@ int ldc_map_single(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, int ncookies, unsigned int map_perm) { - unsigned long npages, pa; + unsigned long npages, pa, flags; struct ldc_mtable_entry *base; struct cookie_state state; struct ldc_iommu *iommu; @@ -2169,7 +2152,9 @@ int ldc_map_single(struct ldc_channel *lp, iommu = &lp->iommu; + spin_lock_irqsave(&iommu->lock, flags); base = alloc_npages(iommu, npages); + spin_unlock_irqrestore(&iommu->lock, flags); if (!base) return -ENOMEM; @@ -2187,29 +2172,35 @@ int ldc_map_single(struct ldc_channel *lp, } EXPORT_SYMBOL(ldc_map_single); - static void free_npages(unsigned long id, struct ldc_iommu *iommu, u64 cookie, u64 size) { - unsigned long npages; - struct ldc_demap_arg demap_arg; - - demap_arg.ldc_iommu = iommu; - demap_arg.cookie = cookie; - demap_arg.id = id; + struct iommu_arena *arena = &iommu->arena; + unsigned long i, shift, index, npages; + struct 
ldc_mtable_entry *base; npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT; - iommu_tbl_range_free(&iommu->iommu_table, cookie, npages, true, - &demap_arg); + index = cookie_to_index(cookie, &shift); + base = iommu->page_table + index; + + BUG_ON(index > arena->limit || + (index + npages) > arena->limit); + for (i = 0; i < npages; i++) { + if (base->cookie) + sun4v_ldc_revoke(id, cookie + (i << shift), + base->cookie); + base->mte = 0; + __clear_bit(index + i, arena->map); + } } void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies, int ncookies) { struct ldc_iommu *iommu = &lp->iommu; - int i; unsigned long flags; + int i; spin_lock_irqsave(&iommu->lock, flags); for (i = 0; i < ncookies; i++) { diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 9b76b9d639e1..47ddbd496a1e 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -15,8 +15,6 @@ #include #include #include -#include -#include #include #include @@ -30,7 +28,6 @@ #define DRIVER_NAME "pci_sun4v" #define PFX DRIVER_NAME ": " -static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); static unsigned long vpci_major = 1; static unsigned long vpci_minor = 1; @@ -158,13 +155,14 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, iommu = dev->archdata.iommu; - entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, NULL, - __this_cpu_read(iommu_pool_hash)); + spin_lock_irqsave(&iommu->lock, flags); + entry = iommu_range_alloc(dev, iommu, npages, NULL); + spin_unlock_irqrestore(&iommu->lock, flags); if (unlikely(entry == DMA_ERROR_CODE)) goto range_alloc_fail; - *dma_addrp = (iommu->tbl.page_table_map_base + + *dma_addrp = (iommu->page_table_map_base + (entry << IO_PAGE_SHIFT)); ret = (void *) first_page; first_page = __pa(first_page); @@ -190,46 +188,45 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, return ret; iommu_map_fail: - iommu_tbl_range_free(&iommu->tbl, *dma_addrp, npages, false, NULL); + /* Interrupts are disabled. 
*/ + spin_lock(&iommu->lock); + iommu_range_free(iommu, *dma_addrp, npages); + spin_unlock_irqrestore(&iommu->lock, flags); range_alloc_fail: free_pages(first_page, order); return NULL; } -static void dma_4v_iommu_demap(void *demap_arg, unsigned long entry, - unsigned long npages) -{ - u32 devhandle = *(u32 *)demap_arg; - unsigned long num, flags; - - local_irq_save(flags); - do { - num = pci_sun4v_iommu_demap(devhandle, - HV_PCI_TSBID(0, entry), - npages); - - entry += num; - npages -= num; - } while (npages != 0); - local_irq_restore(flags); -} - static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu, dma_addr_t dvma, struct dma_attrs *attrs) { struct pci_pbm_info *pbm; struct iommu *iommu; - unsigned long order, npages, entry; + unsigned long flags, order, npages, entry; u32 devhandle; npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT; iommu = dev->archdata.iommu; pbm = dev->archdata.host_controller; devhandle = pbm->devhandle; - entry = ((dvma - iommu->tbl.page_table_map_base) >> IO_PAGE_SHIFT); - dma_4v_iommu_demap(&devhandle, entry, npages); - iommu_tbl_range_free(&iommu->tbl, dvma, npages, false, NULL); + entry = ((dvma - iommu->page_table_map_base) >> IO_PAGE_SHIFT); + + spin_lock_irqsave(&iommu->lock, flags); + + iommu_range_free(iommu, dvma, npages); + + do { + unsigned long num; + + num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry), + npages); + entry += num; + npages -= num; + } while (npages != 0); + + spin_unlock_irqrestore(&iommu->lock, flags); + order = get_order(size); if (order < 10) free_pages((unsigned long)cpu, order); @@ -256,13 +253,14 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK); npages >>= IO_PAGE_SHIFT; - entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, NULL, - __this_cpu_read(iommu_pool_hash)); + spin_lock_irqsave(&iommu->lock, flags); + entry = iommu_range_alloc(dev, iommu, npages, NULL); + spin_unlock_irqrestore(&iommu->lock, flags); if (unlikely(entry == DMA_ERROR_CODE)) goto bad; - bus_addr = (iommu->tbl.page_table_map_base + + bus_addr = (iommu->page_table_map_base + (entry << IO_PAGE_SHIFT)); ret = bus_addr | (oaddr & ~IO_PAGE_MASK); base_paddr = __pa(oaddr & IO_PAGE_MASK); @@ -292,7 +290,11 @@ bad: return DMA_ERROR_CODE; iommu_map_fail: - iommu_tbl_range_free(&iommu->tbl, bus_addr, npages, false, NULL); + /* Interrupts are disabled. 
*/ + spin_lock(&iommu->lock); + iommu_range_free(iommu, bus_addr, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + return DMA_ERROR_CODE; } @@ -302,7 +304,7 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr, { struct pci_pbm_info *pbm; struct iommu *iommu; - unsigned long npages; + unsigned long flags, npages; long entry; u32 devhandle; @@ -319,9 +321,22 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr, npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK); npages >>= IO_PAGE_SHIFT; bus_addr &= IO_PAGE_MASK; - entry = (bus_addr - iommu->tbl.page_table_map_base) >> IO_PAGE_SHIFT; - dma_4v_iommu_demap(&devhandle, entry, npages); - iommu_tbl_range_free(&iommu->tbl, bus_addr, npages, false, NULL); + + spin_lock_irqsave(&iommu->lock, flags); + + iommu_range_free(iommu, bus_addr, npages); + + entry = (bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT; + do { + unsigned long num; + + num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry), + npages); + entry += num; + npages -= num; + } while (npages != 0); + + spin_unlock_irqrestore(&iommu->lock, flags); } static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, @@ -356,14 +371,14 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, /* Init first segment length for backout at failure */ outs->dma_length = 0; - local_irq_save(flags); + spin_lock_irqsave(&iommu->lock, flags); iommu_batch_start(dev, prot, ~0UL); max_seg_size = dma_get_max_seg_size(dev); seg_boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, IO_PAGE_SIZE) >> IO_PAGE_SHIFT; - base_shift = iommu->tbl.page_table_map_base >> IO_PAGE_SHIFT; + base_shift = iommu->page_table_map_base >> IO_PAGE_SHIFT; for_each_sg(sglist, s, nelems, i) { unsigned long paddr, npages, entry, out_entry = 0, slen; @@ -376,8 +391,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, /* Allocate iommu entries for that segment */ paddr = (unsigned long) SG_ENT_PHYS_ADDRESS(s); npages = iommu_num_pages(paddr, slen, IO_PAGE_SIZE); - entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, &handle, - __this_cpu_read(iommu_pool_hash)); + entry = iommu_range_alloc(dev, iommu, npages, &handle); /* Handle failure */ if (unlikely(entry == DMA_ERROR_CODE)) { @@ -390,7 +404,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, iommu_batch_new_entry(entry); /* Convert entry to a dma_addr_t */ - dma_addr = iommu->tbl.page_table_map_base + + dma_addr = iommu->page_table_map_base + (entry << IO_PAGE_SHIFT); dma_addr |= (s->offset & ~IO_PAGE_MASK); @@ -437,7 +451,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, if (unlikely(err < 0L)) goto iommu_map_failed; - local_irq_restore(flags); + spin_unlock_irqrestore(&iommu->lock, flags); if (outcount < incount) { outs = sg_next(outs); @@ -455,8 +469,7 @@ iommu_map_failed: vaddr = s->dma_address & IO_PAGE_MASK; npages = iommu_num_pages(s->dma_address, s->dma_length, IO_PAGE_SIZE); - iommu_tbl_range_free(&iommu->tbl, vaddr, npages, - false, NULL); + iommu_range_free(iommu, vaddr, npages); /* XXX demap? 
XXX */ s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; @@ -464,7 +477,7 @@ iommu_map_failed: if (s == outs) break; } - local_irq_restore(flags); + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } @@ -476,7 +489,7 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist, struct pci_pbm_info *pbm; struct scatterlist *sg; struct iommu *iommu; - unsigned long flags, entry; + unsigned long flags; u32 devhandle; BUG_ON(direction == DMA_NONE); @@ -485,27 +498,33 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist, pbm = dev->archdata.host_controller; devhandle = pbm->devhandle; - local_irq_save(flags); + spin_lock_irqsave(&iommu->lock, flags); sg = sglist; while (nelems--) { dma_addr_t dma_handle = sg->dma_address; unsigned int len = sg->dma_length; - unsigned long npages; - struct iommu_table *tbl = &iommu->tbl; - unsigned long shift = IO_PAGE_SHIFT; + unsigned long npages, entry; if (!len) break; npages = iommu_num_pages(dma_handle, len, IO_PAGE_SIZE); - entry = ((dma_handle - tbl->page_table_map_base) >> shift); - dma_4v_iommu_demap(&devhandle, entry, npages); - iommu_tbl_range_free(&iommu->tbl, dma_handle, npages, - false, NULL); + iommu_range_free(iommu, dma_handle, npages); + + entry = ((dma_handle - iommu->page_table_map_base) >> IO_PAGE_SHIFT); + while (npages) { + unsigned long num; + + num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry), + npages); + entry += num; + npages -= num; + } + sg = sg_next(sg); } - local_irq_restore(flags); + spin_unlock_irqrestore(&iommu->lock, flags); } static struct dma_map_ops sun4v_dma_ops = { @@ -517,8 +536,6 @@ static struct dma_map_ops sun4v_dma_ops = { .unmap_sg = dma_4v_unmap_sg, }; -static struct iommu_tbl_ops dma_4v_iommu_ops; - static void pci_sun4v_scan_bus(struct pci_pbm_info *pbm, struct device *parent) { struct property *prop; @@ -533,33 +550,30 @@ static void pci_sun4v_scan_bus(struct pci_pbm_info *pbm, struct device *parent) } static unsigned long probe_existing_entries(struct pci_pbm_info *pbm, - struct iommu_table *iommu) + struct iommu *iommu) { - struct iommu_pool *pool; - unsigned long i, pool_nr, cnt = 0; + struct iommu_arena *arena = &iommu->arena; + unsigned long i, cnt = 0; u32 devhandle; devhandle = pbm->devhandle; - for (pool_nr = 0; pool_nr < iommu->nr_pools; pool_nr++) { - pool = &(iommu->arena_pool[pool_nr]); - for (i = pool->start; i <= pool->end; i++) { - unsigned long ret, io_attrs, ra; - - ret = pci_sun4v_iommu_getmap(devhandle, - HV_PCI_TSBID(0, i), - &io_attrs, &ra); - if (ret == HV_EOK) { - if (page_in_phys_avail(ra)) { - pci_sun4v_iommu_demap(devhandle, - HV_PCI_TSBID(0, - i), 1); - } else { - cnt++; - __set_bit(i, iommu->map); - } + for (i = 0; i < arena->limit; i++) { + unsigned long ret, io_attrs, ra; + + ret = pci_sun4v_iommu_getmap(devhandle, + HV_PCI_TSBID(0, i), + &io_attrs, &ra); + if (ret == HV_EOK) { + if (page_in_phys_avail(ra)) { + pci_sun4v_iommu_demap(devhandle, + HV_PCI_TSBID(0, i), 1); + } else { + cnt++; + __set_bit(i, arena->map); } } } + return cnt; } @@ -587,22 +601,22 @@ static int pci_sun4v_iommu_init(struct pci_pbm_info *pbm) dma_offset = vdma[0]; /* Setup initial software IOMMU state. */ + spin_lock_init(&iommu->lock); iommu->ctx_lowest_free = 1; - iommu->tbl.page_table_map_base = dma_offset; + iommu->page_table_map_base = dma_offset; iommu->dma_addr_mask = dma_mask; /* Allocate and initialize the free area map. 
*/ sz = (num_tsb_entries + 7) / 8; sz = (sz + 7UL) & ~7UL; - iommu->tbl.map = kzalloc(sz, GFP_KERNEL); - if (!iommu->tbl.map) { + iommu->arena.map = kzalloc(sz, GFP_KERNEL); + if (!iommu->arena.map) { printk(KERN_ERR PFX "Error, kmalloc(arena.map) failed.\n"); return -ENOMEM; } - iommu_tbl_pool_init(&iommu->tbl, num_tsb_entries, IO_PAGE_SHIFT, - &dma_4v_iommu_ops, false /* no large_pool */, - 0 /* default npools */); - sz = probe_existing_entries(pbm, &iommu->tbl); + iommu->arena.limit = num_tsb_entries; + + sz = probe_existing_entries(pbm, iommu); if (sz) printk("%s: Imported %lu TSB entries from OBP\n", pbm->name, sz); @@ -1001,17 +1015,8 @@ static struct platform_driver pci_sun4v_driver = { .probe = pci_sun4v_probe, }; -static void setup_iommu_pool_hash(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); -} - static int __init pci_sun4v_init(void) { - setup_iommu_pool_hash(); return platform_driver_register(&pci_sun4v_driver); } diff --git a/include/linux/iommu-common.h b/include/linux/iommu-common.h deleted file mode 100644 index 6be5c863f329..000000000000 --- a/include/linux/iommu-common.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef _LINUX_IOMMU_COMMON_H -#define _LINUX_IOMMU_COMMON_H - -#include -#include -#include - -#define IOMMU_POOL_HASHBITS 4 -#define IOMMU_NR_POOLS (1 << IOMMU_POOL_HASHBITS) - -struct iommu_pool { - unsigned long start; - unsigned long end; - unsigned long hint; - spinlock_t lock; -}; - -struct iommu_table; - -struct iommu_tbl_ops { - unsigned long (*cookie_to_index)(u64, void *); - void (*demap)(void *, unsigned long, unsigned long); - void (*reset)(struct iommu_table *); -}; - -struct iommu_table { - unsigned long page_table_map_base; - unsigned long page_table_shift; - unsigned long nr_pools; - const struct iommu_tbl_ops *iommu_tbl_ops; - unsigned long poolsize; - struct iommu_pool arena_pool[IOMMU_NR_POOLS]; - u32 flags; -#define IOMMU_HAS_LARGE_POOL 0x00000001 - struct iommu_pool large_pool; - unsigned long *map; -}; - -extern void iommu_tbl_pool_init(struct iommu_table *iommu, - unsigned long num_entries, - u32 page_table_shift, - const struct iommu_tbl_ops *iommu_tbl_ops, - bool large_pool, u32 npools); - -extern unsigned long iommu_tbl_range_alloc(struct device *dev, - struct iommu_table *iommu, - unsigned long npages, - unsigned long *handle, - unsigned int pool_hash); - -extern void iommu_tbl_range_free(struct iommu_table *iommu, - u64 dma_addr, unsigned long npages, - bool do_demap, void *demap_arg); - -#endif diff --git a/lib/Makefile b/lib/Makefile index 6c37933336a0..da6116b21555 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -106,7 +106,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o +obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o diff --git a/lib/iommu-common.c b/lib/iommu-common.c deleted file mode 100644 index fac4f35250c9..000000000000 --- a/lib/iommu-common.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * IOMMU mmap management and range allocation functions. - * Based almost entirely upon the powerpc iommu allocator. 
- */ - -#include -#include -#include -#include -#include -#include - -#ifndef DMA_ERROR_CODE -#define DMA_ERROR_CODE (~(dma_addr_t)0x0) -#endif - -#define IOMMU_LARGE_ALLOC 15 - -/* - * Initialize iommu_pool entries for the iommu_table. `num_entries' - * is the number of table entries. If `large_pool' is set to true, - * the top 1/4 of the table will be set aside for pool allocations - * of more than IOMMU_LARGE_ALLOC pages. - */ -extern void iommu_tbl_pool_init(struct iommu_table *iommu, - unsigned long num_entries, - u32 page_table_shift, - const struct iommu_tbl_ops *iommu_tbl_ops, - bool large_pool, u32 npools) -{ - unsigned int start, i; - struct iommu_pool *p = &(iommu->large_pool); - - if (npools == 0) - iommu->nr_pools = IOMMU_NR_POOLS; - else - iommu->nr_pools = npools; - BUG_ON(npools > IOMMU_NR_POOLS); - - iommu->page_table_shift = page_table_shift; - iommu->iommu_tbl_ops = iommu_tbl_ops; - start = 0; - if (large_pool) - iommu->flags |= IOMMU_HAS_LARGE_POOL; - - if (!large_pool) - iommu->poolsize = num_entries/iommu->nr_pools; - else - iommu->poolsize = (num_entries * 3 / 4)/iommu->nr_pools; - for (i = 0; i < iommu->nr_pools; i++) { - spin_lock_init(&(iommu->arena_pool[i].lock)); - iommu->arena_pool[i].start = start; - iommu->arena_pool[i].hint = start; - start += iommu->poolsize; /* start for next pool */ - iommu->arena_pool[i].end = start - 1; - } - if (!large_pool) - return; - /* initialize large_pool */ - spin_lock_init(&(p->lock)); - p->start = start; - p->hint = p->start; - p->end = num_entries; -} -EXPORT_SYMBOL(iommu_tbl_pool_init); - -unsigned long iommu_tbl_range_alloc(struct device *dev, - struct iommu_table *iommu, - unsigned long npages, - unsigned long *handle, - unsigned int pool_hash) -{ - unsigned long n, end, start, limit, boundary_size; - struct iommu_pool *arena; - int pass = 0; - unsigned int pool_nr; - unsigned int npools = iommu->nr_pools; - unsigned long flags; - bool large_pool = ((iommu->flags & IOMMU_HAS_LARGE_POOL) != 0); - bool largealloc = (large_pool && npages > IOMMU_LARGE_ALLOC); - unsigned long shift; - - /* Sanity check */ - if (unlikely(npages == 0)) { - printk_ratelimited("npages == 0\n"); - return DMA_ERROR_CODE; - } - - if (largealloc) { - arena = &(iommu->large_pool); - spin_lock_irqsave(&arena->lock, flags); - pool_nr = 0; /* to keep compiler happy */ - } else { - /* pick out pool_nr */ - pool_nr = pool_hash & (npools - 1); - arena = &(iommu->arena_pool[pool_nr]); - - /* find first available unlocked pool */ - while (!spin_trylock_irqsave(&(arena->lock), flags)) { - pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); - arena = &(iommu->arena_pool[pool_nr]); - } - } - - again: - if (pass == 0 && handle && *handle && - (*handle >= arena->start) && (*handle < arena->end)) - start = *handle; - else - start = arena->hint; - - limit = arena->end; - - /* The case below can happen if we have a small segment appended - * to a large, or when the previous alloc was at the very end of - * the available space. If so, go back to the beginning and flush. 
- */ - if (start >= limit) { - start = arena->start; - if (iommu->iommu_tbl_ops->reset != NULL) - iommu->iommu_tbl_ops->reset(iommu); - } - - if (dev) - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - 1 << iommu->page_table_shift); - else - boundary_size = ALIGN(1ULL << 32, 1 << iommu->page_table_shift); - - shift = iommu->page_table_map_base >> iommu->page_table_shift; - boundary_size = boundary_size >> iommu->page_table_shift; - /* - * if the iommu has a non-trivial cookie <-> index mapping, we set - * things up so that iommu_is_span_boundary() merely checks if the - * (index + npages) < num_tsb_entries - */ - if (iommu->iommu_tbl_ops->cookie_to_index != NULL) { - shift = 0; - boundary_size = iommu->poolsize * iommu->nr_pools; - } - n = iommu_area_alloc(iommu->map, limit, start, npages, shift, - boundary_size, 0); - if (n == -1) { - if (likely(pass == 0)) { - /* First failure, rescan from the beginning. */ - arena->hint = arena->start; - if (iommu->iommu_tbl_ops->reset != NULL) - iommu->iommu_tbl_ops->reset(iommu); - pass++; - goto again; - } else if (!largealloc && pass <= iommu->nr_pools) { - spin_unlock(&(arena->lock)); - pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); - arena = &(iommu->arena_pool[pool_nr]); - while (!spin_trylock(&(arena->lock))) { - pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); - arena = &(iommu->arena_pool[pool_nr]); - } - arena->hint = arena->start; - pass++; - goto again; - } else { - /* give up */ - spin_unlock_irqrestore(&(arena->lock), flags); - return DMA_ERROR_CODE; - } - } - - end = n + npages; - - arena->hint = end; - - /* Update handle for SG allocations */ - if (handle) - *handle = end; - spin_unlock_irqrestore(&(arena->lock), flags); - - return n; -} -EXPORT_SYMBOL(iommu_tbl_range_alloc); - -static struct iommu_pool *get_pool(struct iommu_table *tbl, - unsigned long entry) -{ - struct iommu_pool *p; - unsigned long largepool_start = tbl->large_pool.start; - bool large_pool = ((tbl->flags & IOMMU_HAS_LARGE_POOL) != 0); - - /* The large pool is the last pool at the top of the table */ - if (large_pool && entry >= largepool_start) { - p = &tbl->large_pool; - } else { - unsigned int pool_nr = entry / tbl->poolsize; - - BUG_ON(pool_nr >= tbl->nr_pools); - p = &tbl->arena_pool[pool_nr]; - } - return p; -} - -void iommu_tbl_range_free(struct iommu_table *iommu, u64 dma_addr, - unsigned long npages, bool do_demap, void *demap_arg) -{ - unsigned long entry; - struct iommu_pool *pool; - unsigned long flags; - unsigned long shift = iommu->page_table_shift; - - if (iommu->iommu_tbl_ops->cookie_to_index != NULL) { - entry = (*iommu->iommu_tbl_ops->cookie_to_index)(dma_addr, - demap_arg); - } else { - entry = (dma_addr - iommu->page_table_map_base) >> shift; - } - pool = get_pool(iommu, entry); - - spin_lock_irqsave(&(pool->lock), flags); - if (do_demap && iommu->iommu_tbl_ops->demap != NULL) - (*iommu->iommu_tbl_ops->demap)(demap_arg, entry, npages); - - bitmap_clear(iommu->map, entry, npages); - spin_unlock_irqrestore(&(pool->lock), flags); -} -EXPORT_SYMBOL(iommu_tbl_range_free); -- cgit v1.2.3 From ff7d37a502022149655c18035b99a53391be0383 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 9 Apr 2015 15:33:30 -0400 Subject: Break up monolithic iommu table/lock into finer graularity pools and lock Investigation of multithreaded iperf experiments on an ethernet interface show the iommu->lock as the hottest lock identified by lockstat, with something of the order of 21M contentions out of 27M acquisitions, and an average wait time of 26 us for the 
lock. This is not efficient. A more scalable design is to follow the ppc model, where the iommu_map_table has multiple pools, each stretching over a segment of the map, and with a separate lock for each pool. This model allows for better parallelization of the iommu map search. This patch adds the iommu range alloc/free function infrastructure. Signed-off-by: Sowmini Varadhan Acked-by: Benjamin Herrenschmidt Signed-off-by: David S. Miller --- include/linux/iommu-common.h | 51 +++++++++ lib/Makefile | 2 +- lib/iommu-common.c | 266 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 include/linux/iommu-common.h create mode 100644 lib/iommu-common.c (limited to 'lib') diff --git a/include/linux/iommu-common.h b/include/linux/iommu-common.h new file mode 100644 index 000000000000..bbced83b32ee --- /dev/null +++ b/include/linux/iommu-common.h @@ -0,0 +1,51 @@ +#ifndef _LINUX_IOMMU_COMMON_H +#define _LINUX_IOMMU_COMMON_H + +#include +#include +#include + +#define IOMMU_POOL_HASHBITS 4 +#define IOMMU_NR_POOLS (1 << IOMMU_POOL_HASHBITS) + +struct iommu_pool { + unsigned long start; + unsigned long end; + unsigned long hint; + spinlock_t lock; +}; + +struct iommu_map_table { + unsigned long table_map_base; + unsigned long table_shift; + unsigned long nr_pools; + void (*lazy_flush)(struct iommu_map_table *); + unsigned long poolsize; + struct iommu_pool pools[IOMMU_NR_POOLS]; + u32 flags; +#define IOMMU_HAS_LARGE_POOL 0x00000001 +#define IOMMU_NO_SPAN_BOUND 0x00000002 +#define IOMMU_NEED_FLUSH 0x00000004 + struct iommu_pool large_pool; + unsigned long *map; +}; + +extern void iommu_tbl_pool_init(struct iommu_map_table *iommu, + unsigned long num_entries, + u32 table_shift, + void (*lazy_flush)(struct iommu_map_table *), + bool large_pool, u32 npools, + bool skip_span_boundary_check); + +extern unsigned long iommu_tbl_range_alloc(struct device *dev, + struct iommu_map_table *iommu, + unsigned long npages, + unsigned long *handle, + unsigned long mask, + unsigned int align_order); + +extern void iommu_tbl_range_free(struct iommu_map_table *iommu, + u64 dma_addr, unsigned long npages, + unsigned long entry); + +#endif diff --git a/lib/Makefile b/lib/Makefile index da6116b21555..6c37933336a0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -106,7 +106,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o +obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o diff --git a/lib/iommu-common.c b/lib/iommu-common.c new file mode 100644 index 000000000000..b99f1d744a8d --- /dev/null +++ b/lib/iommu-common.c @@ -0,0 +1,266 @@ +/* + * IOMMU mmap management and range allocation functions. + * Based almost entirely upon the powerpc iommu allocator. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +unsigned long iommu_large_alloc = 15; + +static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); + +static inline bool need_flush(struct iommu_map_table *iommu) +{ + return (iommu->lazy_flush != NULL && + (iommu->flags & IOMMU_NEED_FLUSH) != 0); +} + +static inline void set_flush(struct iommu_map_table *iommu) +{ + iommu->flags |= IOMMU_NEED_FLUSH; +} + +static inline void clear_flush(struct iommu_map_table *iommu) +{ + iommu->flags &= ~IOMMU_NEED_FLUSH; +} + +static void setup_iommu_pool_hash(void) +{ + unsigned int i; + static bool do_once; + + if (do_once) + return; + do_once = true; + for_each_possible_cpu(i) + per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); +} + +/* + * Initialize iommu_pool entries for the iommu_map_table. `num_entries' + * is the number of table entries. If `large_pool' is set to true, + * the top 1/4 of the table will be set aside for pool allocations + * of more than iommu_large_alloc pages. + */ +extern void iommu_tbl_pool_init(struct iommu_map_table *iommu, + unsigned long num_entries, + u32 table_shift, + void (*lazy_flush)(struct iommu_map_table *), + bool large_pool, u32 npools, + bool skip_span_boundary_check) +{ + unsigned int start, i; + struct iommu_pool *p = &(iommu->large_pool); + + setup_iommu_pool_hash(); + if (npools == 0) + iommu->nr_pools = IOMMU_NR_POOLS; + else + iommu->nr_pools = npools; + BUG_ON(npools > IOMMU_NR_POOLS); + + iommu->table_shift = table_shift; + iommu->lazy_flush = lazy_flush; + start = 0; + if (skip_span_boundary_check) + iommu->flags |= IOMMU_NO_SPAN_BOUND; + if (large_pool) + iommu->flags |= IOMMU_HAS_LARGE_POOL; + + if (!large_pool) + iommu->poolsize = num_entries/iommu->nr_pools; + else + iommu->poolsize = (num_entries * 3 / 4)/iommu->nr_pools; + for (i = 0; i < iommu->nr_pools; i++) { + spin_lock_init(&(iommu->pools[i].lock)); + iommu->pools[i].start = start; + iommu->pools[i].hint = start; + start += iommu->poolsize; /* start for next pool */ + iommu->pools[i].end = start - 1; + } + if (!large_pool) + return; + /* initialize large_pool */ + spin_lock_init(&(p->lock)); + p->start = start; + p->hint = p->start; + p->end = num_entries; +} +EXPORT_SYMBOL(iommu_tbl_pool_init); + +unsigned long iommu_tbl_range_alloc(struct device *dev, + struct iommu_map_table *iommu, + unsigned long npages, + unsigned long *handle, + unsigned long mask, + unsigned int align_order) +{ + unsigned int pool_hash = __this_cpu_read(iommu_pool_hash); + unsigned long n, end, start, limit, boundary_size; + struct iommu_pool *pool; + int pass = 0; + unsigned int pool_nr; + unsigned int npools = iommu->nr_pools; + unsigned long flags; + bool large_pool = ((iommu->flags & IOMMU_HAS_LARGE_POOL) != 0); + bool largealloc = (large_pool && npages > iommu_large_alloc); + unsigned long shift; + unsigned long align_mask = 0; + + if (align_order > 0) + align_mask = 0xffffffffffffffffl >> (64 - align_order); + + /* Sanity check */ + if (unlikely(npages == 0)) { + WARN_ON_ONCE(1); + return DMA_ERROR_CODE; + } + + if (largealloc) { + pool = &(iommu->large_pool); + pool_nr = 0; /* to keep compiler happy */ + } else { + /* pick out pool_nr */ + pool_nr = pool_hash & (npools - 1); + pool = &(iommu->pools[pool_nr]); + } + spin_lock_irqsave(&pool->lock, flags); + + again: + if (pass == 0 && handle && *handle && + (*handle >= pool->start) && (*handle < pool->end)) + start = *handle; + else + start = pool->hint; + + limit = pool->end; + + /* The case below can happen if we have a 
small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the beginning. If a + * flush is needed, it will get done based on the return value + * from iommu_area_alloc() below. + */ + if (start >= limit) + start = pool->start; + shift = iommu->table_map_base >> iommu->table_shift; + if (limit + shift > mask) { + limit = mask - shift + 1; + /* If we're constrained on address range, first try + * at the masked hint to avoid O(n) search complexity, + * but on second pass, start at 0 in pool 0. + */ + if ((start & mask) >= limit || pass > 0) { + spin_unlock(&(pool->lock)); + pool = &(iommu->pools[0]); + spin_lock(&(pool->lock)); + start = pool->start; + } else { + start &= mask; + } + } + + if (dev) + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + 1 << iommu->table_shift); + else + boundary_size = ALIGN(1UL << 32, 1 << iommu->table_shift); + + boundary_size = boundary_size >> iommu->table_shift; + /* + * if the skip_span_boundary_check had been set during init, we set + * things up so that iommu_is_span_boundary() merely checks if the + * (index + npages) < num_tsb_entries + */ + if ((iommu->flags & IOMMU_NO_SPAN_BOUND) != 0) { + shift = 0; + boundary_size = iommu->poolsize * iommu->nr_pools; + } + n = iommu_area_alloc(iommu->map, limit, start, npages, shift, + boundary_size, align_mask); + if (n == -1) { + if (likely(pass == 0)) { + /* First failure, rescan from the beginning. */ + pool->hint = pool->start; + set_flush(iommu); + pass++; + goto again; + } else if (!largealloc && pass <= iommu->nr_pools) { + spin_unlock(&(pool->lock)); + pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); + pool = &(iommu->pools[pool_nr]); + spin_lock(&(pool->lock)); + pool->hint = pool->start; + set_flush(iommu); + pass++; + goto again; + } else { + /* give up */ + n = DMA_ERROR_CODE; + goto bail; + } + } + if (n < pool->hint || need_flush(iommu)) { + clear_flush(iommu); + iommu->lazy_flush(iommu); + } + + end = n + npages; + pool->hint = end; + + /* Update handle for SG allocations */ + if (handle) + *handle = end; +bail: + spin_unlock_irqrestore(&(pool->lock), flags); + + return n; +} +EXPORT_SYMBOL(iommu_tbl_range_alloc); + +static struct iommu_pool *get_pool(struct iommu_map_table *tbl, + unsigned long entry) +{ + struct iommu_pool *p; + unsigned long largepool_start = tbl->large_pool.start; + bool large_pool = ((tbl->flags & IOMMU_HAS_LARGE_POOL) != 0); + + /* The large pool is the last pool at the top of the table */ + if (large_pool && entry >= largepool_start) { + p = &tbl->large_pool; + } else { + unsigned int pool_nr = entry / tbl->poolsize; + + BUG_ON(pool_nr >= tbl->nr_pools); + p = &tbl->pools[pool_nr]; + } + return p; +} + +/* Caller supplies the index of the entry into the iommu map table + * itself when the mapping from dma_addr to the entry is not the + * default addr->entry mapping below. 
+ */ +void iommu_tbl_range_free(struct iommu_map_table *iommu, u64 dma_addr, + unsigned long npages, unsigned long entry) +{ + struct iommu_pool *pool; + unsigned long flags; + unsigned long shift = iommu->table_shift; + + if (entry == DMA_ERROR_CODE) /* use default addr->entry mapping */ + entry = (dma_addr - iommu->table_map_base) >> shift; + pool = get_pool(iommu, entry); + + spin_lock_irqsave(&(pool->lock), flags); + bitmap_clear(iommu->map, entry, npages); + spin_unlock_irqrestore(&(pool->lock), flags); +} +EXPORT_SYMBOL(iommu_tbl_range_free); -- cgit v1.2.3 From 2f0c0fdc085c0d415457a1c52344f72e12c4cec6 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sat, 18 Apr 2015 12:33:55 -0700 Subject: iommu-common: Fix PARISC compile-time warnings Fixes warnings due to - no DMA_ERROR_CODE on PARISC, - sizeof (unsigned long) == 4 bytes on PARISC. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- lib/iommu-common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/iommu-common.c b/lib/iommu-common.c index b99f1d744a8d..a1a517cba7ec 100644 --- a/lib/iommu-common.c +++ b/lib/iommu-common.c @@ -11,6 +11,10 @@ #include #include +#ifndef DMA_ERROR_CODE +#define DMA_ERROR_CODE (~(dma_addr_t)0x0) +#endif + unsigned long iommu_large_alloc = 15; static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); @@ -171,7 +175,7 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 1 << iommu->table_shift); else - boundary_size = ALIGN(1UL << 32, 1 << iommu->table_shift); + boundary_size = ALIGN(1ULL << 32, 1 << iommu->table_shift); boundary_size = boundary_size >> iommu->table_shift; /* -- cgit v1.2.3 From e4afa120c98252e44390067c3a6cc775cde30659 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 18 Apr 2015 11:18:27 +0930 Subject: cpumask: remove __first_cpu / __next_cpu They were for use by the deprecated first_cpu() and next_cpu() wrappers, but sparc used them directly. They're now replaced by cpumask_first / cpumask_next. And __next_cpu_nr is completely obsolete. Signed-off-by: Rusty Russell Acked-by: David S. 
Miller --- arch/sparc/kernel/time_32.c | 4 ++-- lib/cpumask.c | 21 --------------------- 2 files changed, 2 insertions(+), 23 deletions(-) (limited to 'lib') diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 2f80d23a0a44..73825f431b0c 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -198,7 +198,7 @@ static __init int setup_timer_cs(void) static void percpu_ce_setup(enum clock_event_mode mode, struct clock_event_device *evt) { - int cpu = __first_cpu(evt->cpumask); + int cpu = cpumask_first(evt->cpumask); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -218,7 +218,7 @@ static void percpu_ce_setup(enum clock_event_mode mode, static int percpu_ce_set_next_event(unsigned long delta, struct clock_event_device *evt) { - int cpu = __first_cpu(evt->cpumask); + int cpu = cpumask_first(evt->cpumask); unsigned int next = (unsigned int)delta; sparc_config.load_profile_irq(cpu, next); diff --git a/lib/cpumask.c b/lib/cpumask.c index ba379d12bb57..75379b759d3f 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -5,27 +5,6 @@ #include #include -int __first_cpu(const cpumask_t *srcp) -{ - return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS)); -} -EXPORT_SYMBOL(__first_cpu); - -int __next_cpu(int n, const cpumask_t *srcp) -{ - return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1)); -} -EXPORT_SYMBOL(__next_cpu); - -#if NR_CPUS > 64 -int __next_cpu_nr(int n, const cpumask_t *srcp) -{ - return min_t(int, nr_cpu_ids, - find_next_bit(srcp->bits, nr_cpu_ids, n+1)); -} -EXPORT_SYMBOL(__next_cpu_nr); -#endif - /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (ie. return will be > @n) -- cgit v1.2.3 From 17974c054db3030b714b7108566bf5208d965a19 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Apr 2015 13:48:40 -0700 Subject: hexdump: avoid warning in test function The test_data_1_le[] array is a const array of const char *. To avoid dropping any const information, we need to use "const char * const *", not just "const char **". I'm not sure why the different test arrays end up having different const'ness, but let's make the pointer we use to traverse them as const as possible, since we modify neither the array of pointers _or_ the pointers we find in the array. Signed-off-by: Linus Torvalds --- lib/test-hexdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c index 9846ff7428b3..c227cc43ec0a 100644 --- a/lib/test-hexdump.c +++ b/lib/test-hexdump.c @@ -48,7 +48,7 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, char test[32 * 3 + 2 + 32 + 1]; char real[32 * 3 + 2 + 32 + 1]; char *p; - const char **result; + const char * const *result; size_t l = len; int gs = groupsize, rs = rowsize; unsigned int i; -- cgit v1.2.3 From b0cc836d306c12462a60e72aae8f6d2318f10817 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sun, 19 Apr 2015 13:13:30 -0400 Subject: iommu-common: fix x86_64 compiler warnings Declare iommu_large_alloc as static. Remove extern definition for iommu_tbl_pool_init(). Signed-off-by: Sowmini Varadhan Tested-by: Guenter Roeck Reviewed-by: Guenter Roeck Signed-off-by: David S. 
Miller --- lib/iommu-common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/iommu-common.c b/lib/iommu-common.c index a1a517cba7ec..a9a53f566237 100644 --- a/lib/iommu-common.c +++ b/lib/iommu-common.c @@ -15,7 +15,7 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0x0) #endif -unsigned long iommu_large_alloc = 15; +static unsigned long iommu_large_alloc = 15; static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); @@ -53,12 +53,12 @@ static void setup_iommu_pool_hash(void) * the top 1/4 of the table will be set aside for pool allocations * of more than iommu_large_alloc pages. */ -extern void iommu_tbl_pool_init(struct iommu_map_table *iommu, - unsigned long num_entries, - u32 table_shift, - void (*lazy_flush)(struct iommu_map_table *), - bool large_pool, u32 npools, - bool skip_span_boundary_check) +void iommu_tbl_pool_init(struct iommu_map_table *iommu, + unsigned long num_entries, + u32 table_shift, + void (*lazy_flush)(struct iommu_map_table *), + bool large_pool, u32 npools, + bool skip_span_boundary_check) { unsigned int start, i; struct iommu_pool *p = &(iommu->large_pool); -- cgit v1.2.3 From 7b3372d4c2bced80598771aab8fea87c40ebb52a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sun, 19 Apr 2015 13:13:31 -0400 Subject: iommu-common: rename iommu_pool_hash to iommu_hash_common When CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set, the DEFINE_PER_CPU_SECTION macro will define an extern __pcpu_unique_##name variable that could conflict with the same definition in powerpc at this time. Avoid that conflict by renaming iommu_pool_hash in iommu-common.c. Thanks to Guenter Roeck for catching this, and helping to test the fix. Signed-off-by: Sowmini Varadhan Tested-by: Guenter Roeck Reviewed-by: Guenter Roeck Signed-off-by: David S. Miller --- lib/iommu-common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/iommu-common.c b/lib/iommu-common.c index a9a53f566237..df30632f0bef 100644 --- a/lib/iommu-common.c +++ b/lib/iommu-common.c @@ -17,7 +17,7 @@ static unsigned long iommu_large_alloc = 15; -static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); +static DEFINE_PER_CPU(unsigned int, iommu_hash_common); static inline bool need_flush(struct iommu_map_table *iommu) { @@ -44,7 +44,7 @@ static void setup_iommu_pool_hash(void) return; do_once = true; for_each_possible_cpu(i) - per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); + per_cpu(iommu_hash_common, i) = hash_32(i, IOMMU_POOL_HASHBITS); } /* @@ -106,7 +106,7 @@ unsigned long iommu_tbl_range_alloc(struct device *dev, unsigned long mask, unsigned int align_order) { - unsigned int pool_hash = __this_cpu_read(iommu_pool_hash); + unsigned int pool_hash = __this_cpu_read(iommu_hash_common); unsigned long n, end, start, limit, boundary_size; struct iommu_pool *pool; int pass = 0; -- cgit v1.2.3 From fe5cbc6e06c7d8b3a86f6f5491d74766bb5c2827 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:04 +1100 Subject: md/raid6 algorithms: delta syndrome functions v3: s-o-b comment, explanation of performance and decision for the start/stop implementation Implementing rmw functionality for RAID6 requires optimized syndrome calculation. Up to now we can only generate a complete syndrome. The target P/Q pages are always overwritten. With this patch we provide a framework for in-place P/Q modification. In the first place simply fill those functions with NULL values. xor_syndrome() has two additional parameters: start & stop.
These will indicate the first and last page that are changing during an rmw run. That makes it possible to avoid several unnecessary loops and speed up calculation. The caller needs to implement the following logic to make the functions work. 1) xor_syndrome(disks, start, stop, ...): "Remove" all data of source blocks inside P/Q between (and including) start and end. 2) modify any block with start <= block <= stop. 3) xor_syndrome(disks, start, stop, ...): "Reinsert" all data of source blocks into P/Q between (and including) start and end. Pages between start and stop that won't be changed should be filled with a pointer to the kernel zero page. The reasons for not taking NULL pages are: 1) Algorithms cross the whole source data line by line. Thus we avoid additional branches. 2) Having a NULL page avoids calculating the XOR P parity but still needs calculation steps for the Q parity. Depending on the algorithm unrolling that might be only a difference of 2 instructions per loop. The benchmark numbers of the gen_syndrome() functions are displayed in the kernel log. Do the same for the xor_syndrome() functions. This will help to analyze performance problems and give a rough estimate of how well the algorithm works. The choice of the fastest algorithm will still depend on the gen_syndrome() performance. With the start/stop page implementation the speed can vary a lot in real life. E.g. a change of page 0 & page 15 on a stripe will be harder to compute than the case where page 0 & page 1 are XOR candidates. To not be too enthusiastic about the expected speeds, we will run a worst-case test that simulates a change on the upper half of the stripe. So we do: 1) calculation of P/Q for the upper pages, and 2) continuation of Q for the lower (empty) pages. Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- include/linux/raid/pq.h | 1 + lib/raid6/algos.c | 41 ++++++++++++++++++++++++++++++++++------- lib/raid6/altivec.uc | 1 + lib/raid6/avx2.c | 3 +++ lib/raid6/int.uc | 3 ++- lib/raid6/mmx.c | 2 ++ lib/raid6/neon.c | 1 + lib/raid6/sse1.c | 2 ++ lib/raid6/sse2.c | 3 +++ lib/raid6/tilegx.uc | 1 + 10 files changed, 50 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 73069cb6c54a..a7a06d1dcf9c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -72,6 +72,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; /* Routine choices */ struct raid6_calls { void (*gen_syndrome)(int, size_t, void **); + void (*xor_syndrome)(int, int, int, size_t, void **); int (*valid)(void); /* Returns 1 if this routine set is usable */ const char *name; /* Name of this routine set */ int prefer; /* Has special performance attribute */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index dbef2314901e..975c6e0434bd 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -131,11 +131,12 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) static inline const struct raid6_calls *raid6_choose_gen( void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) { - unsigned long perf, bestperf, j0, j1; + unsigned long perf, bestgenperf, bestxorperf, j0, j1; + int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ const struct raid6_calls *const *algo; const struct raid6_calls *best; - for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { if (!best || (*algo)->prefer >= best->prefer) { if
((*algo)->valid && !(*algo)->valid()) continue; @@ -153,19 +154,45 @@ static inline const struct raid6_calls *raid6_choose_gen( } preempt_enable(); - if (perf > bestperf) { - bestperf = perf; + if (perf > bestgenperf) { + bestgenperf = perf; best = *algo; } - pr_info("raid6: %-8s %5ld MB/s\n", (*algo)->name, + pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + + if (!(*algo)->xor_syndrome) + continue; + + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + if (best == *algo) + bestxorperf = perf; + + pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); } } if (best) { - pr_info("raid6: using algorithm %s (%ld MB/s)\n", + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", best->name, - (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + if (best->xor_syndrome) + pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", + (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); raid6_call = *best; } else pr_err("raid6: Yikes! No algorithm found!\n"); diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc index 7cc12b532e95..bec27fce7501 100644 --- a/lib/raid6/altivec.uc +++ b/lib/raid6/altivec.uc @@ -119,6 +119,7 @@ int raid6_have_altivec(void) const struct raid6_calls raid6_altivec$# = { raid6_altivec$#_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_altivec, "altivecx$#", 0 diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index bc3b1dd436eb..76734004358d 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -89,6 +89,7 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x1 = { raid6_avx21_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x1", 1 /* Has cache hints */ @@ -150,6 +151,7 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x2 = { raid6_avx22_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x2", 1 /* Has cache hints */ @@ -242,6 +244,7 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_avx2x4 = { raid6_avx24_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_avx2, "avx2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 5b50f8dfc5d2..5ca60bee1388 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -109,7 +109,8 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_intx$# = { raid6_int$#_gen_syndrome, - NULL, /* always valid */ + NULL, /* XOR not yet implemented */ + NULL, /* always valid */ "int" NSTRING "x$#", 0 }; diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c index 590c71c9e200..b3b0e1fcd3af 100644 --- a/lib/raid6/mmx.c +++ b/lib/raid6/mmx.c @@ -76,6 +76,7 @@ static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx1 = { raid6_mmx1_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx1", 0 @@ -134,6 +135,7 @@ static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_mmxx2 = { raid6_mmx2_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_mmx, "mmxx2", 0 diff --git 
a/lib/raid6/neon.c b/lib/raid6/neon.c index 36ad4705df1a..d9ad6ee284f4 100644 --- a/lib/raid6/neon.c +++ b/lib/raid6/neon.c @@ -42,6 +42,7 @@ } \ struct raid6_calls const raid6_neonx ## _n = { \ raid6_neon ## _n ## _gen_syndrome, \ + NULL, /* XOR not yet implemented */ \ raid6_have_neon, \ "neonx" #_n, \ 0 \ diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c index f76297139445..9025b8ca9aa3 100644 --- a/lib/raid6/sse1.c +++ b/lib/raid6/sse1.c @@ -92,6 +92,7 @@ static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x1 = { raid6_sse11_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x1", 1 /* Has cache hints */ @@ -154,6 +155,7 @@ static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse1x2 = { raid6_sse12_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse1_or_mmxext, "sse1x2", 1 /* Has cache hints */ diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 85b82c85f28e..31acd59a0ef7 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -90,6 +90,7 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x1 = { raid6_sse21_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, "sse2x1", 1 /* Has cache hints */ @@ -152,6 +153,7 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, "sse2x2", 1 /* Has cache hints */ @@ -250,6 +252,7 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_sse2x4 = { raid6_sse24_gen_syndrome, + NULL, /* XOR not yet implemented */ raid6_have_sse2, "sse2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc index e7c29459cbcd..2dd291a11264 100644 --- a/lib/raid6/tilegx.uc +++ b/lib/raid6/tilegx.uc @@ -80,6 +80,7 @@ void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) const struct raid6_calls raid6_tilegx$# = { raid6_tilegx$#_gen_syndrome, + NULL, /* XOR not yet implemented */ NULL, "tilegx$#", 0 -- cgit v1.2.3 From 7e92e1d7629b00578cef22b1f4c6ada726663701 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:04 +1100 Subject: md/raid6 algorithms: improve test program It is always helpful to have a test tool in place if we implement new data critical algorithms. So add some test routines to the raid6 checker that can prove if the new xor_syndrome() works as expected. Run through all permutations of start/stop pages per algorithm and simulate a xor_syndrome() assisted rmw run. After each rmw check if the recovery algorithm still confirms that the stripe is fine. 
Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- lib/raid6/test/test.c | 51 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 5a485b7a7d3c..3bebbabdb510 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -28,11 +28,11 @@ char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE]; char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; -static void makedata(void) +static void makedata(int start, int stop) { int i, j; - for (i = 0; i < NDISKS; i++) { + for (i = start; i <= stop; i++) { for (j = 0; j < PAGE_SIZE; j++) data[i][j] = rand(); @@ -91,34 +91,55 @@ int main(int argc, char *argv[]) { const struct raid6_calls *const *algo; const struct raid6_recov_calls *const *ra; - int i, j; + int i, j, p1, p2; int err = 0; - makedata(); + makedata(0, NDISKS-1); for (ra = raid6_recov_algos; *ra; ra++) { if ((*ra)->valid && !(*ra)->valid()) continue; + raid6_2data_recov = (*ra)->data2; raid6_datap_recov = (*ra)->datap; printf("using recovery %s\n", (*ra)->name); for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; + if ((*algo)->valid && !(*algo)->valid()) + continue; + + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + + if (!raid6_call.xor_syndrome) + continue; + + for (p1 = 0; p1 < NDISKS-2; p1++) + for (p2 = p1; p2 < NDISKS-2; p2++) { - /* Nuke syndromes */ - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + /* Simulate rmw run */ + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + makedata(p1, p2); + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); - /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, - (void **)&dataptrs); + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } - for (i = 0; i < NDISKS-1; i++) - for (j = i+1; j < NDISKS; j++) - err += test_disks(i, j); - } } printf("\n"); } -- cgit v1.2.3 From 9a5ce91d053961b7cc8fa56bd083819a9fc92734 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:04 +1100 Subject: md/raid6 algorithms: xor_syndrome() for generic int Start the algorithms with the very basic one. It is left and right optimized. That means we can avoid all calculations for unneeded pages above the right stop offset. For pages below the left start offset we still need the syndrome multiplication but without reading data pages. 
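A sketch of the identity behind this left/right optimization, using generic RAID-6 notation rather than anything taken from the patch itself: with data blocks D_z and the GF(2^8) generator g,

  P = D_0 ^ D_1 ^ ... ^ D_(n-1)
  Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^(n-1)*D_(n-1)

  delta_P = D_start ^ ... ^ D_stop
  delta_Q = g^start*D_start ^ ... ^ g^stop*D_stop
          = g^start * (D_start ^ g*D_(start+1) ^ ... ^ g^(stop-start)*D_stop)

and the routine finishes with P ^= delta_P and Q ^= delta_Q. Blocks above stop never enter the sums (the right side optimization), while the common factor g^start is applied by repeated doubling after the data loop, so blocks below start cost only multiplications by g and no memory reads (the left side optimization).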
Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- lib/raid6/int.uc | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 5ca60bee1388..558aeac9342a 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -107,9 +107,47 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) } } +static void raid6_int$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + /* P/Q data pages */ + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + wq$$ = w1$$ ^ w2$$; + } + *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + } + +} + const struct raid6_calls raid6_intx$# = { raid6_int$#_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_int$#_xor_syndrome, NULL, /* always valid */ "int" NSTRING "x$#", 0 -- cgit v1.2.3 From a582564b24bec0443b5c5ff43ee6d1258f8bd658 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:05 +1100 Subject: md/raid6 algorithms: xor_syndrome() for SSE2 The second and (last) optimized XOR syndrome calculation. This version supports right and left side optimization. All CPUs with architecture older than Haswell will benefit from it. It should be noted that SSE2 movntdq kills performance for memory areas that are read and written simultaneously in chunks smaller than cache line size. So use movdqa instead for P/Q writes in sse21 and sse22 XOR functions. 
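The cached versus non-temporal store trade-off mentioned above can also be expressed with the SSE2 intrinsics; the following stand-alone user-space sketch is illustrative only and not part of the patch (the kernel code in this series uses inline asm directly). _mm_store_si128() compiles to movdqa, _mm_stream_si128() to movntdq.

/*
 * Illustrative only, not from the patch: cached vs. non-temporal
 * 16-byte stores.  For a small area that is read and written again
 * soon (the rmw case) the cached store is preferable; for large
 * write-only streams the non-temporal store avoids cache pollution.
 */
#include <emmintrin.h>
#include <stdint.h>

static uint8_t buf[64] __attribute__((aligned(16)));

int main(void)
{
	__m128i v = _mm_set1_epi8(0x1d);

	/* cached store (movdqa): data stays in the cache for the next read */
	_mm_store_si128((__m128i *)&buf[0], v);

	/* non-temporal store (movntdq): bypasses the cache hierarchy */
	_mm_stream_si128((__m128i *)&buf[16], v);

	/* order the non-temporal store, like the sfence in the asm below */
	_mm_sfence();
	return 0;
}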
Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- lib/raid6/sse2.c | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 227 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c index 31acd59a0ef7..1d2276b007ee 100644 --- a/lib/raid6/sse2.c +++ b/lib/raid6/sse2.c @@ -88,9 +88,58 @@ static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } + +static void raid6_sse21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("pxor %xmm4,%xmm2"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm5,%xmm4"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_sse2x1 = { raid6_sse21_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_sse21_xor_syndrome, raid6_have_sse2, "sse2x1", 1 /* Has cache hints */ @@ -151,9 +200,76 @@ static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } + static void raid6_sse22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("pxor 
%xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_sse22_xor_syndrome, raid6_have_sse2, "sse2x2", 1 /* Has cache hints */ @@ -250,9 +366,117 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } + static void raid6_sse24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 64 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); + asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); + asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + asm volatile("pxor %xmm12,%xmm10"); + asm volatile("pxor %xmm14,%xmm11"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm 
volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+32])); + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); + asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + + const struct raid6_calls raid6_sse2x4 = { raid6_sse24_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_sse24_xor_syndrome, raid6_have_sse2, "sse2x4", 1 /* Has cache hints */ -- cgit v1.2.3