From fd58e55fcf5568e51da2ed54d7acd049c3fdb184 Mon Sep 17 00:00:00 2001 From: Mark Maule Date: Mon, 10 Apr 2006 21:17:48 -0500 Subject: [PATCH] PCI: msi abstractions and support for altix Abstract portions of the MSI core for platforms that do not use standard APIC interrupt controllers. This is implemented through a new arch-specific msi setup routine, and a set of msi ops which can be set on a per platform basis. Signed-off-by: Mark Maule Signed-off-by: Greg Kroah-Hartman --- include/asm-ia64/machvec.h | 7 +++++++ include/asm-ia64/machvec_sn2.h | 7 +++++++ include/asm-ia64/msi.h | 12 ++++++++++++ 3 files changed, 26 insertions(+) (limited to 'include/asm-ia64') diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h index 0df72a134c8b..15b545a897a4 100644 --- a/include/asm-ia64/machvec.h +++ b/include/asm-ia64/machvec.h @@ -75,6 +75,7 @@ typedef unsigned char ia64_mv_readb_relaxed_t (const volatile void __iomem *); typedef unsigned short ia64_mv_readw_relaxed_t (const volatile void __iomem *); typedef unsigned int ia64_mv_readl_relaxed_t (const volatile void __iomem *); typedef unsigned long ia64_mv_readq_relaxed_t (const volatile void __iomem *); +typedef int ia64_mv_msi_init_t (void); static inline void machvec_noop (void) @@ -153,6 +154,7 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_readl_relaxed ia64_mv.readl_relaxed # define platform_readq_relaxed ia64_mv.readq_relaxed # define platform_migrate ia64_mv.migrate +# define platform_msi_init ia64_mv.msi_init # endif /* __attribute__((__aligned__(16))) is required to make size of the @@ -202,6 +204,7 @@ struct ia64_machine_vector { ia64_mv_readl_relaxed_t *readl_relaxed; ia64_mv_readq_relaxed_t *readq_relaxed; ia64_mv_migrate_t *migrate; + ia64_mv_msi_init_t *msi_init; } __attribute__((__aligned__(16))); /* align attrib? see above comment */ #define MACHVEC_INIT(name) \ @@ -247,6 +250,7 @@ struct ia64_machine_vector { platform_readl_relaxed, \ platform_readq_relaxed, \ platform_migrate, \ + platform_msi_init, \ } extern struct ia64_machine_vector ia64_mv; @@ -400,5 +404,8 @@ extern int ia64_pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size #ifndef platform_migrate # define platform_migrate machvec_noop_task #endif +#ifndef platform_msi_init +# define platform_msi_init ((ia64_mv_msi_init_t*)NULL) +#endif #endif /* _ASM_IA64_MACHVEC_H */ diff --git a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h index da1d43755afe..cf724dc79d8c 100644 --- a/include/asm-ia64/machvec_sn2.h +++ b/include/asm-ia64/machvec_sn2.h @@ -67,6 +67,8 @@ extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device; extern ia64_mv_dma_mapping_error sn_dma_mapping_error; extern ia64_mv_dma_supported sn_dma_supported; extern ia64_mv_migrate_t sn_migrate; +extern ia64_mv_msi_init_t sn_msi_init; + /* * This stuff has dual use! @@ -117,6 +119,11 @@ extern ia64_mv_migrate_t sn_migrate; #define platform_dma_mapping_error sn_dma_mapping_error #define platform_dma_supported sn_dma_supported #define platform_migrate sn_migrate +#ifdef CONFIG_PCI_MSI +#define platform_msi_init sn_msi_init +#else +#define platform_msi_init ((ia64_mv_msi_init_t*)NULL) +#endif #include diff --git a/include/asm-ia64/msi.h b/include/asm-ia64/msi.h index 97890f7762b3..bb92b0dbde2f 100644 --- a/include/asm-ia64/msi.h +++ b/include/asm-ia64/msi.h @@ -14,4 +14,16 @@ static inline void set_intr_gate (int nr, void *func) {} #define ack_APIC_irq ia64_eoi #define MSI_TARGET_CPU_SHIFT 4 +extern struct msi_ops msi_apic_ops; + +static inline int msi_arch_init(void) +{ + if (platform_msi_init) + return platform_msi_init(); + + /* default ops for most ia64 platforms */ + msi_register(&msi_apic_ops); + return 0; +} + #endif /* ASM_MSI_H */ -- cgit v1.2.3 From 10083072bfabc40bc47306e512c158c57cf55c2e Mon Sep 17 00:00:00 2001 From: Mark Maule Date: Fri, 14 Apr 2006 16:03:49 -0500 Subject: [PATCH] PCI: per-platform IA64_{FIRST,LAST}_DEVICE_VECTOR definitions Abstract IA64_FIRST_DEVICE_VECTOR/IA64_LAST_DEVICE_VECTOR since SN platforms use a subset of the IA64 range. Implement this by making the above macros global variables which the platform can override in it setup code. Also add a reserve_irq_vector() routine used by SN to mark a vector's as in-use when that weren't allocated through assign_irq_vector(). Signed-off-by: Mark Maule Signed-off-by: Greg Kroah-Hartman --- arch/ia64/kernel/irq_ia64.c | 19 ++++++++++++++++++- arch/ia64/sn/kernel/irq.c | 7 +++++++ drivers/pci/msi.c | 6 +++++- include/asm-ia64/hw_irq.h | 15 +++++++++++++-- 4 files changed, 43 insertions(+), 4 deletions(-) (limited to 'include/asm-ia64') diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 6c4d59fd0364..ef9a2b49307a 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -46,6 +46,10 @@ #define IRQ_DEBUG 0 +/* These can be overridden in platform_irq_init */ +int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR; +int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR; + /* default base addr of IPI table */ void __iomem *ipi_base_addr = ((void __iomem *) (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR)); @@ -60,7 +64,7 @@ __u8 isa_irq_to_vector_map[16] = { }; EXPORT_SYMBOL(isa_irq_to_vector_map); -static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; +static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_MAX_DEVICE_VECTORS)]; int assign_irq_vector (int irq) @@ -89,6 +93,19 @@ free_irq_vector (int vector) printk(KERN_WARNING "%s: double free!\n", __FUNCTION__); } +int +reserve_irq_vector (int vector) +{ + int pos; + + if (vector < IA64_FIRST_DEVICE_VECTOR || + vector > IA64_LAST_DEVICE_VECTOR) + return -EINVAL; + + pos = vector - IA64_FIRST_DEVICE_VECTOR; + return test_and_set_bit(pos, ia64_vector_mask); +} + #ifdef CONFIG_SMP # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) #else diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index c265e02f5036..db187f5cdae1 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -202,6 +202,9 @@ void sn_irq_init(void) int i; irq_desc_t *base_desc = irq_desc; + ia64_first_device_vector = IA64_SN2_FIRST_DEVICE_VECTOR; + ia64_last_device_vector = IA64_SN2_LAST_DEVICE_VECTOR; + for (i = 0; i < NR_IRQS; i++) { if (base_desc[i].handler == &no_irq_type) { base_desc[i].handler = &irq_type_sn; @@ -285,6 +288,7 @@ void sn_irq_fixup(struct pci_dev *pci_dev, struct sn_irq_info *sn_irq_info) /* link it into the sn_irq[irq] list */ spin_lock(&sn_irq_info_lock); list_add_rcu(&sn_irq_info->list, sn_irq_lh[sn_irq_info->irq_irq]); + reserve_irq_vector(sn_irq_info->irq_irq); spin_unlock(&sn_irq_info_lock); register_intr_pda(sn_irq_info); @@ -310,8 +314,11 @@ void sn_irq_unfixup(struct pci_dev *pci_dev) spin_lock(&sn_irq_info_lock); list_del_rcu(&sn_irq_info->list); spin_unlock(&sn_irq_info_lock); + if (list_empty(sn_irq_lh[sn_irq_info->irq_irq])) + free_irq_vector(sn_irq_info->irq_irq); call_rcu(&sn_irq_info->rcu, sn_irq_info_free); pci_dev_put(pci_dev); + } static inline void diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 55ff52df5fe7..f8105783da2f 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -35,7 +35,7 @@ static int nr_msix_devices; #ifndef CONFIG_X86_IO_APIC int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS]; #endif static struct msi_ops *msi_ops; @@ -383,6 +383,10 @@ static int msi_init(void) return status; } +#ifndef CONFIG_X86_IO_APIC + irq_vector[0] = FIRST_DEVICE_VECTOR; +#endif + if (last_alloc_vector < 0) { pci_msi_enable = 0; printk(KERN_WARNING "PCI: No interrupt vectors available for MSI\n"); diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h index 0cf119b42f7d..ea8b8c407ab4 100644 --- a/include/asm-ia64/hw_irq.h +++ b/include/asm-ia64/hw_irq.h @@ -47,9 +47,19 @@ typedef u8 ia64_vector; #define IA64_CMC_VECTOR 0x1f /* corrected machine-check interrupt vector */ /* * Vectors 0x20-0x2f are reserved for legacy ISA IRQs. + * Use vectors 0x30-0xe7 as the default device vector range for ia64. + * Platforms may choose to reduce this range in platform_irq_setup, but the + * platform range must fall within + * [IA64_DEF_FIRST_DEVICE_VECTOR..IA64_DEF_LAST_DEVICE_VECTOR] */ -#define IA64_FIRST_DEVICE_VECTOR 0x30 -#define IA64_LAST_DEVICE_VECTOR 0xe7 +extern int ia64_first_device_vector; +extern int ia64_last_device_vector; + +#define IA64_DEF_FIRST_DEVICE_VECTOR 0x30 +#define IA64_DEF_LAST_DEVICE_VECTOR 0xe7 +#define IA64_FIRST_DEVICE_VECTOR ia64_first_device_vector +#define IA64_LAST_DEVICE_VECTOR ia64_last_device_vector +#define IA64_MAX_DEVICE_VECTORS (IA64_DEF_LAST_DEVICE_VECTOR - IA64_DEF_FIRST_DEVICE_VECTOR + 1) #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1) #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */ @@ -83,6 +93,7 @@ extern struct hw_interrupt_type irq_type_ia64_lsapic; /* CPU-internal interrupt extern int assign_irq_vector (int irq); /* allocate a free vector */ extern void free_irq_vector (int vector); +extern int reserve_irq_vector (int vector); extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect); extern void register_percpu_irq (ia64_vector vec, struct irqaction *action); -- cgit v1.2.3 From 83821d3f558dc651e555d62182ed0c95651f41a6 Mon Sep 17 00:00:00 2001 From: Mark Maule Date: Fri, 14 Apr 2006 16:03:54 -0500 Subject: [PATCH] PCI: altix: msi support MSI callouts for altix. Involves a fair amount of code reorg in sn irq.c code as well as adding some extensions to the altix PCI provider abstaction. Signed-off-by: Mark Maule Signed-off-by: Greg Kroah-Hartman --- arch/ia64/sn/kernel/io_init.c | 9 +- arch/ia64/sn/kernel/irq.c | 135 ++++++++++--------- arch/ia64/sn/pci/pci_dma.c | 10 +- arch/ia64/sn/pci/pcibr/pcibr_dma.c | 62 ++++++--- arch/ia64/sn/pci/tioca_provider.c | 8 +- arch/ia64/sn/pci/tioce_provider.c | 65 ++++++---- drivers/pci/msi-altix.c | 200 ++++++++++++++++++++++++++++- include/asm-ia64/sn/intr.h | 8 ++ include/asm-ia64/sn/pcibr_provider.h | 5 +- include/asm-ia64/sn/pcibus_provider_defs.h | 17 ++- include/asm-ia64/sn/tiocp.h | 3 +- 11 files changed, 401 insertions(+), 121 deletions(-) (limited to 'include/asm-ia64') diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c index 5101ac462643..dc09a6a28a37 100644 --- a/arch/ia64/sn/kernel/io_init.c +++ b/arch/ia64/sn/kernel/io_init.c @@ -58,7 +58,7 @@ static int max_pcibus_number = 255; /* Default highest pci bus number */ */ static dma_addr_t -sn_default_pci_map(struct pci_dev *pdev, unsigned long paddr, size_t size) +sn_default_pci_map(struct pci_dev *pdev, unsigned long paddr, size_t size, int type) { return 0; } @@ -457,13 +457,6 @@ void sn_pci_fixup_slot(struct pci_dev *dev) pcidev_info->pdi_sn_irq_info = NULL; kfree(sn_irq_info); } - - /* - * MSI currently not supported on altix. Remove this when - * the MSI abstraction patches are integrated into the kernel - * (sometime after 2.6.16 releases) - */ - dev->no_msi = 1; } /* diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index db187f5cdae1..dc8e2b696713 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -26,11 +26,11 @@ static void unregister_intr_pda(struct sn_irq_info *sn_irq_info); int sn_force_interrupt_flag = 1; extern int sn_ioif_inited; -static struct list_head **sn_irq_lh; +struct list_head **sn_irq_lh; static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */ -static inline u64 sn_intr_alloc(nasid_t local_nasid, int local_widget, - u64 sn_irq_info, +u64 sn_intr_alloc(nasid_t local_nasid, int local_widget, + struct sn_irq_info *sn_irq_info, int req_irq, nasid_t req_nasid, int req_slice) { @@ -40,12 +40,13 @@ static inline u64 sn_intr_alloc(nasid_t local_nasid, int local_widget, SAL_CALL_NOLOCK(ret_stuff, (u64) SN_SAL_IOIF_INTERRUPT, (u64) SAL_INTR_ALLOC, (u64) local_nasid, - (u64) local_widget, (u64) sn_irq_info, (u64) req_irq, + (u64) local_widget, __pa(sn_irq_info), (u64) req_irq, (u64) req_nasid, (u64) req_slice); + return ret_stuff.status; } -static inline void sn_intr_free(nasid_t local_nasid, int local_widget, +void sn_intr_free(nasid_t local_nasid, int local_widget, struct sn_irq_info *sn_irq_info) { struct ia64_sal_retval ret_stuff; @@ -112,73 +113,91 @@ static void sn_end_irq(unsigned int irq) static void sn_irq_info_free(struct rcu_head *head); -static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask) +struct sn_irq_info *sn_retarget_vector(struct sn_irq_info *sn_irq_info, + nasid_t nasid, int slice) { - struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; - int cpuid, cpuphys; + int vector; + int cpuphys; + int64_t bridge; + int local_widget, status; + nasid_t local_nasid; + struct sn_irq_info *new_irq_info; + struct sn_pcibus_provider *pci_provider; - cpuid = first_cpu(mask); - cpuphys = cpu_physical_id(cpuid); + new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC); + if (new_irq_info == NULL) + return NULL; - list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, - sn_irq_lh[irq], list) { - u64 bridge; - int local_widget, status; - nasid_t local_nasid; - struct sn_irq_info *new_irq_info; - struct sn_pcibus_provider *pci_provider; - - new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC); - if (new_irq_info == NULL) - break; - memcpy(new_irq_info, sn_irq_info, sizeof(struct sn_irq_info)); - - bridge = (u64) new_irq_info->irq_bridge; - if (!bridge) { - kfree(new_irq_info); - break; /* irq is not a device interrupt */ - } + memcpy(new_irq_info, sn_irq_info, sizeof(struct sn_irq_info)); - local_nasid = NASID_GET(bridge); + bridge = (u64) new_irq_info->irq_bridge; + if (!bridge) { + kfree(new_irq_info); + return NULL; /* irq is not a device interrupt */ + } - if (local_nasid & 1) - local_widget = TIO_SWIN_WIDGETNUM(bridge); - else - local_widget = SWIN_WIDGETNUM(bridge); + local_nasid = NASID_GET(bridge); - /* Free the old PROM new_irq_info structure */ - sn_intr_free(local_nasid, local_widget, new_irq_info); - /* Update kernels new_irq_info with new target info */ - unregister_intr_pda(new_irq_info); + if (local_nasid & 1) + local_widget = TIO_SWIN_WIDGETNUM(bridge); + else + local_widget = SWIN_WIDGETNUM(bridge); - /* allocate a new PROM new_irq_info struct */ - status = sn_intr_alloc(local_nasid, local_widget, - __pa(new_irq_info), irq, - cpuid_to_nasid(cpuid), - cpuid_to_slice(cpuid)); + vector = sn_irq_info->irq_irq; + /* Free the old PROM new_irq_info structure */ + sn_intr_free(local_nasid, local_widget, new_irq_info); + /* Update kernels new_irq_info with new target info */ + unregister_intr_pda(new_irq_info); - /* SAL call failed */ - if (status) { - kfree(new_irq_info); - break; - } + /* allocate a new PROM new_irq_info struct */ + status = sn_intr_alloc(local_nasid, local_widget, + new_irq_info, vector, + nasid, slice); + + /* SAL call failed */ + if (status) { + kfree(new_irq_info); + return NULL; + } - new_irq_info->irq_cpuid = cpuid; - register_intr_pda(new_irq_info); + cpuphys = nasid_slice_to_cpuid(nasid, slice); + new_irq_info->irq_cpuid = cpuphys; + register_intr_pda(new_irq_info); - pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type]; - if (pci_provider && pci_provider->target_interrupt) - (pci_provider->target_interrupt)(new_irq_info); + pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type]; + + /* + * If this represents a line interrupt, target it. If it's + * an msi (irq_int_bit < 0), it's already targeted. + */ + if (new_irq_info->irq_int_bit >= 0 && + pci_provider && pci_provider->target_interrupt) + (pci_provider->target_interrupt)(new_irq_info); - spin_lock(&sn_irq_info_lock); - list_replace_rcu(&sn_irq_info->list, &new_irq_info->list); - spin_unlock(&sn_irq_info_lock); - call_rcu(&sn_irq_info->rcu, sn_irq_info_free); + spin_lock(&sn_irq_info_lock); + list_replace_rcu(&sn_irq_info->list, &new_irq_info->list); + spin_unlock(&sn_irq_info_lock); + call_rcu(&sn_irq_info->rcu, sn_irq_info_free); #ifdef CONFIG_SMP - set_irq_affinity_info((irq & 0xff), cpuphys, 0); + set_irq_affinity_info((vector & 0xff), cpuphys, 0); #endif - } + + return new_irq_info; +} + +static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; + nasid_t nasid; + int slice; + + nasid = cpuid_to_nasid(first_cpu(mask)); + slice = cpuid_to_slice(first_cpu(mask)); + + list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, + sn_irq_lh[irq], list) + (void)sn_retarget_vector(sn_irq_info, nasid, slice); } struct hw_interrupt_type irq_type_sn = { diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index b4b84c269210..7a291a271511 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include @@ -113,7 +113,8 @@ void *sn_dma_alloc_coherent(struct device *dev, size_t size, * resources. */ - *dma_handle = provider->dma_map_consistent(pdev, phys_addr, size); + *dma_handle = provider->dma_map_consistent(pdev, phys_addr, size, + SN_DMA_ADDR_PHYS); if (!*dma_handle) { printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__); free_pages((unsigned long)cpuaddr, get_order(size)); @@ -176,7 +177,7 @@ dma_addr_t sn_dma_map_single(struct device *dev, void *cpu_addr, size_t size, BUG_ON(dev->bus != &pci_bus_type); phys_addr = __pa(cpu_addr); - dma_addr = provider->dma_map(pdev, phys_addr, size); + dma_addr = provider->dma_map(pdev, phys_addr, size, SN_DMA_ADDR_PHYS); if (!dma_addr) { printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__); return 0; @@ -260,7 +261,8 @@ int sn_dma_map_sg(struct device *dev, struct scatterlist *sg, int nhwentries, for (i = 0; i < nhwentries; i++, sg++) { phys_addr = SG_ENT_PHYS_ADDRESS(sg); sg->dma_address = provider->dma_map(pdev, - phys_addr, sg->length); + phys_addr, sg->length, + SN_DMA_ADDR_PHYS); if (!sg->dma_address) { printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__); diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c index 9f86bb6519aa..a86c7b945962 100644 --- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c +++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c @@ -41,7 +41,7 @@ extern int sn_ioif_inited; static dma_addr_t pcibr_dmamap_ate32(struct pcidev_info *info, - u64 paddr, size_t req_size, u64 flags) + u64 paddr, size_t req_size, u64 flags, int dma_flags) { struct pcidev_info *pcidev_info = info->pdi_host_pcidev_info; @@ -81,9 +81,12 @@ pcibr_dmamap_ate32(struct pcidev_info *info, if (IS_PCIX(pcibus_info)) ate_flags &= ~(PCI32_ATE_PREF); - xio_addr = - IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : - PHYS_TO_TIODMA(paddr); + if (SN_DMA_ADDRTYPE(dma_flags == SN_DMA_ADDR_PHYS)) + xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : + PHYS_TO_TIODMA(paddr); + else + xio_addr = paddr; + offset = IOPGOFF(xio_addr); ate = ate_flags | (xio_addr - offset); @@ -91,6 +94,13 @@ pcibr_dmamap_ate32(struct pcidev_info *info, if (IS_PIC_SOFT(pcibus_info)) { ate |= (pcibus_info->pbi_hub_xid << PIC_ATE_TARGETID_SHFT); } + + /* + * If we're mapping for MSI, set the MSI bit in the ATE + */ + if (dma_flags & SN_DMA_MSI) + ate |= PCI32_ATE_MSI; + ate_write(pcibus_info, ate_index, ate_count, ate); /* @@ -105,20 +115,27 @@ pcibr_dmamap_ate32(struct pcidev_info *info, if (pcibus_info->pbi_devreg[internal_device] & PCIBR_DEV_SWAP_DIR) ATE_SWAP_ON(pci_addr); + return pci_addr; } static dma_addr_t pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr, - u64 dma_attributes) + u64 dma_attributes, int dma_flags) { struct pcibus_info *pcibus_info = (struct pcibus_info *) ((info->pdi_host_pcidev_info)->pdi_pcibus_info); u64 pci_addr; /* Translate to Crosstalk View of Physical Address */ - pci_addr = (IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : - PHYS_TO_TIODMA(paddr)) | dma_attributes; + if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS) + pci_addr = IS_PIC_SOFT(pcibus_info) ? + PHYS_TO_DMA(paddr) : + PHYS_TO_TIODMA(paddr) | dma_attributes; + else + pci_addr = IS_PIC_SOFT(pcibus_info) ? + paddr : + paddr | dma_attributes; /* Handle Bus mode */ if (IS_PCIX(pcibus_info)) @@ -130,7 +147,9 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr, ((u64) pcibus_info-> pbi_hub_xid << PIC_PCI64_ATTR_TARG_SHFT); } else - pci_addr |= TIOCP_PCI64_CMDTYPE_MEM; + pci_addr |= (dma_flags & SN_DMA_MSI) ? + TIOCP_PCI64_CMDTYPE_MSI : + TIOCP_PCI64_CMDTYPE_MEM; /* If PCI mode, func zero uses VCHAN0, every other func uses VCHAN1 */ if (!IS_PCIX(pcibus_info) && PCI_FUNC(info->pdi_linux_pcidev->devfn)) @@ -141,7 +160,7 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr, static dma_addr_t pcibr_dmatrans_direct32(struct pcidev_info * info, - u64 paddr, size_t req_size, u64 flags) + u64 paddr, size_t req_size, u64 flags, int dma_flags) { struct pcidev_info *pcidev_info = info->pdi_host_pcidev_info; struct pcibus_info *pcibus_info = (struct pcibus_info *)pcidev_info-> @@ -156,8 +175,14 @@ pcibr_dmatrans_direct32(struct pcidev_info * info, return 0; } - xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : - PHYS_TO_TIODMA(paddr); + if (dma_flags & SN_DMA_MSI) + return 0; + + if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS) + xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : + PHYS_TO_TIODMA(paddr); + else + xio_addr = paddr; xio_base = pcibus_info->pbi_dir_xbase; offset = xio_addr - xio_base; @@ -327,7 +352,7 @@ void sn_dma_flush(u64 addr) */ dma_addr_t -pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size) +pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size, int dma_flags) { dma_addr_t dma_handle; struct pcidev_info *pcidev_info = SN_PCIDEV_INFO(hwdev); @@ -344,11 +369,11 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size) */ dma_handle = pcibr_dmatrans_direct64(pcidev_info, phys_addr, - PCI64_ATTR_PREF); + PCI64_ATTR_PREF, dma_flags); } else { /* Handle 32-63 bit cards via direct mapping */ dma_handle = pcibr_dmatrans_direct32(pcidev_info, phys_addr, - size, 0); + size, 0, dma_flags); if (!dma_handle) { /* * It is a 32 bit card and we cannot do direct mapping, @@ -356,7 +381,8 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size) */ dma_handle = pcibr_dmamap_ate32(pcidev_info, phys_addr, - size, PCI32_ATE_PREF); + size, PCI32_ATE_PREF, + dma_flags); } } @@ -365,18 +391,18 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size) dma_addr_t pcibr_dma_map_consistent(struct pci_dev * hwdev, unsigned long phys_addr, - size_t size) + size_t size, int dma_flags) { dma_addr_t dma_handle; struct pcidev_info *pcidev_info = SN_PCIDEV_INFO(hwdev); if (hwdev->dev.coherent_dma_mask == ~0UL) { dma_handle = pcibr_dmatrans_direct64(pcidev_info, phys_addr, - PCI64_ATTR_BAR); + PCI64_ATTR_BAR, dma_flags); } else { dma_handle = (dma_addr_t) pcibr_dmamap_ate32(pcidev_info, phys_addr, size, - PCI32_ATE_BAR); + PCI32_ATE_BAR, dma_flags); } return dma_handle; diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c index be0176912968..20de72791b97 100644 --- a/arch/ia64/sn/pci/tioca_provider.c +++ b/arch/ia64/sn/pci/tioca_provider.c @@ -515,10 +515,16 @@ tioca_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) * use the GART mapped mode. */ static u64 -tioca_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count) +tioca_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) { u64 mapaddr; + /* + * Not supported for now ... + */ + if (dma_flags & SN_DMA_MSI) + return 0; + /* * If card is 64 or 48 bit addresable, use a direct mapping. 32 * bit direct is so restrictive w.r.t. where the memory resides that diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index 833295624e5d..4cac7bdc7c7f 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -170,7 +170,8 @@ tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr) (ATE_PAGE((start)+(len)-1, pagesize) - ATE_PAGE(start, pagesize) + 1) #define ATE_VALID(ate) ((ate) & (1UL << 63)) -#define ATE_MAKE(addr, ps) (((addr) & ~ATE_PAGEMASK(ps)) | (1UL << 63)) +#define ATE_MAKE(addr, ps, msi) \ + (((addr) & ~ATE_PAGEMASK(ps)) | (1UL << 63) | ((msi)?(1UL << 62):0)) /* * Flavors of ate-based mapping supported by tioce_alloc_map() @@ -196,15 +197,17 @@ tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr) * * 63 - must be 1 to indicate d64 mode to CE hardware * 62 - barrier bit ... controlled with tioce_dma_barrier() - * 61 - 0 since this is not an MSI transaction + * 61 - msi bit ... specified through dma_flags * 60:54 - reserved, MBZ */ static u64 -tioce_dma_d64(unsigned long ct_addr) +tioce_dma_d64(unsigned long ct_addr, int dma_flags) { u64 bus_addr; bus_addr = ct_addr | (1UL << 63); + if (dma_flags & SN_DMA_MSI) + bus_addr |= (1UL << 61); return bus_addr; } @@ -261,7 +264,7 @@ pcidev_to_tioce(struct pci_dev *pdev, struct tioce **base, */ static u64 tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, - u64 ct_addr, int len) + u64 ct_addr, int len, int dma_flags) { int i; int j; @@ -270,6 +273,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, int entries; int nates; u64 pagesize; + int msi_capable, msi_wanted; u64 *ate_shadow; u64 *ate_reg; u64 addr; @@ -291,6 +295,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate_reg = ce_mmr->ce_ure_ate3240; pagesize = ce_kern->ce_ate3240_pagesize; bus_base = TIOCE_M32_MIN; + msi_capable = 1; break; case TIOCE_ATE_M40: first = 0; @@ -299,6 +304,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate_reg = ce_mmr->ce_ure_ate40; pagesize = MB(64); bus_base = TIOCE_M40_MIN; + msi_capable = 0; break; case TIOCE_ATE_M40S: /* @@ -311,11 +317,16 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate_reg = ce_mmr->ce_ure_ate3240; pagesize = GB(16); bus_base = TIOCE_M40S_MIN; + msi_capable = 0; break; default: return 0; } + msi_wanted = dma_flags & SN_DMA_MSI; + if (msi_wanted && !msi_capable) + return 0; + nates = ATE_NPAGES(ct_addr, len, pagesize); if (nates > entries) return 0; @@ -344,7 +355,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, for (j = 0; j < nates; j++) { u64 ate; - ate = ATE_MAKE(addr, pagesize); + ate = ATE_MAKE(addr, pagesize, msi_wanted); ate_shadow[i + j] = ate; tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate); addr += pagesize; @@ -371,7 +382,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, * Map @paddr into 32-bit bus space of the CE associated with @pcidev_info. */ static u64 -tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr) +tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr, int dma_flags) { int dma_ok; int port; @@ -381,6 +392,9 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr) u64 ct_lower; dma_addr_t bus_addr; + if (dma_flags & SN_DMA_MSI) + return 0; + ct_upper = ct_addr & ~0x3fffffffUL; ct_lower = ct_addr & 0x3fffffffUL; @@ -507,7 +521,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) */ static u64 tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, - int barrier) + int barrier, int dma_flags) { unsigned long flags; u64 ct_addr; @@ -523,15 +537,18 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, if (dma_mask < 0x7fffffffUL) return 0; - ct_addr = PHYS_TO_TIODMA(paddr); + if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS) + ct_addr = PHYS_TO_TIODMA(paddr); + else + ct_addr = paddr; /* * If the device can generate 64 bit addresses, create a D64 map. - * Since this should never fail, bypass the rest of the checks. */ if (dma_mask == ~0UL) { - mapaddr = tioce_dma_d64(ct_addr); - goto dma_map_done; + mapaddr = tioce_dma_d64(ct_addr, dma_flags); + if (mapaddr) + goto dma_map_done; } pcidev_to_tioce(pdev, NULL, &ce_kern, &port); @@ -574,18 +591,22 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, if (byte_count > MB(64)) { mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40S, - port, ct_addr, byte_count); + port, ct_addr, byte_count, + dma_flags); if (!mapaddr) mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1, - ct_addr, byte_count); + ct_addr, byte_count, + dma_flags); } else { mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1, - ct_addr, byte_count); + ct_addr, byte_count, + dma_flags); if (!mapaddr) mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40S, - port, ct_addr, byte_count); + port, ct_addr, byte_count, + dma_flags); } } @@ -593,7 +614,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, * 32-bit direct is the next mode to try */ if (!mapaddr && dma_mask >= 0xffffffffUL) - mapaddr = tioce_dma_d32(pdev, ct_addr); + mapaddr = tioce_dma_d32(pdev, ct_addr, dma_flags); /* * Last resort, try 32-bit ATE-based map. @@ -601,7 +622,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, if (!mapaddr) mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M32, -1, ct_addr, - byte_count); + byte_count, dma_flags); spin_unlock_irqrestore(&ce_kern->ce_lock, flags); @@ -622,9 +643,9 @@ dma_map_done: * in the address. */ static u64 -tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count) +tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) { - return tioce_do_dma_map(pdev, paddr, byte_count, 0); + return tioce_do_dma_map(pdev, paddr, byte_count, 0, dma_flags); } /** @@ -636,9 +657,9 @@ tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count) * Simply call tioce_do_dma_map() to create a map with the barrier bit set * in the address. */ static u64 -tioce_dma_consistent(struct pci_dev *pdev, u64 paddr, size_t byte_count) +tioce_dma_consistent(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags) { - return tioce_do_dma_map(pdev, paddr, byte_count, 1); + return tioce_do_dma_map(pdev, paddr, byte_count, 1, dma_flags); } /** @@ -696,7 +717,7 @@ tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit) while (ate_index <= last_ate) { u64 ate; - ate = ATE_MAKE(0xdeadbeef, ps); + ate = ATE_MAKE(0xdeadbeef, ps, 0); ce_kern->ce_ate3240_shadow[ate_index] = ate; tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index], ate); diff --git a/drivers/pci/msi-altix.c b/drivers/pci/msi-altix.c index 9bd240602c1e..bed4183a5e39 100644 --- a/drivers/pci/msi-altix.c +++ b/drivers/pci/msi-altix.c @@ -6,13 +6,205 @@ * Copyright (C) 2006 Silicon Graphics, Inc. All Rights Reserved. */ -#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "msi.h" + +struct sn_msi_info { + u64 pci_addr; + struct sn_irq_info *sn_irq_info; +}; + +static struct sn_msi_info *sn_msi_info; + +static void +sn_msi_teardown(unsigned int vector) +{ + nasid_t nasid; + int widget; + struct pci_dev *pdev; + struct pcidev_info *sn_pdev; + struct sn_irq_info *sn_irq_info; + struct pcibus_bussoft *bussoft; + struct sn_pcibus_provider *provider; + + sn_irq_info = sn_msi_info[vector].sn_irq_info; + if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) + return; + + sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo; + pdev = sn_pdev->pdi_linux_pcidev; + provider = SN_PCIDEV_BUSPROVIDER(pdev); + + (*provider->dma_unmap)(pdev, + sn_msi_info[vector].pci_addr, + PCI_DMA_FROMDEVICE); + sn_msi_info[vector].pci_addr = 0; + + bussoft = SN_PCIDEV_BUSSOFT(pdev); + nasid = NASID_GET(bussoft->bs_base); + widget = (nasid & 1) ? + TIO_SWIN_WIDGETNUM(bussoft->bs_base) : + SWIN_WIDGETNUM(bussoft->bs_base); + + sn_intr_free(nasid, widget, sn_irq_info); + sn_msi_info[vector].sn_irq_info = NULL; + + return; +} int -sn_msi_init(void) +sn_msi_setup(struct pci_dev *pdev, unsigned int vector, + u32 *addr_hi, u32 *addr_lo, u32 *data) { + int widget; + int status; + nasid_t nasid; + u64 bus_addr; + struct sn_irq_info *sn_irq_info; + struct pcibus_bussoft *bussoft = SN_PCIDEV_BUSSOFT(pdev); + struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); + + if (bussoft == NULL) + return -EINVAL; + + if (provider == NULL || provider->dma_map_consistent == NULL) + return -EINVAL; + + /* + * Set up the vector plumbing. Let the prom (via sn_intr_alloc) + * decide which cpu to direct this msi at by default. + */ + + nasid = NASID_GET(bussoft->bs_base); + widget = (nasid & 1) ? + TIO_SWIN_WIDGETNUM(bussoft->bs_base) : + SWIN_WIDGETNUM(bussoft->bs_base); + + sn_irq_info = kzalloc(sizeof(struct sn_irq_info), GFP_KERNEL); + if (! sn_irq_info) + return -ENOMEM; + + status = sn_intr_alloc(nasid, widget, sn_irq_info, vector, -1, -1); + if (status) { + kfree(sn_irq_info); + return -ENOMEM; + } + + sn_irq_info->irq_int_bit = -1; /* mark this as an MSI irq */ + sn_irq_fixup(pdev, sn_irq_info); + + /* Prom probably should fill these in, but doesn't ... */ + sn_irq_info->irq_bridge_type = bussoft->bs_asic_type; + sn_irq_info->irq_bridge = (void *)bussoft->bs_base; + /* - * return error until MSI is supported on altix platforms + * Map the xio address into bus space */ - return -EINVAL; + bus_addr = (*provider->dma_map_consistent)(pdev, + sn_irq_info->irq_xtalkaddr, + sizeof(sn_irq_info->irq_xtalkaddr), + SN_DMA_MSI|SN_DMA_ADDR_XIO); + if (! bus_addr) { + sn_intr_free(nasid, widget, sn_irq_info); + kfree(sn_irq_info); + return -ENOMEM; + } + + sn_msi_info[vector].sn_irq_info = sn_irq_info; + sn_msi_info[vector].pci_addr = bus_addr; + + *addr_hi = (u32)(bus_addr >> 32); + *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + + /* + * In the SN platform, bit 16 is a "send vector" bit which + * must be present in order to move the vector through the system. + */ + *data = 0x100 + (unsigned int)vector; + +#ifdef CONFIG_SMP + set_irq_affinity_info((vector & 0xff), sn_irq_info->irq_cpuid, 0); +#endif + + return 0; +} + +static void +sn_msi_target(unsigned int vector, unsigned int cpu, + u32 *addr_hi, u32 *addr_lo) +{ + int slice; + nasid_t nasid; + u64 bus_addr; + struct pci_dev *pdev; + struct pcidev_info *sn_pdev; + struct sn_irq_info *sn_irq_info; + struct sn_irq_info *new_irq_info; + struct sn_pcibus_provider *provider; + + sn_irq_info = sn_msi_info[vector].sn_irq_info; + if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) + return; + + /* + * Release XIO resources for the old MSI PCI address + */ + + sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo; + pdev = sn_pdev->pdi_linux_pcidev; + provider = SN_PCIDEV_BUSPROVIDER(pdev); + + bus_addr = (u64)(*addr_hi) << 32 | (u64)(*addr_lo); + (*provider->dma_unmap)(pdev, bus_addr, PCI_DMA_FROMDEVICE); + sn_msi_info[vector].pci_addr = 0; + + nasid = cpuid_to_nasid(cpu); + slice = cpuid_to_slice(cpu); + + new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); + sn_msi_info[vector].sn_irq_info = new_irq_info; + if (new_irq_info == NULL) + return; + + /* + * Map the xio address into bus space + */ + + bus_addr = (*provider->dma_map_consistent)(pdev, + new_irq_info->irq_xtalkaddr, + sizeof(new_irq_info->irq_xtalkaddr), + SN_DMA_MSI|SN_DMA_ADDR_XIO); + + sn_msi_info[vector].pci_addr = bus_addr; + *addr_hi = (u32)(bus_addr >> 32); + *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); +} + +struct msi_ops sn_msi_ops = { + .setup = sn_msi_setup, + .teardown = sn_msi_teardown, +#ifdef CONFIG_SMP + .target = sn_msi_target, +#endif +}; + +int +sn_msi_init(void) +{ + sn_msi_info = + kzalloc(sizeof(struct sn_msi_info) * NR_VECTORS, GFP_KERNEL); + if (! sn_msi_info) + return -ENOMEM; + + msi_register(&sn_msi_ops); + return 0; } diff --git a/include/asm-ia64/sn/intr.h b/include/asm-ia64/sn/intr.h index 60a51a406eec..12b54ddb06be 100644 --- a/include/asm-ia64/sn/intr.h +++ b/include/asm-ia64/sn/intr.h @@ -10,6 +10,7 @@ #define _ASM_IA64_SN_INTR_H #include +#include #define SGI_UART_VECTOR 0xe9 @@ -40,6 +41,7 @@ struct sn_irq_info { int irq_cpuid; /* kernel logical cpuid */ int irq_irq; /* the IRQ number */ int irq_int_bit; /* Bridge interrupt pin */ + /* <0 means MSI */ u64 irq_xtalkaddr; /* xtalkaddr IRQ is sent to */ int irq_bridge_type;/* pciio asic type (pciio.h) */ void *irq_bridge; /* bridge generating irq */ @@ -53,6 +55,12 @@ struct sn_irq_info { }; extern void sn_send_IPI_phys(int, long, int, int); +extern u64 sn_intr_alloc(nasid_t, int, + struct sn_irq_info *, + int, nasid_t, int); +extern void sn_intr_free(nasid_t, int, struct sn_irq_info *); +extern struct sn_irq_info *sn_retarget_vector(struct sn_irq_info *, nasid_t, int); +extern struct list_head **sn_irq_lh; #define CPU_VECTOR_TO_IRQ(cpuid,vector) (vector) diff --git a/include/asm-ia64/sn/pcibr_provider.h b/include/asm-ia64/sn/pcibr_provider.h index 51260ab70d91..e3b0c3fe5eed 100644 --- a/include/asm-ia64/sn/pcibr_provider.h +++ b/include/asm-ia64/sn/pcibr_provider.h @@ -55,6 +55,7 @@ #define PCI32_ATE_V (0x1 << 0) #define PCI32_ATE_CO (0x1 << 1) #define PCI32_ATE_PREC (0x1 << 2) +#define PCI32_ATE_MSI (0x1 << 2) #define PCI32_ATE_PREF (0x1 << 3) #define PCI32_ATE_BAR (0x1 << 4) #define PCI32_ATE_ADDR_SHFT 12 @@ -117,8 +118,8 @@ struct pcibus_info { extern int pcibr_init_provider(void); extern void *pcibr_bus_fixup(struct pcibus_bussoft *, struct pci_controller *); -extern dma_addr_t pcibr_dma_map(struct pci_dev *, unsigned long, size_t); -extern dma_addr_t pcibr_dma_map_consistent(struct pci_dev *, unsigned long, size_t); +extern dma_addr_t pcibr_dma_map(struct pci_dev *, unsigned long, size_t, int type); +extern dma_addr_t pcibr_dma_map_consistent(struct pci_dev *, unsigned long, size_t, int type); extern void pcibr_dma_unmap(struct pci_dev *, dma_addr_t, int); /* diff --git a/include/asm-ia64/sn/pcibus_provider_defs.h b/include/asm-ia64/sn/pcibus_provider_defs.h index ce3f6c328241..8f7c83d0f6d3 100644 --- a/include/asm-ia64/sn/pcibus_provider_defs.h +++ b/include/asm-ia64/sn/pcibus_provider_defs.h @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1992 - 1997, 2000-2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H #define _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H @@ -45,13 +45,24 @@ struct pci_controller; */ struct sn_pcibus_provider { - dma_addr_t (*dma_map)(struct pci_dev *, unsigned long, size_t); - dma_addr_t (*dma_map_consistent)(struct pci_dev *, unsigned long, size_t); + dma_addr_t (*dma_map)(struct pci_dev *, unsigned long, size_t, int flags); + dma_addr_t (*dma_map_consistent)(struct pci_dev *, unsigned long, size_t, int flags); void (*dma_unmap)(struct pci_dev *, dma_addr_t, int); void * (*bus_fixup)(struct pcibus_bussoft *, struct pci_controller *); void (*force_interrupt)(struct sn_irq_info *); void (*target_interrupt)(struct sn_irq_info *); }; +/* + * Flags used by the map interfaces + * bits 3:0 specifies format of passed in address + * bit 4 specifies that address is to be used for MSI + */ + +#define SN_DMA_ADDRTYPE(x) ((x) & 0xf) +#define SN_DMA_ADDR_PHYS 1 /* address is an xio address. */ +#define SN_DMA_ADDR_XIO 2 /* address is phys memory */ +#define SN_DMA_MSI 0x10 /* Bus address is to be used for MSI */ + extern struct sn_pcibus_provider *sn_pci_provider[]; #endif /* _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H */ diff --git a/include/asm-ia64/sn/tiocp.h b/include/asm-ia64/sn/tiocp.h index f47c08ab483c..e8ad0bb5b6c5 100644 --- a/include/asm-ia64/sn/tiocp.h +++ b/include/asm-ia64/sn/tiocp.h @@ -3,13 +3,14 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2003-2004 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2003-2005 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_IA64_SN_PCI_TIOCP_H #define _ASM_IA64_SN_PCI_TIOCP_H #define TIOCP_HOST_INTR_ADDR 0x003FFFFFFFFFFFFFUL #define TIOCP_PCI64_CMDTYPE_MEM (0x1ull << 60) +#define TIOCP_PCI64_CMDTYPE_MSI (0x3ull << 60) /***************************************************************************** -- cgit v1.2.3 From 4f1bcaf094ccc512c23e10104c05a6f8e5b7a9e4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 Jun 2006 14:47:32 -0700 Subject: [PATCH] vgacon: make VGA_MAP_MEM take size, remove extra use VGA_MAP_MEM translates to ioremap() on some architectures. It makes sense to do this to vga_vram_base, because we're going to access memory between vga_vram_base and vga_vram_end. But it doesn't really make sense to map starting at vga_vram_end, because we aren't going to access memory starting there. On ia64, which always has to be different, ioremapping vga_vram_end gives you something completely incompatible with ioremapped vga_vram_start, so vga_vram_size ends up being nonsense. As a bonus, we often know the size up front, so we can use ioremap() correctly, rather than giving it a zero size. Signed-off-by: Bjorn Helgaas Cc: "Antonino A. Daplas" Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/console/mdacon.c | 2 +- drivers/video/console/vgacon.c | 19 +++++++++---------- drivers/video/vga16fb.c | 2 +- include/asm-alpha/vga.h | 2 +- include/asm-arm/vga.h | 2 +- include/asm-i386/vga.h | 2 +- include/asm-ia64/vga.h | 2 +- include/asm-m32r/vga.h | 2 +- include/asm-mips/vga.h | 2 +- include/asm-powerpc/vga.h | 4 ++-- include/asm-sparc64/vga.h | 2 +- include/asm-x86_64/vga.h | 2 +- include/asm-xtensa/vga.h | 2 +- 13 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include/asm-ia64') diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c index 989e4d49e5bb..7f939d066a5a 100644 --- a/drivers/video/console/mdacon.c +++ b/drivers/video/console/mdacon.c @@ -313,8 +313,8 @@ static const char __init *mdacon_startup(void) mda_num_columns = 80; mda_num_lines = 25; - mda_vram_base = VGA_MAP_MEM(0xb0000); mda_vram_len = 0x01000; + mda_vram_base = VGA_MAP_MEM(0xb0000, mda_vram_len); mda_index_port = 0x3b4; mda_value_port = 0x3b5; diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index d5a04b68c4d4..e64d42e2449e 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -391,7 +391,7 @@ static const char __init *vgacon_startup(void) static struct resource ega_console_resource = { "ega", 0x3B0, 0x3BF }; vga_video_type = VIDEO_TYPE_EGAM; - vga_vram_end = 0xb8000; + vga_vram_size = 0x8000; display_desc = "EGA+"; request_resource(&ioport_resource, &ega_console_resource); @@ -401,7 +401,7 @@ static const char __init *vgacon_startup(void) static struct resource mda2_console_resource = { "mda", 0x3BF, 0x3BF }; vga_video_type = VIDEO_TYPE_MDA; - vga_vram_end = 0xb2000; + vga_vram_size = 0x2000; display_desc = "*MDA"; request_resource(&ioport_resource, &mda1_console_resource); @@ -418,7 +418,7 @@ static const char __init *vgacon_startup(void) if ((ORIG_VIDEO_EGA_BX & 0xff) != 0x10) { int i; - vga_vram_end = 0xc0000; + vga_vram_size = 0x8000; if (!ORIG_VIDEO_ISVGA) { static struct resource ega_console_resource @@ -443,7 +443,7 @@ static const char __init *vgacon_startup(void) * and COE=1 isn't necessarily a good idea) */ vga_vram_base = 0xa0000; - vga_vram_end = 0xb0000; + vga_vram_size = 0x10000; outb_p(6, VGA_GFX_I); outb_p(6, VGA_GFX_D); #endif @@ -475,7 +475,7 @@ static const char __init *vgacon_startup(void) static struct resource cga_console_resource = { "cga", 0x3D4, 0x3D5 }; vga_video_type = VIDEO_TYPE_CGA; - vga_vram_end = 0xba000; + vga_vram_size = 0x2000; display_desc = "*CGA"; request_resource(&ioport_resource, &cga_console_resource); @@ -483,9 +483,8 @@ static const char __init *vgacon_startup(void) } } - vga_vram_base = VGA_MAP_MEM(vga_vram_base); - vga_vram_end = VGA_MAP_MEM(vga_vram_end); - vga_vram_size = vga_vram_end - vga_vram_base; + vga_vram_base = VGA_MAP_MEM(vga_vram_base, vga_vram_size); + vga_vram_end = vga_vram_base + vga_vram_size; /* * Find out if there is a graphics card present. @@ -1020,14 +1019,14 @@ static int vgacon_do_font_op(struct vgastate *state,char *arg,int set,int ch512) char *charmap; if (vga_video_type != VIDEO_TYPE_EGAM) { - charmap = (char *) VGA_MAP_MEM(colourmap); + charmap = (char *) VGA_MAP_MEM(colourmap, 0); beg = 0x0e; #ifdef VGA_CAN_DO_64KB if (vga_video_type == VIDEO_TYPE_VGAC) beg = 0x06; #endif } else { - charmap = (char *) VGA_MAP_MEM(blackwmap); + charmap = (char *) VGA_MAP_MEM(blackwmap, 0); beg = 0x0a; } diff --git a/drivers/video/vga16fb.c b/drivers/video/vga16fb.c index f3f16fd9f231..4fd2a272e03d 100644 --- a/drivers/video/vga16fb.c +++ b/drivers/video/vga16fb.c @@ -1351,7 +1351,7 @@ static int __init vga16fb_probe(struct device *device) } /* XXX share VGA_FB_PHYS and I/O region with vgacon and others */ - info->screen_base = (void __iomem *)VGA_MAP_MEM(VGA_FB_PHYS); + info->screen_base = (void __iomem *)VGA_MAP_MEM(VGA_FB_PHYS, 0); if (!info->screen_base) { printk(KERN_ERR "vga16fb: unable to map device\n"); diff --git a/include/asm-alpha/vga.h b/include/asm-alpha/vga.h index 8ca4f6b2da19..ed06f59b544d 100644 --- a/include/asm-alpha/vga.h +++ b/include/asm-alpha/vga.h @@ -46,6 +46,6 @@ extern void scr_memcpyw(u16 *d, const u16 *s, unsigned int count); #define vga_readb(a) readb((u8 __iomem *)(a)) #define vga_writeb(v,a) writeb(v, (u8 __iomem *)(a)) -#define VGA_MAP_MEM(x) ((unsigned long) ioremap(x, 0)) +#define VGA_MAP_MEM(x,s) ((unsigned long) ioremap(x, s)) #endif diff --git a/include/asm-arm/vga.h b/include/asm-arm/vga.h index 926e5ee128e9..1e0b913c3d71 100644 --- a/include/asm-arm/vga.h +++ b/include/asm-arm/vga.h @@ -4,7 +4,7 @@ #include #include -#define VGA_MAP_MEM(x) (PCIMEM_BASE + (x)) +#define VGA_MAP_MEM(x,s) (PCIMEM_BASE + (x)) #define vga_readb(x) (*((volatile unsigned char *)x)) #define vga_writeb(x,y) (*((volatile unsigned char *)y) = (x)) diff --git a/include/asm-i386/vga.h b/include/asm-i386/vga.h index ef0c0e50cc95..0ecf68ac03aa 100644 --- a/include/asm-i386/vga.h +++ b/include/asm-i386/vga.h @@ -12,7 +12,7 @@ * access the videoram directly without any black magic. */ -#define VGA_MAP_MEM(x) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) diff --git a/include/asm-ia64/vga.h b/include/asm-ia64/vga.h index 091177cda223..02184ecd8208 100644 --- a/include/asm-ia64/vga.h +++ b/include/asm-ia64/vga.h @@ -17,7 +17,7 @@ extern unsigned long vga_console_iobase; extern unsigned long vga_console_membase; -#define VGA_MAP_MEM(x) ((unsigned long) ioremap_nocache(vga_console_membase + (x), 0)) +#define VGA_MAP_MEM(x,s) ((unsigned long) ioremap_nocache(vga_console_membase + (x), s)) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) diff --git a/include/asm-m32r/vga.h b/include/asm-m32r/vga.h index d0f4b6eed7a3..533163447cc9 100644 --- a/include/asm-m32r/vga.h +++ b/include/asm-m32r/vga.h @@ -14,7 +14,7 @@ * access the videoram directly without any black magic. */ -#define VGA_MAP_MEM(x) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) diff --git a/include/asm-mips/vga.h b/include/asm-mips/vga.h index 34755c0a6398..c1dd0b10bc27 100644 --- a/include/asm-mips/vga.h +++ b/include/asm-mips/vga.h @@ -13,7 +13,7 @@ * access the videoram directly without any black magic. */ -#define VGA_MAP_MEM(x) (0xb0000000L + (unsigned long)(x)) +#define VGA_MAP_MEM(x,s) (0xb0000000L + (unsigned long)(x)) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) diff --git a/include/asm-powerpc/vga.h b/include/asm-powerpc/vga.h index eadaf2f3d032..a2eac409c1ec 100644 --- a/include/asm-powerpc/vga.h +++ b/include/asm-powerpc/vga.h @@ -41,9 +41,9 @@ static inline u16 scr_readw(volatile const u16 *addr) extern unsigned long vgacon_remap_base; #ifdef __powerpc64__ -#define VGA_MAP_MEM(x) ((unsigned long) ioremap((x), 0)) +#define VGA_MAP_MEM(x,s) ((unsigned long) ioremap((x), s)) #else -#define VGA_MAP_MEM(x) (x + vgacon_remap_base) +#define VGA_MAP_MEM(x,s) (x + vgacon_remap_base) #endif #define vga_readb(x) (*(x)) diff --git a/include/asm-sparc64/vga.h b/include/asm-sparc64/vga.h index 9c57eb363b40..c69d5b2ba19a 100644 --- a/include/asm-sparc64/vga.h +++ b/include/asm-sparc64/vga.h @@ -28,6 +28,6 @@ static inline u16 scr_readw(const u16 *addr) return *addr; } -#define VGA_MAP_MEM(x) (x) +#define VGA_MAP_MEM(x,s) (x) #endif diff --git a/include/asm-x86_64/vga.h b/include/asm-x86_64/vga.h index ef0c0e50cc95..0ecf68ac03aa 100644 --- a/include/asm-x86_64/vga.h +++ b/include/asm-x86_64/vga.h @@ -12,7 +12,7 @@ * access the videoram directly without any black magic. */ -#define VGA_MAP_MEM(x) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) diff --git a/include/asm-xtensa/vga.h b/include/asm-xtensa/vga.h index 23d82f6acb57..1fd8cab3a297 100644 --- a/include/asm-xtensa/vga.h +++ b/include/asm-xtensa/vga.h @@ -11,7 +11,7 @@ #ifndef _XTENSA_VGA_H #define _XTENSA_VGA_H -#define VGA_MAP_MEM(x) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x,s) (unsigned long)phys_to_virt(x) #define vga_readb(x) (*(x)) #define vga_writeb(x,y) (*(y) = (x)) -- cgit v1.2.3 From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration | 29 ++--- arch/ia64/kernel/entry.S | 2 +- include/asm-ia64/unistd.h | 2 +- include/linux/migrate.h | 2 +- include/linux/syscalls.h | 5 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 4 +- mm/migrate.c | 268 +++++++++++++++++++++++++++++++++++++++- 8 files changed, 288 insertions(+), 25 deletions(-) (limited to 'include/asm-ia64') diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0a5d5fb18854..99f89aa10169 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package. Manual migration is useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator may detect the situation and move the pages of the process -nearer to the new processor. At some point in the future we may have -some mechanism in the scheduler that will automatically move the pages. +nearer to the new processor. The kernel itself does only provide +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to @@ -62,22 +67,14 @@ A. In kernel use of migrate_pages() It also prevents the swapper or other scans to encounter the page. -2. Generate a list of newly allocates pages. These pages will contain the - contents of the pages from the first list after page migration is - complete. +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. 3. The migrate_pages() function is called which attempts - to do the migration. It returns the moved pages in the - list specified as the third parameter and the failed - migrations in the fourth parameter. When the function - returns the first list will contain the pages that could still be retried. - -4. The leftover pages of various types are returned - to the LRU using putback_to_lru_pages() or otherwise - disposed of. The pages will still have the refcount as - increased by isolate_lru_pages() if putback_to_lru_pages() is not - used! The kernel may want to handle the various cases of failures in - different ways. + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. B. How migrate_pages() works ---------------------------- diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca5cf40..32c999f58d12 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1584,7 +1584,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 632f2eedf72c..bb0eb727dcd0 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -265,7 +265,7 @@ #define __NR_keyctl 1273 #define __NR_ioprio_set 1274 #define __NR_ioprio_get 1275 -/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_move_pages 1276 #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5b95d6568dc4..5dba23a1c0d0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,7 +3,7 @@ #include -typedef struct page *new_page_t(struct page *, unsigned long private); +typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bd67a4413df7..7e3f23490918 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..597229749dec 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e66..05b84acf0bb3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d158257..033a12f4c949 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ move_newpage: * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ out: return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif + -- cgit v1.2.3