diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-27 10:43:24 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-27 10:43:24 -0800 |
commit | 8d6973327ee84c2f40dd9efd8928d4a1186c96e2 (patch) | |
tree | 1c6accd71b6e9c4e05d5aaae766b958ad440d320 /drivers | |
parent | 6d101ba6be2a26a3e1f513b5e293f0fd2b79ec5c (diff) | |
parent | 12526b0d6c580df860b31e59d68e5696e16c6e5b (diff) | |
download | lwn-8d6973327ee84c2f40dd9efd8928d4a1186c96e2.tar.gz lwn-8d6973327ee84c2f40dd9efd8928d4a1186c96e2.zip |
Merge tag 'powerpc-4.21-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"Notable changes:
- Mitigations for Spectre v2 on some Freescale (NXP) CPUs.
- A large series adding support for pass-through of Nvidia V100 GPUs
to guests on Power9.
- Another large series to enable hardware assistance for TLB table
walk on MPC8xx CPUs.
- Some preparatory changes to our DMA code, to make way for further
cleanups from Christoph.
- Several fixes for our Transactional Memory handling discovered by
fuzzing the signal return path.
- Support for generating our system call table(s) from a text file,
as other architectures do.
- A fix to our page fault handler so that user accesses of kernel
addresses print a ratelimited and appropriately scary warning
instead of generating a WARN_ON_ONCE.
- A cosmetic change to make our unhandled page fault messages more
similar to other arches and also more compact and informative.
- Freescale updates from Scott:
"Highlights include elimination of legacy clock bindings use from
dts files, an 83xx watchdog handler, fixes to old dts interrupt
errors, and some minor cleanup."
And many clean-ups, reworks and minor fixes etc.
Thanks to: Alexandre Belloni, Alexey Kardashevskiy, Andrew Donnellan,
Aneesh Kumar K.V, Arnd Bergmann, Benjamin Herrenschmidt, Breno Leitao,
Christian Lamparter, Christophe Leroy, Christoph Hellwig, Daniel
Axtens, Darren Stevens, David Gibson, Diana Craciun, Dmitry V. Levin,
Firoz Khan, Geert Uytterhoeven, Greg Kurz, Gustavo Romero, Hari
Bathini, Joel Stanley, Kees Cook, Madhavan Srinivasan, Mahesh
Salgaonkar, Markus Elfring, Mathieu Malaterre, Michal Suchánek, Naveen
N. Rao, Nick Desaulniers, Oliver O'Halloran, Paul Mackerras, Ram Pai,
Ravi Bangoria, Rob Herring, Russell Currey, Sabyasachi Gupta, Sam
Bobroff, Satheesh Rajendran, Scott Wood, Segher Boessenkool, Stephen
Rothwell, Tang Yuantian, Thiago Jung Bauermann, Yangtao Li, Yuantian
Tang, Yue Haibing"
* tag 'powerpc-4.21-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (201 commits)
Revert "powerpc/fsl_pci: simplify fsl_pci_dma_set_mask"
powerpc/zImage: Also check for stdout-path
powerpc: Fix HMIs on big-endian with CONFIG_RELOCATABLE=y
macintosh: Use of_node_name_{eq, prefix} for node name comparisons
ide: Use of_node_name_eq for node name comparisons
powerpc: Use of_node_name_eq for node name comparisons
powerpc/pseries/pmem: Convert to %pOFn instead of device_node.name
powerpc/mm: Remove very old comment in hash-4k.h
powerpc/pseries: Fix node leak in update_lmb_associativity_index()
powerpc/configs/85xx: Enable CONFIG_DEBUG_KERNEL
powerpc/dts/fsl: Fix dtc-flagged interrupt errors
clk: qoriq: add more compatibles strings
powerpc/fsl: Use new clockgen binding
powerpc/83xx: handle machine check caused by watchdog timer
powerpc/fsl-rio: fix spelling mistake "reserverd" -> "reserved"
powerpc/fsl_pci: simplify fsl_pci_dma_set_mask
arch/powerpc/fsl_rmu: Use dma_zalloc_coherent
vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver
vfio_pci: Allow regions to add own capabilities
vfio_pci: Allow mapping extra regions
...
Diffstat (limited to 'drivers')
26 files changed, 778 insertions, 127 deletions
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 61ae06ca008e..52f0d91d30c1 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -128,7 +128,7 @@ static int cpu_750fx_cpu_speed(int low_speed) mtspr(SPRN_HID2, hid2); } } -#ifdef CONFIG_6xx +#ifdef CONFIG_PPC_BOOK3S_32 low_choose_750fx_pll(low_speed); #endif if (low_speed == 1) { @@ -166,7 +166,7 @@ static int dfs_set_cpu_speed(int low_speed) } /* set frequency */ -#ifdef CONFIG_6xx +#ifdef CONFIG_PPC_BOOK3S_32 low_choose_7447a_dfs(low_speed); #endif udelay(100); diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 9e56bc411061..74c247972bb3 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -247,7 +247,13 @@ static int pseries_idle_probe(void) return -ENODEV; if (firmware_has_feature(FW_FEATURE_SPLPAR)) { - if (lppaca_shared_proc(get_lppaca())) { + /* + * Use local_paca instead of get_lppaca() since + * preemption is not disabled, and it is not required in + * fact, since lppaca_ptr does not need to be the value + * associated to the current CPU, it can be from any CPU. 
+ */ + if (lppaca_shared_proc(local_paca->lppaca_ptr)) { cpuidle_state_table = shared_states; max_idle_state = ARRAY_SIZE(shared_states); } else { diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 6eaec9ba0f68..63cb6956c948 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -596,7 +596,7 @@ static void crypto4xx_aead_done(struct crypto4xx_device *dev, pd->pd_ctl_len.bf.pkt_len, dst); } else { - __dma_sync_page(sg_page(dst), dst->offset, dst->length, + dma_unmap_page(dev->core_dev->device, pd->dest, dst->length, DMA_FROM_DEVICE); } diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index 203ed4adc04a..92f840365718 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1046,7 +1046,7 @@ static int pmac_ide_setup_device(pmac_ide_hwif_t *pmif, struct ide_hw *hw) d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "keylargo-ata")) { - if (strcmp(np->name, "ata-4") == 0) { + if (of_node_name_eq(np, "ata-4")) { pmif->kind = controller_kl_ata4; d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA4; diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c index c8e078b911c7..ef0c2366cf59 100644 --- a/drivers/macintosh/ans-lcd.c +++ b/drivers/macintosh/ans-lcd.c @@ -160,7 +160,7 @@ anslcd_init(void) struct device_node* node; node = of_find_node_by_name(NULL, "lcd"); - if (!node || !node->parent || strcmp(node->parent->name, "gc")) { + if (!node || !of_node_name_eq(node->parent, "gc")) { of_node_put(node); return -ENODEV; } diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 17d3bc917562..3543a82081de 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -190,11 +190,11 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res, return 0; /* Grand Central has too large resource 0 on some machines */ - if (index == 0 && 
!strcmp(np->name, "gc")) + if (index == 0 && of_node_name_eq(np, "gc")) res->end = res->start + 0x1ffff; /* Airport has bogus resource 2 */ - if (index >= 2 && !strcmp(np->name, "radio")) + if (index >= 2 && of_node_name_eq(np, "radio")) return 1; #ifndef CONFIG_PPC64 @@ -207,21 +207,21 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res, * level of hierarchy, but I don't really feel the need * for it */ - if (!strcmp(np->name, "escc")) + if (of_node_name_eq(np, "escc")) return 1; /* ESCC has bogus resources >= 3 */ - if (index >= 3 && !(strcmp(np->name, "ch-a") && - strcmp(np->name, "ch-b"))) + if (index >= 3 && (of_node_name_eq(np, "ch-a") || + of_node_name_eq(np, "ch-b"))) return 1; /* Media bay has too many resources, keep only first one */ - if (index > 0 && !strcmp(np->name, "media-bay")) + if (index > 0 && of_node_name_eq(np, "media-bay")) return 1; /* Some older IDE resources have bogus sizes */ - if (!(strcmp(np->name, "IDE") && strcmp(np->name, "ATA") && - strcmp(np->type, "ide") && strcmp(np->type, "ata"))) { + if (of_node_name_eq(np, "IDE") || of_node_name_eq(np, "ATA") || + of_node_is_type(np, "ide") || of_node_is_type(np, "ata")) { if (index == 0 && (res->end - res->start) > 0xfff) res->end = res->start + 0xfff; if (index == 1 && (res->end - res->start) > 0xff) @@ -260,7 +260,7 @@ static void macio_add_missing_resources(struct macio_dev *dev) irq_base = 64; /* Fix SCC */ - if (strcmp(np->name, "ch-a") == 0) { + if (of_node_name_eq(np, "ch-a")) { macio_create_fixup_irq(dev, 0, 15 + irq_base); macio_create_fixup_irq(dev, 1, 4 + irq_base); macio_create_fixup_irq(dev, 2, 5 + irq_base); @@ -268,18 +268,18 @@ static void macio_add_missing_resources(struct macio_dev *dev) } /* Fix media-bay */ - if (strcmp(np->name, "media-bay") == 0) { + if (of_node_name_eq(np, "media-bay")) { macio_create_fixup_irq(dev, 0, 29 + irq_base); printk(KERN_INFO "macio: fixed media-bay irq on gatwick\n"); } /* Fix left media bay childs */ - if 
(dev->media_bay != NULL && strcmp(np->name, "floppy") == 0) { + if (dev->media_bay != NULL && of_node_name_eq(np, "floppy")) { macio_create_fixup_irq(dev, 0, 19 + irq_base); macio_create_fixup_irq(dev, 1, 1 + irq_base); printk(KERN_INFO "macio: fixed left floppy irqs\n"); } - if (dev->media_bay != NULL && strcasecmp(np->name, "ata4") == 0) { + if (dev->media_bay != NULL && of_node_name_eq(np, "ata4")) { macio_create_fixup_irq(dev, 0, 14 + irq_base); macio_create_fixup_irq(dev, 0, 3 + irq_base); printk(KERN_INFO "macio: fixed left ide irqs\n"); @@ -438,11 +438,8 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, static int macio_skip_device(struct device_node *np) { - if (strncmp(np->name, "battery", 7) == 0) - return 1; - if (strncmp(np->name, "escc-legacy", 11) == 0) - return 1; - return 0; + return of_node_name_prefix(np, "battery") || + of_node_name_prefix(np, "escc-legacy"); } /** @@ -489,9 +486,9 @@ static void macio_pci_add_devices(struct macio_chip *chip) root_res); if (mdev == NULL) of_node_put(np); - else if (strncmp(np->name, "media-bay", 9) == 0) + else if (of_node_name_prefix(np, "media-bay")) mbdev = mdev; - else if (strncmp(np->name, "escc", 4) == 0) + else if (of_node_name_prefix(np, "escc")) sdev = mdev; } diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c index d2451e58acb9..27f5eefc508f 100644 --- a/drivers/macintosh/macio_sysfs.c +++ b/drivers/macintosh/macio_sysfs.c @@ -3,17 +3,6 @@ #include <linux/stat.h> #include <asm/macio.h> - -#define macio_config_of_attr(field, format_string) \ -static ssize_t \ -field##_show (struct device *dev, struct device_attribute *attr, \ - char *buf) \ -{ \ - struct macio_dev *mdev = to_macio_device (dev); \ - return sprintf (buf, format_string, mdev->ofdev.dev.of_node->field); \ -} \ -static DEVICE_ATTR_RO(field); - static ssize_t compatible_show (struct device *dev, struct device_attribute *attr, char *buf) { @@ -65,7 +54,12 @@ static ssize_t name_show(struct 
device *dev, } static DEVICE_ATTR_RO(name); -macio_config_of_attr (type, "%s\n"); +static ssize_t type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", of_node_get_device_type(dev->of_node)); +} +static DEVICE_ATTR_RO(type); static struct attribute *macio_dev_attrs[] = { &dev_attr_name.attr, diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 1f29d2413c74..3940e2a032f7 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -376,18 +376,19 @@ static int rackmeter_probe(struct macio_dev* mdev, pr_debug("rackmeter_probe()\n"); /* Get i2s-a node */ - while ((i2s = of_get_next_child(mdev->ofdev.dev.of_node, i2s)) != NULL) - if (strcmp(i2s->name, "i2s-a") == 0) - break; + for_each_child_of_node(mdev->ofdev.dev.of_node, i2s) + if (of_node_name_eq(i2s, "i2s-a")) + break; + if (i2s == NULL) { pr_debug(" i2s-a child not found\n"); goto bail; } /* Get lightshow or virtual sound */ - while ((np = of_get_next_child(i2s, np)) != NULL) { - if (strcmp(np->name, "lightshow") == 0) + for_each_child_of_node(i2s, np) { + if (of_node_name_eq(np, "lightshow")) break; - if ((strcmp(np->name, "sound") == 0) && + if (of_node_name_eq(np, "sound") && of_get_property(np, "virtual", NULL) != NULL) break; } diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 60f57e2abf21..ac0cf37d6239 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -318,8 +318,8 @@ int __init find_via_pmu(void) PMU_INT_ADB | PMU_INT_TICK; - if (vias->parent->name && ((strcmp(vias->parent->name, "ohare") == 0) - || of_device_is_compatible(vias->parent, "ohare"))) + if (of_node_name_eq(vias->parent, "ohare") || + of_device_is_compatible(vias->parent, "ohare")) pmu_kind = PMU_OHARE_BASED; else if (of_device_is_compatible(vias->parent, "paddington")) pmu_kind = PMU_PADDINGTON_BASED; diff --git a/drivers/macintosh/windfarm_fcu_controls.c 
b/drivers/macintosh/windfarm_fcu_controls.c index fab7a21e9577..629f19875d7f 100644 --- a/drivers/macintosh/windfarm_fcu_controls.c +++ b/drivers/macintosh/windfarm_fcu_controls.c @@ -425,25 +425,25 @@ static void wf_fcu_lookup_fans(struct wf_fcu_priv *pv) { "CPU B 2", "cpu-fan-b-1", }, { "CPU B 3", "cpu-fan-c-1", }, }; - struct device_node *np = NULL, *fcu = pv->i2c->dev.of_node; + struct device_node *np, *fcu = pv->i2c->dev.of_node; int i; DBG("Looking up FCU controls in device-tree...\n"); - while ((np = of_get_next_child(fcu, np)) != NULL) { + for_each_child_of_node(fcu, np) { int id, type = -1; const char *loc; const char *name; const u32 *reg; - DBG(" control: %s, type: %s\n", np->name, np->type); + DBG(" control: %pOFn, type: %s\n", np, of_node_get_device_type(np)); /* Detect control type */ - if (!strcmp(np->type, "fan-rpm-control") || - !strcmp(np->type, "fan-rpm")) + if (of_node_is_type(np, "fan-rpm-control") || + of_node_is_type(np, "fan-rpm")) type = FCU_FAN_RPM; - if (!strcmp(np->type, "fan-pwm-control") || - !strcmp(np->type, "fan-pwm")) + if (of_node_is_type(np, "fan-pwm-control") || + of_node_is_type(np, "fan-pwm")) type = FCU_FAN_PWM; /* Only care about fans for now */ if (type == -1) diff --git a/drivers/macintosh/windfarm_lm87_sensor.c b/drivers/macintosh/windfarm_lm87_sensor.c index 35aa571d498a..09724acd70b6 100644 --- a/drivers/macintosh/windfarm_lm87_sensor.c +++ b/drivers/macintosh/windfarm_lm87_sensor.c @@ -110,8 +110,8 @@ static int wf_lm87_probe(struct i2c_client *client, * the Xserve G5 has several lm87's. 
However, for now we only * care about the internal temperature sensor */ - while ((np = of_get_next_child(client->dev.of_node, np)) != NULL) { - if (strcmp(np->name, "int-temp")) + for_each_child_of_node(client->dev.of_node, np) { + if (!of_node_name_eq(np, "int-temp")) continue; loc = of_get_property(np, "location", NULL); if (!loc) diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index 86d65462a61c..2cb9652a9998 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -267,7 +267,7 @@ static int __init smu_controls_init(void) /* Look for RPM fans */ for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) - if (!strcmp(fans->name, "rpm-fans") || + if (of_node_name_eq(fans, "rpm-fans") || of_device_is_compatible(fans, "smu-rpm-fans")) break; for (fan = NULL; @@ -287,7 +287,7 @@ static int __init smu_controls_init(void) /* Look for PWM fans */ for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) - if (!strcmp(fans->name, "pwm-fans")) + if (of_node_name_eq(fans, "pwm-fans")) break; for (fan = NULL; fans && (fan = of_get_next_child(fans, fan)) != NULL;) { diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c index a0f61eb853c5..b4be718beba8 100644 --- a/drivers/macintosh/windfarm_smu_sat.c +++ b/drivers/macintosh/windfarm_smu_sat.c @@ -197,7 +197,7 @@ static int wf_sat_probe(struct i2c_client *client, struct wf_sat *sat; struct wf_sat_sensor *sens; const u32 *reg; - const char *loc, *type; + const char *loc; u8 chip, core; struct device_node *child; int shift, cpu, index; @@ -220,7 +220,6 @@ static int wf_sat_probe(struct i2c_client *client, child = NULL; while ((child = of_get_next_child(dev, child)) != NULL) { reg = of_get_property(child, "reg", NULL); - type = of_get_property(child, "device_type", NULL); loc = of_get_property(child, "location", NULL); if (reg == NULL || loc == NULL) continue; @@ -249,15 +248,15 
@@ static int wf_sat_probe(struct i2c_client *client, continue; } - if (strcmp(type, "voltage-sensor") == 0) { + if (of_node_is_type(child, "voltage-sensor")) { name = "cpu-voltage"; shift = 4; vsens[core] = index; - } else if (strcmp(type, "current-sensor") == 0) { + } else if (of_node_is_type(child, "current-sensor")) { name = "cpu-current"; shift = 8; isens[core] = index; - } else if (strcmp(type, "temp-sensor") == 0) { + } else if (of_node_is_type(child, "temp-sensor")) { name = "cpu-temp"; shift = 10; } else diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c index 172fd267dcf6..a58f6733381a 100644 --- a/drivers/macintosh/windfarm_smu_sensors.c +++ b/drivers/macintosh/windfarm_smu_sensors.c @@ -197,15 +197,14 @@ static const struct wf_sensor_ops smu_slotspow_ops = { static struct smu_ad_sensor *smu_ads_create(struct device_node *node) { struct smu_ad_sensor *ads; - const char *c, *l; + const char *l; const u32 *v; ads = kmalloc(sizeof(struct smu_ad_sensor), GFP_KERNEL); if (ads == NULL) return NULL; - c = of_get_property(node, "device_type", NULL); l = of_get_property(node, "location", NULL); - if (c == NULL || l == NULL) + if (l == NULL) goto fail; /* We currently pick the sensors based on the OF name and location @@ -215,7 +214,7 @@ static struct smu_ad_sensor *smu_ads_create(struct device_node *node) * the names and locations consistents so I'll stick with the names * and locations for now. 
*/ - if (!strcmp(c, "temp-sensor") && + if (of_node_is_type(node, "temp-sensor") && !strcmp(l, "CPU T-Diode")) { ads->sens.ops = &smu_cputemp_ops; ads->sens.name = "cpu-temp"; @@ -224,7 +223,7 @@ static struct smu_ad_sensor *smu_ads_create(struct device_node *node) SMU_SDB_CPUDIODE_ID); goto fail; } - } else if (!strcmp(c, "current-sensor") && + } else if (of_node_is_type(node, "current-sensor") && !strcmp(l, "CPU Current")) { ads->sens.ops = &smu_cpuamp_ops; ads->sens.name = "cpu-current"; @@ -233,7 +232,7 @@ static struct smu_ad_sensor *smu_ads_create(struct device_node *node) SMU_SDB_CPUVCP_ID); goto fail; } - } else if (!strcmp(c, "voltage-sensor") && + } else if (of_node_is_type(node, "voltage-sensor") && !strcmp(l, "CPU Voltage")) { ads->sens.ops = &smu_cpuvolt_ops; ads->sens.name = "cpu-voltage"; @@ -242,7 +241,7 @@ static struct smu_ad_sensor *smu_ads_create(struct device_node *node) SMU_SDB_CPUVCP_ID); goto fail; } - } else if (!strcmp(c, "power-sensor") && + } else if (of_node_is_type(node, "power-sensor") && !strcmp(l, "Slots Power")) { ads->sens.ops = &smu_slotspow_ops; ads->sens.name = "slots-power"; @@ -425,7 +424,7 @@ static int __init smu_sensors_init(void) /* Look for sensors subdir */ for (sensors = NULL; (sensors = of_get_next_child(smu, sensors)) != NULL;) - if (!strcmp(sensors->name, "sensors")) + if (of_node_name_eq(sensors, "sensors")) break; of_node_put(smu); diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index b66d832d3233..c79ba1c699ad 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1718,7 +1718,6 @@ int cxl_slot_is_switched(struct pci_dev *dev) { struct device_node *np; int depth = 0; - const __be32 *prop; if (!(np = pci_device_to_OF_node(dev))) { pr_err("cxl: np = NULL\n"); @@ -1727,8 +1726,7 @@ int cxl_slot_is_switched(struct pci_dev *dev) of_node_get(np); while (np) { np = of_get_next_parent(np); - prop = of_get_property(np, "device_type", NULL); - if (!prop || strcmp((char *)prop, "pciex")) + if 
(!of_node_is_type(np, "pciex")) break; depth++; } diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index 7908633d9204..49da2f744bbf 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -11,17 +11,6 @@ #include <misc/cxl.h> #include "cxl.h" -static int cxl_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) -{ - if (dma_mask < DMA_BIT_MASK(64)) { - pr_info("%s only 64bit DMA supported on CXL", __func__); - return -EIO; - } - - *(pdev->dev.dma_mask) = dma_mask; - return 0; -} - static int cxl_pci_probe_mode(struct pci_bus *bus) { return PCI_PROBE_NORMAL; @@ -220,7 +209,6 @@ static struct pci_controller_ops cxl_pci_controller_ops = .reset_secondary_bus = cxl_pci_reset_secondary_bus, .setup_msi_irqs = cxl_setup_msi_irqs, .teardown_msi_irqs = cxl_teardown_msi_irqs, - .dma_set_mask = cxl_dma_set_mask, }; int cxl_pci_vphb_add(struct cxl_afu *afu) diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c index e70cfa24577f..11ab996657a2 100644 --- a/drivers/misc/ocxl/afu_irq.c +++ b/drivers/misc/ocxl/afu_irq.c @@ -2,7 +2,6 @@ // Copyright 2017 IBM Corp. 
#include <linux/interrupt.h> #include <linux/eventfd.h> -#include <asm/pnv-ocxl.h> #include "ocxl_internal.h" #include "trace.h" diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index 57a6bb1fd3c9..8f2c5d8bd2ee 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -318,7 +318,7 @@ static int read_afu_name(struct pci_dev *dev, struct ocxl_fn_config *fn, if (rc) return rc; ptr = (u32 *) &afu->name[i]; - *ptr = val; + *ptr = le32_to_cpu((__force __le32) val); } afu->name[OCXL_AFU_NAME_SZ - 1] = '\0'; /* play safe */ return 0; diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 31695a078485..d50b861d7e57 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -273,9 +273,9 @@ static int setup_xsl_irq(struct pci_dev *dev, struct link *link) spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x", link->domain, link->bus, link->dev); if (!spa->irq_name) { - unmap_irq_registers(spa); dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n"); - return -ENOMEM; + rc = -ENOMEM; + goto err_xsl; } /* * At some point, we'll need to look into allowing a higher @@ -283,11 +283,10 @@ static int setup_xsl_irq(struct pci_dev *dev, struct link *link) */ spa->virq = irq_create_mapping(NULL, hwirq); if (!spa->virq) { - kfree(spa->irq_name); - unmap_irq_registers(spa); dev_err(&dev->dev, "irq_create_mapping failed for translation interrupt\n"); - return -EINVAL; + rc = -EINVAL; + goto err_name; } dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq); @@ -295,15 +294,21 @@ static int setup_xsl_irq(struct pci_dev *dev, struct link *link) rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name, link); if (rc) { - irq_dispose_mapping(spa->virq); - kfree(spa->irq_name); - unmap_irq_registers(spa); dev_err(&dev->dev, "request_irq failed for translation interrupt: %d\n", rc); - return -EINVAL; + rc = -EINVAL; + goto err_mapping; } return 0; + +err_mapping: + irq_dispose_mapping(spa->virq); 
+err_name: + kfree(spa->irq_name); +err_xsl: + unmap_irq_registers(spa); + return rc; } static void release_xsl_irq(struct link *link) @@ -566,7 +571,7 @@ int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) mutex_lock(&spa->spa_lock); - pe->tid = tid; + pe->tid = cpu_to_be32(tid); /* * The barrier makes sure the PE is updated diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 42dc1d3d71cf..d0f8e4f5a039 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -38,3 +38,9 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. + +config VFIO_PCI_NVLINK2 + def_bool y + depends on VFIO_PCI && PPC_POWERNV + help + VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec058edd..9662c063a6b1 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o +vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h new file mode 100644 index 000000000000..228ccdb8d1c8 --- /dev/null +++ b/drivers/vfio/pci/trace.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * VFIO PCI mmap/mmap_fault tracepoints + * + * Copyright (C) 2018 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <aik@ozlabs.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfio_pci + +#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFIO_PCI_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(vfio_pci_nvgpu_mmap_fault, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + vm_fault_t ret), + TP_ARGS(pdev, hpa, ua, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->ret) +); + +TRACE_EVENT(vfio_pci_nvgpu_mmap, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + unsigned long size, int ret), + TP_ARGS(pdev, hpa, ua, size, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(unsigned long, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->size = size; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->size, __entry->ret) +); + +TRACE_EVENT(vfio_pci_npu2_mmap, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + unsigned long size, int ret), + TP_ARGS(pdev, hpa, ua, size, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(unsigned long, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->size = size; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->size, __entry->ret) +); + +#endif /* _TRACE_VFIO_PCI_H */ + 
+#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 50cdedfca9fe..a89fa5d4e877 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -289,14 +289,37 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (ret) { dev_warn(&vdev->pdev->dev, "Failed to setup Intel IGD regions\n"); - vfio_pci_disable(vdev); - return ret; + goto disable_exit; + } + } + + if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && + IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { + ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); + if (ret && ret != -ENODEV) { + dev_warn(&vdev->pdev->dev, + "Failed to setup NVIDIA NV2 RAM region\n"); + goto disable_exit; + } + } + + if (pdev->vendor == PCI_VENDOR_ID_IBM && + IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { + ret = vfio_pci_ibm_npu2_init(vdev); + if (ret && ret != -ENODEV) { + dev_warn(&vdev->pdev->dev, + "Failed to setup NVIDIA NV2 ATSD region\n"); + goto disable_exit; } } vfio_pci_probe_mmaps(vdev); return 0; + +disable_exit: + vfio_pci_disable(vdev); + return ret; } static void vfio_pci_disable(struct vfio_pci_device *vdev) @@ -750,6 +773,12 @@ static long vfio_pci_ioctl(void *device_data, if (ret) return ret; + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability(vdev, + &vdev->region[i], &caps); + if (ret) + return ret; + } } } @@ -1117,6 +1146,15 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return -EINVAL; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vdev->region + regnum; + + if (region && region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(vdev, region, vma); + return 
-EINVAL; + } if (index >= VFIO_PCI_ROM_REGION_INDEX) return -EINVAL; if (!vdev->bar_mmap_supported[index]) diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c new file mode 100644 index 000000000000..054a2cf9dd8e --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. + * + * Copyright (C) 2018 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <aik@ozlabs.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Register an on-GPU RAM region for cacheable access. + * + * Derived from original vfio_pci_igd.c: + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + */ + +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/sched/mm.h> +#include <linux/mmu_context.h> +#include <asm/kvm_ppc.h> +#include "vfio_pci_private.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); + +struct vfio_pci_nvgpu_data { + unsigned long gpu_hpa; /* GPU RAM physical address */ + unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ + unsigned long useraddr; /* GPU RAM userspace address */ + unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ + struct mm_struct *mm; + struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. 
*/ + struct pci_dev *gpdev; + struct notifier_block group_notifier; +}; + +static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_nvgpu_data *data = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; + size_t sizealigned; + void __iomem *ptr; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + /* + * We map only a bit of GPU RAM for a short time instead of mapping it + * for the guest lifetime as: + * + * 1) we do not know GPU RAM size, only aperture which is 4-8 times + * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); + * 2) mapping GPU RAM allows CPU to prefetch and if this happens + * before NVLink bridge is reset (which fences GPU RAM), + * hardware management interrupts (HMI) might happen, this + * will freeze NVLink bridge. + * + * This is not fast path anyway. + */ + sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE); + ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); + if (!ptr) + return -EFAULT; + + if (iswrite) { + if (copy_from_user(ptr + posoff, buf, count)) + count = -EFAULT; + else + *ppos += count; + } else { + if (copy_to_user(buf, ptr + posoff, count)) + count = -EFAULT; + else + *ppos += count; + } + + iounmap(ptr); + + return count; +} + +static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_nvgpu_data *data = region->data; + long ret; + + /* If there were any mappings at all... 
*/ + if (data->mm) { + ret = mm_iommu_put(data->mm, data->mem); + WARN_ON(ret); + + mmdrop(data->mm); + } + + vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &data->group_notifier); + + pnv_npu2_unmap_lpar_dev(data->gpdev); + + kfree(data); +} + +static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) +{ + vm_fault_t ret; + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_region *region = vma->vm_private_data; + struct vfio_pci_nvgpu_data *data = region->data; + unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; + unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; + unsigned long vm_pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + unsigned long pfn = nv2pg + vm_pgoff + vmf_off; + + ret = vmf_insert_pfn(vma, vmf->address, pfn); + trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, + vmf->address, ret); + + return ret; +} + +static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { + .fault = vfio_pci_nvgpu_mmap_fault, +}; + +static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vm_area_struct *vma) +{ + int ret; + struct vfio_pci_nvgpu_data *data = region->data; + + if (data->useraddr) + return -EPERM; + + if (vma->vm_end - vma->vm_start > data->size) + return -EINVAL; + + vma->vm_private_data = region; + vma->vm_flags |= VM_PFNMAP; + vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; + + /* + * Calling mm_iommu_newdev() here once as the region is not + * registered yet and therefore right initialization will happen now. + * Other places will use mm_iommu_find() which returns + * registered @mem and does not go gup(). 
+ */ + data->useraddr = vma->vm_start; + data->mm = current->mm; + + atomic_inc(&data->mm->mm_count); + ret = (int) mm_iommu_newdev(data->mm, data->useraddr, + (vma->vm_end - vma->vm_start) >> PAGE_SHIFT, + data->gpu_hpa, &data->mem); + + trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, + vma->vm_end - vma->vm_start, ret); + + return ret; +} + +static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vfio_info_cap *caps) +{ + struct vfio_pci_nvgpu_data *data = region->data; + struct vfio_region_info_cap_nvlink2_ssatgt cap = { 0 }; + + cap.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT; + cap.header.version = 1; + cap.tgt = data->gpu_tgt; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { + .rw = vfio_pci_nvgpu_rw, + .release = vfio_pci_nvgpu_release, + .mmap = vfio_pci_nvgpu_mmap, + .add_capability = vfio_pci_nvgpu_add_capability, +}; + +static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, + unsigned long action, void *opaque) +{ + struct kvm *kvm = opaque; + struct vfio_pci_nvgpu_data *data = container_of(nb, + struct vfio_pci_nvgpu_data, + group_notifier); + + if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && + pnv_npu2_map_lpar_dev(data->gpdev, + kvm->arch.lpid, MSR_DR | MSR_PR)) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) +{ + int ret; + u64 reg[2]; + u64 tgt = 0; + struct device_node *npu_node, *mem_node; + struct pci_dev *npu_dev; + struct vfio_pci_nvgpu_data *data; + uint32_t mem_phandle = 0; + unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; + + /* + * PCI config space does not tell us about NVLink presense but + * platform does, use this. 
+ */ + npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); + if (!npu_dev) + return -ENODEV; + + npu_node = pci_device_to_OF_node(npu_dev); + if (!npu_node) + return -EINVAL; + + if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) + return -EINVAL; + + mem_node = of_find_node_by_phandle(mem_phandle); + if (!mem_node) + return -EINVAL; + + if (of_property_read_variable_u64_array(mem_node, "reg", reg, + ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != + ARRAY_SIZE(reg)) + return -EINVAL; + + if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { + dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); + return -EFAULT; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->gpu_hpa = reg[0]; + data->gpu_tgt = tgt; + data->size = reg[1]; + + dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, + data->gpu_hpa + data->size - 1); + + data->gpdev = vdev->pdev; + data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; + + ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &events, &data->group_notifier); + if (ret) + goto free_exit; + + /* + * We have just set KVM, we do not need the listener anymore. + * Also, keeping it registered means that if more than one GPU is + * assigned, we will get several similar notifiers notifying about + * the same device again which does not help with anything. 
+ */ + vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &data->group_notifier); + + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, + &vfio_pci_nvgpu_regops, + data->size, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, + data); + if (ret) + goto free_exit; + + return 0; +free_exit: + kfree(data); + + return ret; +} + +/* + * IBM NPU2 bridge + */ +struct vfio_pci_npu2_data { + void *base; /* ATSD register virtual address, for emulated access */ + unsigned long mmio_atsd; /* ATSD physical address */ + unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ + unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ +}; + +static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_npu2_data *data = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + if (iswrite) { + if (copy_from_user(data->base + pos, buf, count)) + return -EFAULT; + } else { + if (copy_to_user(buf, data->base + pos, count)) + return -EFAULT; + } + *ppos += count; + + return count; +} + +static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vm_area_struct *vma) +{ + int ret; + struct vfio_pci_npu2_data *data = region->data; + unsigned long req_len = vma->vm_end - vma->vm_start; + + if (req_len != PAGE_SIZE) + return -EINVAL; + + vma->vm_flags |= VM_PFNMAP; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, + req_len, vma->vm_page_prot); + trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, 
vma->vm_start, + vma->vm_end - vma->vm_start, ret); + + return ret; +} + +static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_npu2_data *data = region->data; + + memunmap(data->base); + kfree(data); +} + +static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vfio_info_cap *caps) +{ + struct vfio_pci_npu2_data *data = region->data; + struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 0 }; + struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 0 }; + int ret; + + captgt.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT; + captgt.header.version = 1; + captgt.tgt = data->gpu_tgt; + + capspd.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD; + capspd.header.version = 1; + capspd.link_speed = data->link_speed; + + ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); + if (ret) + return ret; + + return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); +} + +static const struct vfio_pci_regops vfio_pci_npu2_regops = { + .rw = vfio_pci_npu2_rw, + .mmap = vfio_pci_npu2_mmap, + .release = vfio_pci_npu2_release, + .add_capability = vfio_pci_npu2_add_capability, +}; + +int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) +{ + int ret; + struct vfio_pci_npu2_data *data; + struct device_node *nvlink_dn; + u32 nvlink_index = 0; + struct pci_dev *npdev = vdev->pdev; + struct device_node *npu_node = pci_device_to_OF_node(npdev); + struct pci_controller *hose = pci_bus_to_host(npdev->bus); + u64 mmio_atsd = 0; + u64 tgt = 0; + u32 link_speed = 0xff; + + /* + * PCI config space does not tell us about NVLink presense but + * platform does, use this. + */ + if (!pnv_pci_get_gpu_dev(vdev->pdev)) + return -ENODEV; + + /* + * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links + * so we can allocate one register per link, using nvlink index as + * a key. 
+ * There is always at least one ATSD register so as long as at least + * NVLink bridge #0 is passed to the guest, ATSD will be available. + */ + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return -ENODEV; + + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, + &mmio_atsd)) { + dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); + mmio_atsd = 0; + } + + if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { + dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); + return -EFAULT; + } + + if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { + dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); + return -EFAULT; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->mmio_atsd = mmio_atsd; + data->gpu_tgt = tgt; + data->link_speed = link_speed; + if (data->mmio_atsd) { + data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); + if (!data->base) { + ret = -ENOMEM; + goto free_exit; + } + } + + /* + * We want to expose the capability even if this specific NVLink + * did not get its own ATSD register because capabilities + * belong to VFIO regions and normally there will be ATSD register + * assigned to the NVLink bridge. + */ + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_IBM | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, + &vfio_pci_npu2_regops, + data->mmio_atsd ? 
PAGE_SIZE : 0, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, + data); + if (ret) + goto free_exit; + + return 0; + +free_exit: + kfree(data); + + return ret; +} diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index cde3b5d3441a..127071b84dd7 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -59,6 +59,12 @@ struct vfio_pci_regops { size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); + int (*mmap)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma); + int (*add_capability)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps); }; struct vfio_pci_region { @@ -157,4 +163,18 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif +#ifdef CONFIG_VFIO_PCI_NVLINK2 +extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); +extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); +#else +static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) +{ + return -ENODEV; +} + +static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) +{ + return -ENODEV; +} +#endif #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index b30926e11d87..c424913324e3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -152,11 +152,12 @@ static long tce_iommu_unregister_pages(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem; struct tce_iommu_prereg *tcemem; bool found = false; + long ret; if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return 
-ENOENT; @@ -168,9 +169,13 @@ static long tce_iommu_unregister_pages(struct tce_container *container, } if (!found) - return -ENOENT; + ret = -ENOENT; + else + ret = tce_iommu_prereg_free(container, tcemem); - return tce_iommu_prereg_free(container, tcemem); + mm_iommu_put(container->mm, mem); + + return ret; } static long tce_iommu_register_pages(struct tce_container *container, @@ -185,22 +190,24 @@ static long tce_iommu_register_pages(struct tce_container *container, ((vaddr + size) < vaddr)) return -EINVAL; - mem = mm_iommu_find(container->mm, vaddr, entries); + mem = mm_iommu_get(container->mm, vaddr, entries); if (mem) { list_for_each_entry(tcemem, &container->prereg_list, next) { - if (tcemem->mem == mem) - return -EBUSY; + if (tcemem->mem == mem) { + ret = -EBUSY; + goto put_exit; + } } + } else { + ret = mm_iommu_new(container->mm, vaddr, entries, &mem); + if (ret) + return ret; } - ret = mm_iommu_get(container->mm, vaddr, entries, &mem); - if (ret) - return ret; - tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); if (!tcemem) { - mm_iommu_put(container->mm, mem); - return -ENOMEM; + ret = -ENOMEM; + goto put_exit; } tcemem->mem = mem; @@ -209,10 +216,22 @@ static long tce_iommu_register_pages(struct tce_container *container, container->enabled = true; return 0; + +put_exit: + mm_iommu_put(container->mm, mem); + return ret; } -static bool tce_page_is_contained(struct page *page, unsigned page_shift) +static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, + unsigned int page_shift) { + struct page *page; + unsigned long size = 0; + + if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) + return size == (1UL << page_shift); + + page = pfn_to_page(hpa >> PAGE_SHIFT); /* * Check that the TCE table granularity is not bigger than the size of * a page we just found. 
Otherwise the hardware can get access to @@ -371,6 +390,7 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; struct tce_iommu_group *tcegrp; + struct tce_iommu_prereg *tcemem, *tmtmp; long i; while (tce_groups_attached(container)) { @@ -393,13 +413,8 @@ static void tce_iommu_release(void *iommu_data) tce_iommu_free_table(container, tbl); } - while (!list_empty(&container->prereg_list)) { - struct tce_iommu_prereg *tcemem; - - tcemem = list_first_entry(&container->prereg_list, - struct tce_iommu_prereg, next); - WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem)); - } + list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next) + WARN_ON(tce_iommu_prereg_free(container, tcemem)); tce_iommu_disable(container); if (container->mm) @@ -492,7 +507,8 @@ static int tce_iommu_clear(struct tce_container *container, direction = DMA_NONE; oldhpa = 0; - ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); + ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, + &direction); if (ret) continue; @@ -530,7 +546,6 @@ static long tce_iommu_build(struct tce_container *container, enum dma_data_direction direction) { long i, ret = 0; - struct page *page; unsigned long hpa; enum dma_data_direction dirtmp; @@ -541,15 +556,16 @@ static long tce_iommu_build(struct tce_container *container, if (ret) break; - page = pfn_to_page(hpa >> PAGE_SHIFT); - if (!tce_page_is_contained(page, tbl->it_page_shift)) { + if (!tce_page_is_contained(container->mm, hpa, + tbl->it_page_shift)) { ret = -EPERM; break; } hpa |= offset; dirtmp = direction; - ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, + &dirtmp); if (ret) { tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", @@ -576,7 +592,6 @@ static long tce_iommu_build_v2(struct tce_container *container, enum dma_data_direction direction) { long i, ret = 0; - struct page *page; 
unsigned long hpa; enum dma_data_direction dirtmp; @@ -589,8 +604,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (ret) break; - page = pfn_to_page(hpa >> PAGE_SHIFT); - if (!tce_page_is_contained(page, tbl->it_page_shift)) { + if (!tce_page_is_contained(container->mm, hpa, + tbl->it_page_shift)) { ret = -EPERM; break; } @@ -603,7 +618,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (mm_iommu_mapped_inc(mem)) break; - ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, + &dirtmp); if (ret) { /* dirtmp cannot be DMA_NONE here */ tce_iommu_unuse_page_v2(container, tbl, entry + i); |