diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-11 16:08:54 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-11 16:08:54 -0700 |
commit | c2d95729e3094ecdd8c54e856bbe971adbbd7f48 (patch) | |
tree | 76cc5b551227d3d55d68a93105c1fe8080dfb812 | |
parent | bbda1baeeb2f4aff3addac3d086a1e56c3f2503e (diff) | |
parent | b34081f1cd59585451efaa69e1dff1b9507e6c89 (diff) | |
download | lwn-c2d95729e3094ecdd8c54e856bbe971adbbd7f48.tar.gz lwn-c2d95729e3094ecdd8c54e856bbe971adbbd7f48.zip |
Merge branch 'akpm' (patches from Andrew Morton)
Merge first patch-bomb from Andrew Morton:
- Some pidns/fork/exec tweaks
- OCFS2 updates
- Most of MM - there remain quite a few memcg parts which depend on
pending core cgroups changes. Which might have been already merged -
I'll check tomorrow...
- Various misc stuff all over the place
- A few block bits which I never got around to sending to Jens -
relatively minor things.
- MAINTAINERS maintenance
- A small number of lib/ updates
- checkpatch updates
- epoll
- firmware/dmi-scan
- Some kprobes work for S390
- drivers/rtc updates
- hfsplus feature work
- vmcore feature work
- rbtree upgrades
- AOE updates
- pktcdvd cleanups
- PPS
- memstick
- w1
- New "initmpfs" feature, which does the obvious
- More IPC work from Davidlohr.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (303 commits)
lz4: fix compression/decompression signedness mismatch
ipc: drop ipc_lock_check
ipc, shm: drop shm_lock_check
ipc: drop ipc_lock_by_ptr
ipc, shm: guard against non-existant vma in shmdt(2)
ipc: document general ipc locking scheme
ipc,msg: drop msg_unlock
ipc: rename ids->rw_mutex
ipc,shm: shorten critical region for shmat
ipc,shm: cleanup do_shmat pasta
ipc,shm: shorten critical region for shmctl
ipc,shm: make shmctl_nolock lockless
ipc,shm: introduce shmctl_nolock
ipc: drop ipcctl_pre_down
ipc,shm: shorten critical region in shmctl_down
ipc,shm: introduce lockless functions to obtain the ipc object
initmpfs: use initramfs if rootfstype= or root= specified
initmpfs: make rootfs use tmpfs when CONFIG_TMPFS enabled
initmpfs: move rootfs code from fs/ramfs/ to init/
initmpfs: move bdi setup from init_rootfs to init_ramfs
...
247 files changed, 8796 insertions, 2889 deletions
diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt index 8686e789542e..1f06daf03f5b 100644 --- a/Documentation/aoe/udev.txt +++ b/Documentation/aoe/udev.txt @@ -23,4 +23,4 @@ SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="02 SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" # aoe block devices -KERNEL=="etherd*", NAME="%k", GROUP="disk" +KERNEL=="etherd*", GROUP="disk" diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt new file mode 100644 index 000000000000..2bbf4cc40c3f --- /dev/null +++ b/Documentation/block/cmdline-partition.txt @@ -0,0 +1,39 @@ +Embedded device command line partition +===================================================================== + +Read block device partition table from command line. +The partition used for fixed block device (eMMC) embedded device. +It is no MBR, save storage space. Bootloader can be easily accessed +by absolute address of data on the block device. +Users can easily change the partition. + +The format for the command line is just like mtdparts: + +blkdevparts=<blkdev-def>[;<blkdev-def>] + <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>] + <partdef> := <size>[@<offset>](part-name) + +<blkdev-id> + block device disk name, embedded device used fixed block device, + it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0. + +<size> + partition size, in bytes, such as: 512, 1m, 1G. + +<offset> + partition start address, in bytes. + +(part-name) + partition name, kernel send uevent with "PARTNAME". application can create + a link to block device partition with the name "PARTNAME". + user space application can access partition by partition name. 
+ +Example: + eMMC disk name is "mmcblk0" and "mmcblk0boot0" + + bootargs: + 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)' + + dmesg: + mmcblk0: p1(data0) p2(data1) p3() + mmcblk0boot0: p1(boot) p2(kernel) diff --git a/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt new file mode 100644 index 000000000000..c9d3ac1477fe --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt @@ -0,0 +1,17 @@ +MOXA ART real-time clock + +Required properties: + +- compatible : Should be "moxa,moxart-rtc" +- gpio-rtc-sclk : RTC sclk gpio, with zero flags +- gpio-rtc-data : RTC data gpio, with zero flags +- gpio-rtc-reset : RTC reset gpio, with zero flags + +Example: + + rtc: rtc { + compatible = "moxa,moxart-rtc"; + gpio-rtc-sclk = <&gpio 5 0>; + gpio-rtc-data = <&gpio 6 0>; + gpio-rtc-reset = <&gpio 7 0>; + }; diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt index b47aa415c820..5a0f02d34d95 100644 --- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt +++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt @@ -1,7 +1,11 @@ TI Real Time Clock Required properties: -- compatible: "ti,da830-rtc" +- compatible: + - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family. + - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family. + This RTC IP has special WAKE-EN Register to enable + Wakeup generation for event Alarm. 
- reg: Address range of rtc register set - interrupts: rtc timer, alarm interrupts in order - interrupt-parent: phandle for the interrupt controller diff --git a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt new file mode 100644 index 000000000000..adbccc0a51e1 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt @@ -0,0 +1,33 @@ +Palmas RTC controller bindings + +Required properties: +- compatible: + - "ti,palmas-rtc" for palma series of the RTC controller +- interrupt-parent: Parent interrupt device, must be handle of palmas node. +- interrupts: Interrupt number of RTC submodule on device. + +Optional properties: + +- ti,backup-battery-chargeable: The Palmas series device like TPS65913 or + TPS80036 supports the backup battery for powering the RTC when main + battery is removed or in very low power state. The backup battery + can be chargeable or non-chargeable. This flag will tells whether + battery is chargeable or not. If charging battery then driver can + enable the charging. +- ti,backup-battery-charge-high-current: Enable high current charging in + backup battery. Device supports the < 100mA and > 100mA charging. + The high current will be > 100mA. Absence of this property will + charge battery to lower current i.e. < 100mA. + +Example: + palmas: tps65913@58 { + ... + palmas_rtc: rtc { + compatible = "ti,palmas-rtc"; + interrupt-parent = <&palmas>; + interrupts = <8 0>; + ti,backup-battery-chargeable; + ti,backup-battery-charge-high-current; + }; + ... + }; diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fcc22c982a25..823c95faebd2 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -854,16 +854,15 @@ Committed_AS: The amount of memory presently allocated on the system. 
The committed memory is a sum of all of the memory which has been allocated by processes, even if it has not been "used" by them as of yet. A process which malloc()'s 1G - of memory, but only touches 300M of it will only show up - as using 300M of memory even if it has the address space - allocated for the entire 1G. This 1G is memory which has - been "committed" to by the VM and can be used at any time - by the allocating application. With strict overcommit - enabled on the system (mode 2 in 'vm.overcommit_memory'), - allocations which would exceed the CommitLimit (detailed - above) will not be permitted. This is useful if one needs - to guarantee that processes will not fail due to lack of - memory once that memory has been successfully allocated. + of memory, but only touches 300M of it will show up as + using 1G. This 1G is memory which has been "committed" to + by the VM and can be used at any time by the allocating + application. With strict overcommit enabled on the system + (mode 2 in 'vm.overcommit_memory'),allocations which would + exceed the CommitLimit (detailed above) will not be permitted. + This is useful if one needs to guarantee that processes will + not fail due to lack of memory once that memory has been + successfully allocated. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 59b4a0962e0f..b176928e6963 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -79,6 +79,10 @@ to just make sure certain lists can't become empty. Most systems just mount another filesystem over rootfs and ignore it. The amount of space an empty instance of ramfs takes up is tiny. 
+If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by +default. To force ramfs, add "rootfstype=ramfs" to the kernel command +line. + What is initramfs? ------------------ diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ab7d16efa96b..9d4c1d18ad44 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -182,6 +182,7 @@ core_pattern is used to specify a core dumpfile pattern name. %<NUL> '%' is dropped %% output one '%' %p pid + %P global pid (init PID namespace) %u uid %g gid %d dump mode, matches PR_SET_DUMPABLE and diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 36ecc26c7433..79a797eb3e87 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -200,17 +200,25 @@ fragmentation index is <= extfrag_threshold. The default value is 500. hugepages_treat_as_movable -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. - -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. +This parameter controls whether we can allocate hugepages from ZONE_MOVABLE +or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. +ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, +so this parameter has no effect if used without kernelcore=. + +Hugepage migration is now available in some situations which depend on the +architecture and/or the hugepage size. 
If a hugepage supports migration, +allocation from ZONE_MOVABLE is always enabled for the hugepage regardless +of the value of this parameter. +IOW, this parameter affects only non-migratable hugepages. + +Assuming that hugepages are not migratable in your system, one usecase of +this parameter is that users can make hugepage pool more extensible by +enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE +page reclaim/migration/compaction work more and you can get contiguous +memory more likely. Note that using ZONE_MOVABLE for non-migratable +hugepages can do harm to other features like memory hotremove (because +memory hotremove expects that memory blocks on ZONE_MOVABLE are always +removable,) so it's a trade-off responsible for the users. ============================================================== diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 4ac359b7aa17..bdd4bb97fff7 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -165,6 +165,7 @@ which function as described above for the default huge page-sized case. Interaction of Task Memory Policy with Huge Page Allocation/Freeing +=================================================================== Whether huge pages are allocated and freed via the /proc interface or the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA @@ -229,6 +230,7 @@ resulting effect on persistent huge page allocation is as follows: of huge pages over all on-lines nodes with memory. Per Node Hugepages Attributes +============================= A subset of the contents of the root huge page control directory in sysfs, described above, will be replicated under each the system device of each @@ -258,6 +260,7 @@ applied, from which node the huge page allocation will be attempted. 
Using Huge Pages +================ If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of @@ -296,20 +299,16 @@ calls, though the mount of filesystem will be required for using mmap calls without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see map_hugetlb.c. -******************************************************************* +Examples +======== -/* - * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c - */ +1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c -******************************************************************* +2) hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c -/* - * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c - */ +3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c -******************************************************************* - -/* - * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c - */ +4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a + wide range of userspace tools to help with huge page usability, environment + setup, and control. Furthermore it provides useful test cases that should be + used when modifying code to ensure no regressions are introduced. diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt index 9a12a5956bc0..55684d11a1e8 100644 --- a/Documentation/vm/soft-dirty.txt +++ b/Documentation/vm/soft-dirty.txt @@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts both writable and soft-dirty bits on the PTE. + While in most cases tracking memory changes by #PF-s is more than enough +there is still a scenario when we can lose soft dirty bits -- a task +unmaps a previously mapped memory region and then maps a new one at exactly +the same place. 
When unmap is called, the kernel internally clears PTE values +including soft dirty bits. To notify user space application about such +memory region renewal the kernel always marks new memory regions (and +expanded regions) as soft dirty. This feature is actively used by the checkpoint-restore project. You can find more details about it on http://criu.org diff --git a/MAINTAINERS b/MAINTAINERS index be70759e51c5..e61c2e83fc2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1028,7 +1028,7 @@ F: arch/arm/mach-orion5x/ts78xx-* ARM/MICREL KS8695 ARCHITECTURE M: Greg Ungerer <gerg@uclinux.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -F: arch/arm/mach-ks8695 +F: arch/arm/mach-ks8695/ S: Odd Fixes ARM/MIOA701 MACHINE SUPPORT @@ -1048,7 +1048,6 @@ M: STEricsson <STEricsson_nomadik_linux@list.st.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-nomadik/ -F: arch/arm/plat-nomadik/ F: drivers/i2c/busses/i2c-nomadik.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git @@ -1070,7 +1069,7 @@ F: drivers/mmc/host/msm_sdcc.h F: drivers/tty/serial/msm_serial.h F: drivers/tty/serial/msm_serial.c F: drivers/*/pm8???-* -F: drivers/ssbi/ +F: drivers/mfd/ssbi/ F: include/linux/mfd/pm8xxx/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/davidb/linux-msm.git S: Maintained @@ -1156,7 +1155,6 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/plat-samsung/ -F: arch/arm/plat-s3c24xx/ F: arch/arm/mach-s3c24*/ F: arch/arm/mach-s3c64xx/ F: drivers/*/*s3c2410* @@ -1179,8 +1177,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5pv210/mach-aquila.c F: arch/arm/mach-s5pv210/mach-goni.c -F: arch/arm/mach-exynos/mach-universal_c210.c -F: arch/arm/mach-exynos/mach-nuri.c ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT M: Kyungmin 
Park <kyungmin.park@samsung.com> @@ -1325,7 +1321,7 @@ F: drivers/mmc/host/wmt-sdmmc.c F: drivers/pwm/pwm-vt8500.c F: drivers/rtc/rtc-vt8500.c F: drivers/tty/serial/vt8500_serial.c -F: drivers/usb/host/ehci-vt8500.c +F: drivers/usb/host/ehci-platform.c F: drivers/usb/host/uhci-platform.c F: drivers/video/vt8500lcdfb.* F: drivers/video/wm8505fb* @@ -1815,6 +1811,17 @@ L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2x/ +BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE +M: Christian Daudt <csd@broadcom.com> +T: git git://git.github.com/broadcom/bcm11351 +S: Maintained +F: arch/arm/mach-bcm/ +F: arch/arm/boot/dts/bcm113* +F: arch/arm/boot/dts/bcm281* +F: arch/arm/configs/bcm_defconfig +F: drivers/mmc/host/sdhci_bcm_kona.c +F: drivers/clocksource/bcm_kona_timer.c + BROADCOM BCM2835 ARM ARCHICTURE M: Stephen Warren <swarren@wwwdotorg.org> L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers) @@ -2035,10 +2042,10 @@ W: http://ceph.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt -F: fs/ceph -F: net/ceph -F: include/linux/ceph -F: include/linux/crush +F: fs/ceph/ +F: net/ceph/ +F: include/linux/ceph/ +F: include/linux/crush/ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: L: linux-usb@vger.kernel.org @@ -2335,7 +2342,7 @@ CPU POWER MONITORING SUBSYSTEM M: Dominik Brodowski <linux@dominikbrodowski.net> M: Thomas Renninger <trenn@suse.de> S: Maintained -F: tools/power/cpupower +F: tools/power/cpupower/ CPUSETS M: Li Zefan <lizefan@huawei.com> @@ -2773,7 +2780,7 @@ L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~danvet/drm-intel S: Supported -F: drivers/gpu/drm/i915 +F: drivers/gpu/drm/i915/ F: include/drm/i915* F: include/uapi/drm/i915* @@ -2785,7 +2792,7 @@ M: Kyungmin Park <kyungmin.park@samsung.com> L: dri-devel@lists.freedesktop.org T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git S: Supported -F: drivers/gpu/drm/exynos +F: drivers/gpu/drm/exynos/ F: include/drm/exynos* F: include/uapi/drm/exynos* @@ -3038,7 +3045,7 @@ M: Mauro Carvalho Chehab <m.chehab@samsung.com> L: linux-edac@vger.kernel.org W: bluesmoke.sourceforge.net S: Maintained -F: drivers/edac/ghes-edac.c +F: drivers/edac/ghes_edac.c EDAC-I82443BXGX M: Tim Small <tim@buttersideup.com> @@ -3644,8 +3651,8 @@ M: Arnd Bergmann <arnd@arndb.de> L: linux-arch@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git S: Maintained -F: include/asm-generic -F: include/uapi/asm-generic +F: include/asm-generic/ +F: include/uapi/asm-generic/ GENERIC UIO DRIVER FOR PCI DEVICES M: "Michael S. Tsirkin" <mst@redhat.com> @@ -3687,7 +3694,8 @@ GRE DEMULTIPLEXER DRIVER M: Dmitry Kozlov <xeb@mail.ru> L: netdev@vger.kernel.org S: Maintained -F: net/ipv4/gre.c +F: net/ipv4/gre_demux.c +F: net/ipv4/gre_offload.c F: include/net/gre.h GRETH 10/100/1G Ethernet MAC device driver @@ -3765,7 +3773,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/usb/hdpvr +F: drivers/media/usb/hdpvr/ HWPOISON MEMORY FAILURE HANDLING M: Andi Kleen <andi@firstfloor.org> @@ -4574,7 +4582,7 @@ S: Supported W: http://www.openfabrics.org W: www.open-iscsi.org Q: http://patchwork.kernel.org/project/linux-rdma/list/ -F: drivers/infiniband/ulp/iser +F: drivers/infiniband/ulp/iser/ ISDN SUBSYSTEM M: Karsten Keil <isdn@linux-pingi.de> @@ -4628,7 +4636,7 @@ W: http://palosaari.fi/linux/ Q: http://patchwork.linuxtv.org/project/linux-media/list/ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained -F: drivers/media/tuners/it913x* +F: drivers/media/tuners/tuner_it913x* IVTV VIDEO4LINUX DRIVER M: Andy Walls <awalls@md.metrocast.net> @@ -5964,15 +5972,12 @@ S: Maintained F: arch/arm/*omap*/*pm* F: drivers/cpufreq/omap-cpufreq.c -OMAP POWERDOMAIN/CLOCKDOMAIN 
SOC ADAPTATION LAYER SUPPORT +OMAP POWERDOMAIN SOC ADAPTATION LAYER SUPPORT M: Rajendra Nayak <rnayak@ti.com> M: Paul Walmsley <paul@pwsan.com> L: linux-omap@vger.kernel.org S: Maintained -F: arch/arm/mach-omap2/powerdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/powerdomain44xx.c -F: arch/arm/mach-omap2/clockdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/clockdomain44xx.c +F: arch/arm/mach-omap2/prm* OMAP AUDIO SUPPORT M: Peter Ujfalusi <peter.ujfalusi@ti.com> @@ -6138,7 +6143,7 @@ W: http://openrisc.net L: linux@lists.openrisc.net (moderated for non-subscribers) S: Maintained T: git git://openrisc.net/~jonas/linux -F: arch/openrisc +F: arch/openrisc/ OPENVSWITCH M: Jesse Gross <jesse@nicira.com> @@ -6429,7 +6434,7 @@ M: Jamie Iles <jamie@jamieiles.com> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://github.com/jamieiles/linux-2.6-ji.git S: Supported -F: arch/arm/mach-picoxcell +F: arch/arm/mach-picoxcell/ F: drivers/*/picoxcell* F: drivers/*/*/picoxcell* @@ -6702,7 +6707,7 @@ F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* -F: sound/soc/pxa +F: sound/soc/pxa/ MMP SUPPORT M: Eric Miao <eric.y.miao@gmail.com> @@ -7155,7 +7160,7 @@ SAMSUNG AUDIO (ASoC) DRIVERS M: Sangbeom Kim <sbkim73@samsung.com> L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported -F: sound/soc/samsung +F: sound/soc/samsung/ SAMSUNG FRAMEBUFFER DRIVER M: Jingoo Han <jg1.han@samsung.com> @@ -7201,7 +7206,7 @@ SERIAL DRIVERS M: Greg Kroah-Hartman <gregkh@linuxfoundation.org> L: linux-serial@vger.kernel.org S: Maintained -F: drivers/tty/serial +F: drivers/tty/serial/ SYNOPSYS DESIGNWARE DMAC DRIVER M: Viresh Kumar <viresh.linux@gmail.com> @@ -7236,7 +7241,7 @@ TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie <shijie8@gmail.com> M: Hans Verkuil <hverkuil@xs4all.nl> S: Odd Fixes -F: drivers/media/usb/tlg2300 +F: drivers/media/usb/tlg2300/ SC1200 WDT DRIVER M: Zwane Mwaikambo <zwane@arm.linux.org.uk> @@ 
-7497,7 +7502,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/radio/radio-si4713.h +F: drivers/media/radio/radio-si4713.c SIANO DVB DRIVER M: Mauro Carvalho Chehab <m.chehab@samsung.com> @@ -7506,9 +7511,9 @@ W: http://linuxtv.org T: git git://linuxtv.org/media_tree.git S: Odd fixes F: drivers/media/common/siano/ -F: drivers/media/dvb/siano/ F: drivers/media/usb/siano/ -F: drivers/media/mmc/siano +F: drivers/media/usb/siano/ +F: drivers/media/mmc/siano/ SH_VEU V4L2 MEM2MEM DRIVER M: Guennadi Liakhovetski <g.liakhovetski@gmx.de> @@ -7546,9 +7551,9 @@ P: Vincent Sanders <vince@simtec.co.uk> M: Simtec Linux Team <linux@simtec.co.uk> W: http://www.simtec.co.uk/products/EB2410ITX/ S: Supported -F: arch/arm/mach-s3c2410/mach-bast.c -F: arch/arm/mach-s3c2410/bast-ide.c -F: arch/arm/mach-s3c2410/bast-irq.c +F: arch/arm/mach-s3c24xx/mach-bast.c +F: arch/arm/mach-s3c24xx/bast-ide.c +F: arch/arm/mach-s3c24xx/bast-irq.c TI DAVINCI MACHINE SUPPORT M: Sekhar Nori <nsekhar@ti.com> @@ -7557,7 +7562,7 @@ L: davinci-linux-open-source@linux.davincidsp.com (moderated for non-subscribers T: git git://gitorious.org/linux-davinci/linux-davinci.git Q: http://patchwork.kernel.org/project/linux-davinci/list/ S: Supported -F: arch/arm/mach-davinci +F: arch/arm/mach-davinci/ F: drivers/i2c/busses/i2c-davinci.c TI DAVINCI SERIES MEDIA DRIVER @@ -7642,7 +7647,7 @@ SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus <sakari.ailus@iki.fi> L: linux-media@vger.kernel.org S: Maintained -F: drivers/media/i2c/smiapp +F: drivers/media/i2c/smiapp/ F: include/media/smiapp.h F: drivers/media/i2c/smiapp-pll.c F: drivers/media/i2c/smiapp-pll.h @@ -7745,6 +7750,11 @@ W: http://tifmxx.berlios.de/ S: Maintained F: drivers/memstick/host/tifm_ms.c +SONY MEMORYSTICK STANDARD SUPPORT +M: Maxim Levitsky <maximlevitsky@gmail.com> +S: Maintained +F: drivers/memstick/core/ms_block.* + SOUND M: Jaroslav Kysela <perex@perex.cz> M: Takashi 
Iwai <tiwai@suse.de> @@ -7821,35 +7831,7 @@ L: spear-devel@list.st.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.st.com/spear S: Maintained -F: arch/arm/plat-spear/ - -SPEAR13XX MACHINE SUPPORT -M: Viresh Kumar <viresh.linux@gmail.com> -M: Shiraz Hashim <shiraz.hashim@st.com> -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear13xx/ - -SPEAR3XX MACHINE SUPPORT -M: Viresh Kumar <viresh.linux@gmail.com> -M: Shiraz Hashim <shiraz.hashim@st.com> -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear3xx/ - -SPEAR6XX MACHINE SUPPORT -M: Rajeev Kumar <rajeev-dlh.kumar@st.com> -M: Shiraz Hashim <shiraz.hashim@st.com> -M: Viresh Kumar <viresh.linux@gmail.com> -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear6xx/ +F: arch/arm/mach-spear/ SPEAR CLOCK FRAMEWORK SUPPORT M: Viresh Kumar <viresh.linux@gmail.com> @@ -8118,7 +8100,7 @@ M: Vineet Gupta <vgupta@synopsys.com> S: Supported F: arch/arc/ F: Documentation/devicetree/bindings/arc/ -F: drivers/tty/serial/arc-uart.c +F: drivers/tty/serial/arc_uart.c SYSV FILESYSTEM M: Christoph Hellwig <hch@infradead.org> @@ -8808,7 +8790,6 @@ L: linux-usb@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/phy/ -F: drivers/usb/otg/ USB PRINTER DRIVER (usblp) M: Pete Zaitcev <zaitcev@redhat.com> @@ -9339,7 +9320,7 @@ M: Matthew Garrett <matthew.garrett@nebula.com> L: platform-driver-x86@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git S: Maintained -F: drivers/platform/x86 +F: drivers/platform/x86/ X86 MCE INFRASTRUCTURE M: Tony Luck 
<tony.luck@intel.com> diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 40736da9bea8..ffb19b7da999 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -338,6 +338,11 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, unsigned long doff = 7 & (unsigned long) dst; if (len) { + if (!access_ok(VERIFY_READ, src, len)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } if (!doff) { if (!soff) checksum = csum_partial_cfu_aligned( diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..54ee6163c181 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -56,3 +56,8 @@ int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); } + +int pmd_huge_support(void) +{ + return 1; +} diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2fc8258bab2d..5e9aec358306 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -54,6 +54,11 @@ int pud_huge(pud_t pud) return !(pud_val(pud) & PUD_TABLE_BIT); } +int pmd_huge_support(void) +{ + return 1; +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..68232db98baa 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,6 +114,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c52fa6d0f8e..042431509b56 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -110,6 +110,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long 
address, pmd_t *pmd, int write) { diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index a7fee0dfb7a9..01fda4419ed0 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -85,6 +85,11 @@ int pud_huge(pud_t pud) return (pud_val(pud) & _PAGE_HUGE) != 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 834ca8eb38f2..d67db4bd672d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,6 +86,11 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } + +int pmd_huge_support(void) +{ + return 1; +} #else int pmd_huge(pmd_t pmd) { @@ -101,6 +106,11 @@ int pgd_huge(pgd_t pgd) { return 0; } + +int pmd_huge_support(void) +{ + return 0; +} #endif pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fb2723e8ba65..3ec272859e1e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -526,6 +526,7 @@ config CRASH_DUMP bool "kernel crash dumps" depends on 64BIT && SMP select KEXEC + select ZFCPDUMP help Generate crash dump after being started by kexec. Crash dump kernels are loaded in the main kernel with kexec-tools @@ -536,7 +537,7 @@ config CRASH_DUMP config ZFCPDUMP def_bool n prompt "zfcpdump support" - select SMP + depends on SMP help Select this option if you want to build an zfcpdump enabled kernel. Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this. 
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index dcf6948a875c..4176dfe0fba1 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -31,6 +31,8 @@ #include <linux/ptrace.h> #include <linux/percpu.h> +#define __ARCH_WANT_KPROBES_INSN_SLOT + struct pt_regs; struct kprobe; @@ -57,7 +59,7 @@ typedef u16 kprobe_opcode_t; /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ - kprobe_opcode_t insn[MAX_INSN_SIZE]; + kprobe_opcode_t *insn; }; struct prev_kprobe { diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 06a136136047..7dc7f9c63b65 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -56,5 +56,6 @@ bool sclp_has_linemode(void); bool sclp_has_vt220(void); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d8f355657171..c84f33d51f7b 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -16,6 +16,7 @@ #include <asm/os_info.h> #include <asm/elf.h> #include <asm/ipl.h> +#include <asm/sclp.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -64,22 +65,46 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize) } /* - * Copy one page from "oldmem" + * Pointer to ELF header in new kernel + */ +static void *elfcorehdr_newmem; + +/* + * Copy one page from zfcpdump "oldmem" + * + * For pages below ZFCPDUMP_HSA_SIZE memory from the HSA is copied. Otherwise + * real memory copy is used. 
+ */ +static ssize_t copy_oldmem_page_zfcpdump(char *buf, size_t csize, + unsigned long src, int userbuf) +{ + int rc; + + if (src < ZFCPDUMP_HSA_SIZE) { + rc = memcpy_hsa(buf, src, csize, userbuf); + } else { + if (userbuf) + rc = copy_to_user_real((void __force __user *) buf, + (void *) src, csize); + else + rc = memcpy_real(buf, (void *) src, csize); + } + return rc ? rc : csize; +} + +/* + * Copy one page from kdump "oldmem" * * For the kdump reserved memory this functions performs a swap operation: * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE]. * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t copy_oldmem_page_kdump(char *buf, size_t csize, + unsigned long src, int userbuf) + { - unsigned long src; int rc; - if (!csize) - return 0; - - src = (pfn << PAGE_SHIFT) + offset; if (src < OLDMEM_SIZE) src += OLDMEM_BASE; else if (src > OLDMEM_BASE && @@ -90,7 +115,88 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, (void *) src, csize); else rc = copy_page_real(buf, (void *) src, csize); - return (rc == 0) ? csize : rc; + return (rc == 0) ? 
rc : csize; +} + +/* + * Copy one page from "oldmem" + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + unsigned long src; + + if (!csize) + return 0; + src = (pfn << PAGE_SHIFT) + offset; + if (OLDMEM_BASE) + return copy_oldmem_page_kdump(buf, csize, src, userbuf); + else + return copy_oldmem_page_zfcpdump(buf, csize, src, userbuf); +} + +/* + * Remap "oldmem" for kdump + * + * For the kdump reserved memory this functions performs a swap operation: + * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_old; + int rc; + + if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { + size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + rc = remap_pfn_range(vma, from, + pfn + (OLDMEM_BASE >> PAGE_SHIFT), + size_old, prot); + if (rc || size == size_old) + return rc; + size -= size_old; + from += size_old; + pfn += size_old >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for zfcpdump + * + * We only map available memory above ZFCPDUMP_HSA_SIZE. Memory below + * ZFCPDUMP_HSA_SIZE is read on demand using the copy_oldmem_page() function. 
+ */ +static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_hsa; + + if (pfn < ZFCPDUMP_HSA_SIZE >> PAGE_SHIFT) { + size_hsa = min(size, ZFCPDUMP_HSA_SIZE - (pfn << PAGE_SHIFT)); + if (size == size_hsa) + return 0; + size -= size_hsa; + from += size_hsa; + pfn += size_hsa >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for kdump or zfcpdump + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (OLDMEM_BASE) + return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, + prot); } /* @@ -101,11 +207,21 @@ int copy_from_oldmem(void *dest, void *src, size_t count) unsigned long copied = 0; int rc; - if ((unsigned long) src < OLDMEM_SIZE) { - copied = min(count, OLDMEM_SIZE - (unsigned long) src); - rc = memcpy_real(dest, src + OLDMEM_BASE, copied); - if (rc) - return rc; + if (OLDMEM_BASE) { + if ((unsigned long) src < OLDMEM_SIZE) { + copied = min(count, OLDMEM_SIZE - (unsigned long) src); + rc = memcpy_real(dest, src + OLDMEM_BASE, copied); + if (rc) + return rc; + } + } else { + if ((unsigned long) src < ZFCPDUMP_HSA_SIZE) { + copied = min(count, + ZFCPDUMP_HSA_SIZE - (unsigned long) src); + rc = memcpy_hsa(dest, (unsigned long) src, copied, 0); + if (rc) + return rc; + } } return memcpy_real(dest + copied, src + copied, count - copied); } @@ -368,14 +484,6 @@ static int get_mem_chunk_cnt(void) } /* - * Relocate pointer in order to allow vmcore code access the data - */ -static inline unsigned long relocate(unsigned long addr) -{ - return OLDMEM_BASE + addr; -} - -/* * Initialize ELF loads (new kernel) */ static int loads_init(Elf64_Phdr *phdr, u64 loads_offset) @@ -426,7 +534,7 @@ static void *notes_init(Elf64_Phdr *phdr, void 
*ptr, u64 notes_offset) ptr = nt_vmcoreinfo(ptr); memset(phdr, 0, sizeof(*phdr)); phdr->p_type = PT_NOTE; - phdr->p_offset = relocate(notes_offset); + phdr->p_offset = notes_offset; phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); phdr->p_memsz = phdr->p_filesz; return ptr; @@ -435,7 +543,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) /* * Create ELF core header (new kernel) */ -static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) +int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; int mem_chunk_cnt; @@ -443,6 +551,12 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) u32 alloc_size; u64 hdr_off; + /* If we are not in kdump or zfcpdump mode return */ + if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) + return 0; + /* If elfcorehdr= has been passed via cmdline, we use that one */ + if (elfcorehdr_addr != ELFCORE_ADDR_MAX) + return 0; mem_chunk_cnt = get_mem_chunk_cnt(); alloc_size = 0x1000 + get_cpu_cnt() * 0x300 + @@ -460,27 +574,52 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); - *elfcorebuf_sz = hdr_off; - *elfcorebuf = (void *) relocate((unsigned long) hdr); - BUG_ON(*elfcorebuf_sz > alloc_size); + loads_init(phdr_loads, hdr_off); + *addr = (unsigned long long) hdr; + elfcorehdr_newmem = hdr; + *size = (unsigned long long) hdr_off; + BUG_ON(elfcorehdr_size > alloc_size); + return 0; } /* - * Create kdump ELF core header in new kernel, if it has not been passed via - * the "elfcorehdr" kernel parameter + * Free ELF core header (new kernel) */ -static int setup_kdump_elfcorehdr(void) +void elfcorehdr_free(unsigned long long addr) { - size_t elfcorebuf_sz; - char *elfcorebuf; + if (!elfcorehdr_newmem) + 
return; + kfree((void *)(unsigned long)addr); +} - if (!OLDMEM_BASE || is_kdump_kernel()) - return -EINVAL; - s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz); - elfcorehdr_addr = (unsigned long long) elfcorebuf; - elfcorehdr_size = elfcorebuf_sz; - return 0; +/* + * Read from ELF header + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + src = elfcorehdr_newmem ? src : src - OLDMEM_BASE; + memcpy(buf, src, count); + *ppos += count; + return count; } -subsys_initcall(setup_kdump_elfcorehdr); +/* + * Read from ELF notes data + */ +ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + int rc; + + if (elfcorehdr_newmem) { + memcpy(buf, src, count); + } else { + rc = copy_from_oldmem(buf, src, count); + if (rc) + return rc; + } + *ppos += count; + return count; +} diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index adbbe7f1cb0d..0ce9fb245034 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -37,6 +37,26 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; +DEFINE_INSN_CACHE_OPS(dmainsn); + +static void *alloc_dmainsn_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | GFP_DMA); +} + +static void free_dmainsn_page(void *page) +{ + free_page((unsigned long)page); +} + +struct kprobe_insn_cache kprobe_dmainsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), + .alloc = alloc_dmainsn_page, + .free = free_dmainsn_page, + .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), + .insn_size = MAX_INSN_SIZE, +}; + static int __kprobes is_prohibited_opcode(kprobe_opcode_t *insn) { switch (insn[0] >> 8) { @@ -100,9 +120,8 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) fixup |= FIXUP_RETURN_REGISTER; break; case 0xc0: - if ((insn[0] & 0x0f) == 0x00 || /* larl */ - (insn[0] & 0x0f) == 0x05) /* brasl */ - 
fixup |= FIXUP_RETURN_REGISTER; + if ((insn[0] & 0x0f) == 0x05) /* brasl */ + fixup |= FIXUP_RETURN_REGISTER; break; case 0xeb: switch (insn[2] & 0xff) { @@ -134,18 +153,128 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) return fixup; } +static int __kprobes is_insn_relative_long(kprobe_opcode_t *insn) +{ + /* Check if we have a RIL-b or RIL-c format instruction which + * we need to modify in order to avoid instruction emulation. */ + switch (insn[0] >> 8) { + case 0xc0: + if ((insn[0] & 0x0f) == 0x00) /* larl */ + return true; + break; + case 0xc4: + switch (insn[0] & 0x0f) { + case 0x02: /* llhrl */ + case 0x04: /* lghrl */ + case 0x05: /* lhrl */ + case 0x06: /* llghrl */ + case 0x07: /* sthrl */ + case 0x08: /* lgrl */ + case 0x0b: /* stgrl */ + case 0x0c: /* lgfrl */ + case 0x0d: /* lrl */ + case 0x0e: /* llgfrl */ + case 0x0f: /* strl */ + return true; + } + break; + case 0xc6: + switch (insn[0] & 0x0f) { + case 0x00: /* exrl */ + case 0x02: /* pfdrl */ + case 0x04: /* cghrl */ + case 0x05: /* chrl */ + case 0x06: /* clghrl */ + case 0x07: /* clhrl */ + case 0x08: /* cgrl */ + case 0x0a: /* clgrl */ + case 0x0c: /* cgfrl */ + case 0x0d: /* crl */ + case 0x0e: /* clgfrl */ + case 0x0f: /* clrl */ + return true; + } + break; + } + return false; +} + +static void __kprobes copy_instruction(struct kprobe *p) +{ + s64 disp, new_disp; + u64 addr, new_addr; + + memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); + if (!is_insn_relative_long(p->ainsn.insn)) + return; + /* + * For pc-relative instructions in RIL-b or RIL-c format patch the + * RI2 displacement field. We have already made sure that the insn + * slot for the patched instruction is within the same 2GB area + * as the original instruction (either kernel image or module area). + * Therefore the new displacement will always fit. 
+ */ + disp = *(s32 *)&p->ainsn.insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&p->ainsn.insn[1] = new_disp; +} + +static inline int is_kernel_addr(void *addr) +{ + return addr < (void *)_end; +} + +static inline int is_module_addr(void *addr) +{ +#ifdef CONFIG_64BIT + BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); + if (addr < (void *)MODULES_VADDR) + return 0; + if (addr > (void *)MODULES_END) + return 0; +#endif + return 1; +} + +static int __kprobes s390_get_insn_slot(struct kprobe *p) +{ + /* + * Get an insn slot that is within the same 2GB area like the original + * instruction. That way instructions with a 32bit signed displacement + * field can be patched and executed within the insn slot. + */ + p->ainsn.insn = NULL; + if (is_kernel_addr(p->addr)) + p->ainsn.insn = get_dmainsn_slot(); + if (is_module_addr(p->addr)) + p->ainsn.insn = get_insn_slot(); + return p->ainsn.insn ? 0 : -ENOMEM; +} + +static void __kprobes s390_free_insn_slot(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + if (is_kernel_addr(p->addr)) + free_dmainsn_slot(p->ainsn.insn, 0); + else + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { if ((unsigned long) p->addr & 0x01) return -EINVAL; - /* Make sure the probe isn't going on a difficult instruction */ if (is_prohibited_opcode(p->addr)) return -EINVAL; - + if (s390_get_insn_slot(p)) + return -ENOMEM; p->opcode = *p->addr; - memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); - + copy_instruction(p); return 0; } @@ -186,6 +315,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { + s390_free_insn_slot(p); } static void __kprobes enable_singlestep(struct kprobe_ctlblk *kcb, diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 248445f92604..d261c62e40a6 100644 --- 
a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -223,6 +223,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int write) { diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..0d676a41081e 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -83,6 +83,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 3d0ddbc005fe..71368850dfc0 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -169,10 +169,10 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(u_handler); - ret |= __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); + ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); sigset_from_compat(&new_ka.sa.sa_mask, &set32); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - ret |= __get_user(u_restorer, &act->sa_restorer); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(u_restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(u_restorer); if (ret) return -EFAULT; @@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (!ret && oact) { sigset_to_compat(&set32, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - ret |= __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); + ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); + ret 
|= put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); if (ret) ret = -EFAULT; } diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d2b59441ebdd..96399646570a 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -234,6 +234,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index e514899e1100..0cb3bbaa580c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -166,6 +166,11 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8d16befdec88..3d1999458709 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - -static inline int pte_swp_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; -} - -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - static inline pte_t pte_file_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <linux/mmdebug.h> #include <linux/log2.h> static inline int pte_none(pte_t pte) @@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } 
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..0ecac257fb26 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -75,6 +75,9 @@ * with swap entry format. On x86 bits 6 and 7 are *not* involved * into swap entry computation, but bit 6 is used for nonlinear * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf512003e663..e6d90babc245 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. 
+ */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d4cdfa67509e..ce2d0a2c3e4f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 7e73e8c69096..9d980d88b747 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } 
+int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -77,6 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } +int pmd_huge_support(void) +{ + return 1; +} #endif /* x86_64 also uses this file */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7e..ae699b3bbac8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? 
tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/block/Kconfig b/block/Kconfig index a7e40a7c8214..7f38e40fee08 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Parsing command line, get the partitions information. 
+ menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 39b76ba66ffd..4fa4be544ece 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 4464c823cff2..46cd7bd18b34 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, if (!icq) return NULL; - if (radix_tree_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5efc5a647183..3aa5b195f4dd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -29,7 +29,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count) int err; unsigned long v; - err = strict_strtoul(page, 10, &v); + err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 000000000000..cc2637f8674e --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,250 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong <caizhiyong@huawei.com> + * + */ +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/cmdline-parser.h> + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < 
(sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? 
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. 
+ */ +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) + +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } +} diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7e5d474dc6ba..fbd5a67cb773 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 4cebb2f0d2f4..87a32086535d 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -260,3 +260,10 @@ config SYSV68_PARTITION partition table format used by Motorola Delta machines (using sysv68). Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select CMDLINE_PARSER + help + Say Y here if you would read the partitions table from bootargs. + The format for the command line is just like mtdparts. 
diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2be4d7ba4e3a..37a95270503c 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/check.c b/block/partitions/check.c index 19ba207ea7d1..9ac1df74f699 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -34,6 +34,7 @@ #include "efi.h" #include "karma.h" #include "sysv68.h" +#include "cmdline.h" int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ @@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = { adfspart_check_ADFS, #endif +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 000000000000..56cf4ffad51e --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong <caizhiyong@huawei.com> + * + * Read block device partition table from command line. + * The partition used for fixed block device (eMMC) embedded device. + * It is no MBR, save storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. 
+ * + * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * + */ + +#include <linux/cmdline-parser.h> + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. 
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 000000000000..26e0f8da1414 --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index c85fc895ecdb..1a5ec9a03c00 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -25,6 +25,9 @@ * TODO: * * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com> + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. 
@@ -149,34 +152,80 @@ static u64 last_lba(struct block_device *bdev) bdev_logical_block_size(bdev)) - 1ULL; } -static inline int -pmbr_part_valid(struct partition *part) +static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; - return 0; + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + return GPT_MBR_PROTECTIVE; +invalid: + return 0; } /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. */ -static int -is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { - int i; + int i, part = 0, ret = 0; /* invalid by default */ + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + part = i; + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. 
+ */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * + * Hybrid MBRs do not necessarily comply with this. + */ + if (ret == GPT_MBR_PROTECTIVE) { + if (le32_to_cpu(mbr->partition_record[part].size_in_lba) != + min((uint32_t) total_sectors - 1, 0xFFFFFFFF)) + ret = 0; + } +done: + return ret; } /** @@ -243,8 +292,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { + (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; @@ -364,7 +412,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, (unsigned long long)lastlba); goto fail; } - + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partitition Entry Size check failed.\n"); @@ -429,44 +482,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header LBA != Alt. 
header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " + pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); @@ -474,8 +525,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if 
(le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " + pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); @@ -483,34 +533,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " + pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header thinks Alt. 
header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } @@ -536,6 +582,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; u64 lastlba; if (!ptes) @@ -543,17 +590,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, lastlba = last_lba(state->bdev); if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? 
+ "protective" : "hybrid"); + } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); @@ -576,11 +628,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = pptes; kfree(agpt); kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } + if (!good_agpt) + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -588,8 +637,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } @@ -651,8 +699,7 @@ int efi_partition(struct parsed_partitions *state) put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; diff --git a/block/partitions/efi.h b/block/partitions/efi.h index b69ab729558f..4efcafba7e64 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -37,6 +37,9 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 @@ -101,11 +104,25 @@ typedef struct _gpt_entry { efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; } __attribute__ ((packed)) gpt_entry; +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and 
legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + typedef struct _legacy_mbr { u8 boot_code[440]; __le32 unique_mbr_signature; __le16 unknown; - struct partition partition_record[4]; + gpt_mbr_record partition_record[4]; __le16 signature; } __attribute__ ((packed)) legacy_mbr; @@ -113,22 +130,3 @@ typedef struct _legacy_mbr { extern int efi_partition(struct parsed_partitions *state); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 025c41d3cb33..14a9d1912318 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,5 +1,5 @@ /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. 
*/ -#define VERSION "83" +#define VERSION "85" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" @@ -169,6 +169,7 @@ struct aoedev { ulong ref; struct work_struct work;/* disk create work struct */ struct gendisk *gd; + struct dentry *debugfs; struct request_queue *blkq; struct hd_geometry geo; sector_t ssize; @@ -206,6 +207,7 @@ struct ktstate { int aoeblk_init(void); void aoeblk_exit(void); void aoeblk_gdalloc(void *); +void aoedisk_rm_debugfs(struct aoedev *d); void aoedisk_rm_sysfs(struct aoedev *d); int aoechr_init(void); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 916d9ed5c8aa..dd73e1ff1759 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoeblk.c * block device routines @@ -17,11 +17,13 @@ #include <linux/mutex.h> #include <linux/export.h> #include <linux/moduleparam.h> +#include <linux/debugfs.h> #include <scsi/sg.h> #include "aoe.h" static DEFINE_MUTEX(aoeblk_mutex); static struct kmem_cache *buf_pool_cache; +static struct dentry *aoe_debugfs_dir; /* GPFS needs a larger value than the default. 
*/ static int aoe_maxsectors; @@ -108,6 +110,55 @@ static ssize_t aoedisk_show_payload(struct device *dev, return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); } +static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +{ + struct aoedev *d; + struct aoetgt **t, **te; + struct aoeif *ifp, *ife; + unsigned long flags; + char c; + + d = s->private; + seq_printf(s, "rttavg: %d rttdev: %d\n", + d->rttavg >> RTTSCALE, + d->rttdev >> RTTDSCALE); + seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool)); + seq_printf(s, "kicked: %ld\n", d->kicked); + seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt); + seq_printf(s, "ref: %ld\n", d->ref); + + spin_lock_irqsave(&d->lock, flags); + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) { + c = '\t'; + seq_printf(s, "falloc: %ld\n", (*t)->falloc); + seq_printf(s, "ffree: %p\n", + list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next); + seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout, + (*t)->maxout, (*t)->nframes); + seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh); + seq_printf(s, "\ttaint:%d\n", (*t)->taint); + seq_printf(s, "\tr:%d\n", (*t)->rpkts); + seq_printf(s, "\tw:%d\n", (*t)->wpkts); + ifp = (*t)->ifs; + ife = ifp + ARRAY_SIZE((*t)->ifs); + for (; ifp->nd && ifp < ife; ifp++) { + seq_printf(s, "%c%s", c, ifp->nd->name); + c = ','; + } + seq_puts(s, "\n"); + } + spin_unlock_irqrestore(&d->lock, flags); + + return 0; +} + +static int aoe_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, aoedisk_debugfs_show, inode->i_private); +} + static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); @@ -130,6 +181,44 @@ static const struct attribute_group attr_group = { .attrs = aoe_attrs, }; +static const struct file_operations aoe_debugfs_fops = { + .open = aoe_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = 
single_release, +}; + +static void +aoedisk_add_debugfs(struct aoedev *d) +{ + struct dentry *entry; + char *p; + + if (aoe_debugfs_dir == NULL) + return; + p = strchr(d->gd->disk_name, '/'); + if (p == NULL) + p = d->gd->disk_name; + else + p++; + BUG_ON(*p == '\0'); + entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); + if (IS_ERR_OR_NULL(entry)) { + pr_info("aoe: cannot create debugfs file for %s\n", + d->gd->disk_name); + return; + } + BUG_ON(d->debugfs); + d->debugfs = entry; +} +void +aoedisk_rm_debugfs(struct aoedev *d) +{ + debugfs_remove(d->debugfs); + d->debugfs = NULL; +} + static int aoedisk_add_sysfs(struct aoedev *d) { @@ -330,6 +419,7 @@ aoeblk_gdalloc(void *vp) add_disk(gd); aoedisk_add_sysfs(d); + aoedisk_add_debugfs(d); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); @@ -351,6 +441,8 @@ err: void aoeblk_exit(void) { + debugfs_remove_recursive(aoe_debugfs_dir); + aoe_debugfs_dir = NULL; kmem_cache_destroy(buf_pool_cache); } @@ -362,7 +454,11 @@ aoeblk_init(void) 0, 0, NULL); if (buf_pool_cache == NULL) return -ENOMEM; - + aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); + if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { + pr_info("aoe: cannot create debugfs directory\n"); + aoe_debugfs_dir = NULL; + } return 0; } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 4d45dba7fb8f..d2515435e23f 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -380,7 +380,6 @@ aoecmd_ata_rw(struct aoedev *d) { struct frame *f; struct buf *buf; - struct aoetgt *t; struct sk_buff *skb; struct sk_buff_head queue; ulong bcnt, fbcnt; @@ -391,7 +390,6 @@ aoecmd_ata_rw(struct aoedev *d) f = newframe(d); if (f == NULL) return 0; - t = *d->tgt; bcnt = d->maxbcnt; if (bcnt == 0) bcnt = DEFAULTBCNT; @@ -485,7 +483,6 @@ resend(struct aoedev *d, struct frame *f) struct sk_buff *skb; struct sk_buff_head queue; struct aoe_hdr *h; - struct aoe_atahdr *ah; struct aoetgt *t; char buf[128]; u32 
n; @@ -500,7 +497,6 @@ resend(struct aoedev *d, struct frame *f) return; } h = (struct aoe_hdr *) skb_mac_header(skb); - ah = (struct aoe_atahdr *) (h+1); if (!(f->flags & FFL_PROBE)) { snprintf(buf, sizeof(buf), diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 784c92e038d1..e774c50b6842 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -12,6 +12,7 @@ #include <linux/bitmap.h> #include <linux/kdev_t.h> #include <linux/moduleparam.h> +#include <linux/string.h> #include "aoe.h" static void dummy_timer(ulong); @@ -241,16 +242,12 @@ aoedev_downdev(struct aoedev *d) static int user_req(char *s, size_t slen, struct aoedev *d) { - char *p; + const char *p; size_t lim; if (!d->gd) return 0; - p = strrchr(d->gd->disk_name, '/'); - if (!p) - p = d->gd->disk_name; - else - p += 1; + p = kbasename(d->gd->disk_name); lim = sizeof(d->gd->disk_name); lim -= p - d->gd->disk_name; if (slen < lim) @@ -278,6 +275,7 @@ freedev(struct aoedev *d) del_timer_sync(&d->timer); if (d->gd) { + aoedisk_rm_debugfs(d); aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 62b6c2cc80b5..d2d95ff5353b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4258,6 +4258,13 @@ static void cciss_find_board_params(ctlr_info_t *h) h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); /* + * The P600 may exhibit poor performnace under some workloads + * if we use the value in the configuration table. Limit this + * controller to MAXSGENTRIES (32) instead. + */ + if (h->board_id == 0x3225103C) + h->maxsgentries = MAXSGENTRIES; + /* * Limit in-command s/g elements to 32 save dma'able memory. 
* Howvever spec says if 0, use 31 */ diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index a56cfcd5d648..77a60bedd7a3 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -636,7 +636,7 @@ ok_to_write: mg_request(host->breq); } -void mg_times_out(unsigned long data) +static void mg_times_out(unsigned long data) { struct mg_host *host = (struct mg_host *)data; char *name; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1bbc681688e4..79aa179305b5 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -598,7 +598,7 @@ static ssize_t class_osdblk_remove(struct class *c, unsigned long ul; struct list_head *tmp; - rc = strict_strtoul(buf, 10, &ul); + rc = kstrtoul(buf, 10, &ul); if (rc) return rc; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index f5d0ea11d9fd..56188475cfd3 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -44,6 +44,8 @@ * *************************************************************************/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/pktcdvd.h> #include <linux/module.h> #include <linux/types.h> @@ -69,23 +71,24 @@ #define DRIVER_NAME "pktcdvd" -#if PACKET_DEBUG -#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define DPRINTK(fmt, args...) -#endif - -#if PACKET_DEBUG > 1 -#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define VPRINTK(fmt, args...) -#endif +#define pkt_err(pd, fmt, ...) \ + pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_notice(pd, fmt, ...) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_info(pd, fmt, ...) \ + pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) + +#define pkt_dbg(level, pd, fmt, ...) 
\ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: %s():" fmt, \ + pd->name, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ +} while (0) #define MAX_SPEED 0xffff -#define ZONE(sector, pd) (((sector) + (pd)->offset) & \ - ~(sector_t)((pd)->settings.size - 1)) - static DEFINE_MUTEX(pktcdvd_mutex); static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; static struct proc_dir_entry *pkt_proc; @@ -103,7 +106,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); static int pkt_remove_dev(dev_t pkt_dev); static int pkt_seq_show(struct seq_file *m, void *p); - +static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) +{ + return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); +} /* * create and register a pktcdvd kernel object. @@ -424,7 +430,7 @@ static int pkt_sysfs_init(void) if (ret) { kfree(class_pktcdvd); class_pktcdvd = NULL; - printk(DRIVER_NAME": failed to create class pktcdvd\n"); + pr_err("failed to create class pktcdvd\n"); return ret; } return 0; @@ -517,7 +523,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) { BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - VPRINTK(DRIVER_NAME": queue empty\n"); + pkt_dbg(2, pd, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -734,36 +740,33 @@ out: return ret; } +static const char *sense_key_string(__u8 index) +{ + static const char * const info[] = { + "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check", + }; + + return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; +} + /* * A generic sense dump / resolve mechanism should be implemented across * all ATAPI + SCSI devices. 
*/ -static void pkt_dump_sense(struct packet_command *cgc) +static void pkt_dump_sense(struct pktcdvd_device *pd, + struct packet_command *cgc) { - static char *info[9] = { "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check" }; - int i; struct request_sense *sense = cgc->sense; - printk(DRIVER_NAME":"); - for (i = 0; i < CDROM_PACKET_SIZE; i++) - printk(" %02x", cgc->cmd[i]); - printk(" - "); - - if (sense == NULL) { - printk("no sense\n"); - return; - } - - printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq); - - if (sense->sense_key > 8) { - printk(" (INVALID)\n"); - return; - } - - printk(" (%s)\n", info[sense->sense_key]); + if (sense) + pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); + else + pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -806,7 +809,7 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, cgc.cmd[5] = write_speed & 0xff; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -872,7 +875,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": write, waiting\n"); + pkt_dbg(2, pd, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -881,7 +884,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": read, waiting\n"); + pkt_dbg(2, pd, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -943,7 +946,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { 
- printk(DRIVER_NAME": cdrom max_phys_segments too small\n"); + pkt_err(pd, "cdrom max_phys_segments too small\n"); return -EIO; } } @@ -987,8 +990,9 @@ static void pkt_end_io_read(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, - (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); + pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", + bio, (unsigned long long)pkt->sector, + (unsigned long long)bio->bi_sector, err); if (err) atomic_inc(&pkt->io_errors); @@ -1005,7 +1009,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err); pd->stats.pkt_ended++; @@ -1047,7 +1051,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - VPRINTK("pkt_gather_data: zone %llx cached\n", + pkt_dbg(2, pd, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1070,7 +1074,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", + pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", f, pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1082,7 +1086,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", + pkt_dbg(2, pd, "need %d frames for zone %llx\n", frames_read, (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); @@ -1183,7 +1187,8 @@ static inline void pkt_set_state(struct packet_data *pkt, enum 
packet_data_state "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, + pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", + pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); #endif pkt->state = state; @@ -1202,12 +1207,10 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) struct rb_node *n; int wakeup; - VPRINTK("handle_queue\n"); - atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - VPRINTK("handle_queue: no pkt\n"); + pkt_dbg(2, pd, "no pkt\n"); return 0; } @@ -1224,7 +1227,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) node = first_node; while (node) { bio = node->bio; - zone = ZONE(bio->bi_sector, pd); + zone = get_zone(bio->bi_sector, pd); list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { if (p->sector == zone) { bio = NULL; @@ -1244,7 +1247,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - VPRINTK("handle_queue: no bio\n"); + pkt_dbg(2, pd, "no bio\n"); return 0; } @@ -1260,12 +1263,12 @@ try_next_bio: * to this packet. 
*/ spin_lock(&pd->lock); - VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); + pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - VPRINTK("pkt_handle_queue: found zone=%llx\n", - (unsigned long long)ZONE(bio->bi_sector, pd)); - if (ZONE(bio->bi_sector, pd) != zone) + pkt_dbg(2, pd, "found zone=%llx\n", + (unsigned long long)get_zone(bio->bi_sector, pd)); + if (get_zone(bio->bi_sector, pd) != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); @@ -1316,7 +1319,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); } - VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); + pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. @@ -1327,7 +1330,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", + pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { @@ -1359,7 +1362,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data { int uptodate; - VPRINTK("run_state_machine: pkt %d\n", pkt->id); + pkt_dbg(2, pd, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1398,7 +1401,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (pkt_start_recovery(pkt)) { pkt_start_write(pd, pkt); } else { - VPRINTK("No recovery possible\n"); + pkt_dbg(2, pd, "No recovery possible\n"); pkt_set_state(pkt, PACKET_FINISHED_STATE); } break; @@ -1419,8 +1422,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) { struct packet_data *pkt, 
*next; - VPRINTK("pkt_handle_packets\n"); - /* * Run state machine for active packets */ @@ -1502,9 +1503,9 @@ static int kcdrwd(void *foobar) if (PACKET_DEBUG > 1) { int states[PACKET_NUM_STATES]; pkt_count_states(pd, states); - VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], - states[4], states[5]); + pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], + states[3], states[4], states[5]); } min_sleep_time = MAX_SCHEDULE_TIMEOUT; @@ -1513,9 +1514,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - VPRINTK("kcdrwd: sleeping\n"); + pkt_dbg(2, pd, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - VPRINTK("kcdrwd: wake up\n"); + pkt_dbg(2, pd, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1563,9 +1564,10 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - printk(DRIVER_NAME": %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); - printk("%u blocks, ", pd->settings.size >> 2); - printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2'); + pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? 
'1' : '2'); } static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) @@ -1699,7 +1701,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1714,7 +1716,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1749,14 +1751,14 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - printk(DRIVER_NAME": write mode wrong %d\n", wp->data_block_type); + pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); cgc.buflen = cgc.cmd[8] = size; if ((ret = pkt_mode_select(pd, &cgc))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1793,7 +1795,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - printk(DRIVER_NAME": bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1811,7 +1813,8 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - VPRINTK(DRIVER_NAME": Wrong disc profile (%x)\n", pd->mmc3_profile); + pkt_dbg(2, pd, "Wrong disc profile (%x)\n", + pd->mmc3_profile); return 0; } @@ -1820,22 +1823,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. 
*/ if (di->disc_type == 0xff) { - printk(DRIVER_NAME": Unknown disc. No track?\n"); + pkt_notice(pd, "unknown disc - no track?\n"); return 0; } if (di->disc_type != 0x20 && di->disc_type != 0) { - printk(DRIVER_NAME": Wrong disc type (%x)\n", di->disc_type); + pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); return 0; } if (di->erasable == 0) { - printk(DRIVER_NAME": Disc not erasable\n"); + pkt_notice(pd, "disc not erasable\n"); return 0; } if (di->border_status == PACKET_SESSION_RESERVED) { - printk(DRIVER_NAME": Can't write to last track (reserved)\n"); + pkt_err(pd, "can't write to last track (reserved)\n"); return 0; } @@ -1860,7 +1863,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&ti, 0, sizeof(track_information)); if ((ret = pkt_get_disc_info(pd, &di))) { - printk("failed get_disc\n"); + pkt_err(pd, "failed get_disc\n"); return ret; } @@ -1871,12 +1874,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { - printk(DRIVER_NAME": failed get_track\n"); + pkt_err(pd, "failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - printk(DRIVER_NAME": can't write to this track\n"); + pkt_err(pd, "can't write to this track\n"); return -EROFS; } @@ -1886,11 +1889,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - printk(DRIVER_NAME": detected zero packet size!\n"); + pkt_notice(pd, "detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - printk(DRIVER_NAME": packet size is too big\n"); + pkt_err(pd, "packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1932,7 +1935,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = 
PACKET_BLOCK_MODE2; break; default: - printk(DRIVER_NAME": unknown data mode\n"); + pkt_err(pd, "unknown data mode\n"); return -EROFS; } return 0; @@ -1966,10 +1969,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - printk(DRIVER_NAME": write caching control failed\n"); - pkt_dump_sense(&cgc); + pkt_err(pd, "write caching control failed\n"); + pkt_dump_sense(pd, &cgc); } else if (!ret && set) - printk(DRIVER_NAME": enabled write caching on %s\n", pd->name); + pkt_notice(pd, "enabled write caching\n"); return ret; } @@ -2005,7 +2008,7 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, sizeof(struct mode_page_header); ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } } @@ -2064,7 +2067,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = 2; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } size = ((unsigned int) buf[0]<<8) + buf[1] + 2; @@ -2079,16 +2082,16 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = size; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } if (!(buf[6] & 0x40)) { - printk(DRIVER_NAME": Disc type is not CD-RW\n"); + pkt_notice(pd, "disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - printk(DRIVER_NAME": A1 values on media are not valid, maybe not CDRW?\n"); + pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2108,14 +2111,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - printk(DRIVER_NAME": Unknown disc sub-type %d\n",st); + pkt_notice(pd, "unknown disc sub-type %d\n", 
st); return 1; } if (*speed) { - printk(DRIVER_NAME": Max. media speed: %d\n",*speed); + pkt_info(pd, "maximum media speed: %d\n", *speed); return 0; } else { - printk(DRIVER_NAME": Unknown speed %d for sub-type %d\n",sp,st); + pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); return 1; } } @@ -2126,7 +2129,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) struct request_sense sense; int ret; - VPRINTK(DRIVER_NAME": Performing OPC\n"); + pkt_dbg(2, pd, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sense = &sense; @@ -2134,7 +2137,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) cgc.cmd[0] = GPCMD_SEND_OPC; cgc.cmd[1] = 1; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -2144,12 +2147,12 @@ static int pkt_open_write(struct pktcdvd_device *pd) unsigned int write_speed, media_write_speed, read_speed; if ((ret = pkt_probe_settings(pd))) { - VPRINTK(DRIVER_NAME": %s failed probe\n", pd->name); + pkt_dbg(2, pd, "failed probe\n"); return ret; } if ((ret = pkt_set_write_settings(pd))) { - DPRINTK(DRIVER_NAME": %s failed saving write settings\n", pd->name); + pkt_dbg(1, pd, "failed saving write settings\n"); return -EIO; } @@ -2161,26 +2164,26 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - DPRINTK(DRIVER_NAME": write speed %ukB/s\n", write_speed); + pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); break; default: if ((ret = pkt_media_speed(pd, &media_write_speed))) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - DPRINTK(DRIVER_NAME": write speed %ux\n", write_speed / 176); + pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { - DPRINTK(DRIVER_NAME": %s couldn't set write speed\n", pd->name); + 
pkt_dbg(1, pd, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; if ((ret = pkt_perform_opc(pd))) { - DPRINTK(DRIVER_NAME": %s Optimum Power Calibration failed\n", pd->name); + pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); } return 0; @@ -2205,7 +2208,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) goto out; if ((ret = pkt_get_last_written(pd, &lba))) { - printk(DRIVER_NAME": pkt_get_last_written failed\n"); + pkt_err(pd, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2235,11 +2238,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - printk(DRIVER_NAME": not enough memory for buffers\n"); + pkt_err(pd, "not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } - printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); + pkt_info(pd, "%lukB available on disc\n", lba << 1); } return 0; @@ -2257,7 +2260,7 @@ out: static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { if (flush && pkt_flush_cache(pd)) - DPRINTK(DRIVER_NAME": %s not flushing cache\n", pd->name); + pkt_dbg(1, pd, "not flushing cache\n"); pkt_lock_door(pd, 0); @@ -2279,8 +2282,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) struct pktcdvd_device *pd = NULL; int ret; - VPRINTK(DRIVER_NAME": entering open\n"); - mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); @@ -2315,7 +2316,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) out_dec: pd->refcnt--; out: - VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return ret; @@ -2360,7 +2360,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - printk(DRIVER_NAME": %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); + pr_err("%s incorrect request 
queue\n", + bdevname(bio->bi_bdev, b)); goto end_io; } @@ -2382,20 +2383,20 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - printk(DRIVER_NAME": WRITE for ro device %s (%llu)\n", - pd->name, (unsigned long long)bio->bi_sector); + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_sector); goto end_io; } if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { - printk(DRIVER_NAME": wrong bio size\n"); + pkt_err(pd, "wrong bio size\n"); goto end_io; } blk_queue_bounce(q, &bio); - zone = ZONE(bio->bi_sector, pd); - VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", + zone = get_zone(bio->bi_sector, pd); + pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2405,7 +2406,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) sector_t last_zone; int first_sectors; - last_zone = ZONE(bio_end_sector(bio) - 1, pd); + last_zone = get_zone(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_sector; @@ -2500,7 +2501,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bmd->bi_sector, pd); + sector_t zone = get_zone(bmd->bi_sector, pd); int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; @@ -2609,7 +2610,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct block_device *bdev; if (pd->pkt_dev == dev) { - printk(DRIVER_NAME": Recursive setup not allowed\n"); + pkt_err(pd, "recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2617,11 +2618,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == 
dev) { - printk(DRIVER_NAME": %s already setup\n", bdevname(pd2->bdev, b)); + pkt_err(pd, "%s already setup\n", + bdevname(pd2->bdev, b)); return -EBUSY; } if (pd2->pkt_dev == dev) { - printk(DRIVER_NAME": Can't chain pktcdvd devices\n"); + pkt_err(pd, "can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2644,13 +2646,13 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); if (IS_ERR(pd->cdrw.thread)) { - printk(DRIVER_NAME": can't start kernel thread\n"); + pkt_err(pd, "can't start kernel thread\n"); ret = -ENOMEM; goto out_mem; } proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); - DPRINTK(DRIVER_NAME": writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); return 0; out_mem: @@ -2665,8 +2667,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, struct pktcdvd_device *pd = bdev->bd_disk->private_data; int ret; - VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", + cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); switch (cmd) { @@ -2690,7 +2692,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, break; default: - VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); + pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); @@ -2743,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) if (!pkt_devs[idx]) break; if (idx == MAX_WRITERS) { - printk(DRIVER_NAME": max %d writers supported\n", MAX_WRITERS); + pr_err("max %d writers supported\n", MAX_WRITERS); ret = -EBUSY; goto out_mutex; } @@ -2818,7 +2820,7 @@ out_mem: kfree(pd); out_mutex: mutex_unlock(&ctl_mutex); - printk(DRIVER_NAME": setup of pktcdvd device failed\n"); + pr_err("setup of pktcdvd 
device failed\n"); return ret; } @@ -2839,7 +2841,7 @@ static int pkt_remove_dev(dev_t pkt_dev) break; } if (idx == MAX_WRITERS) { - DPRINTK(DRIVER_NAME": dev not setup\n"); + pr_debug("dev not setup\n"); ret = -ENXIO; goto out; } @@ -2859,7 +2861,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); remove_proc_entry(pd->name, pkt_proc); - DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name); + pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); blk_cleanup_queue(pd->disk->queue); @@ -2969,7 +2971,7 @@ static int __init pkt_init(void) ret = register_blkdev(pktdev_major, DRIVER_NAME); if (ret < 0) { - printk(DRIVER_NAME": Unable to register block device\n"); + pr_err("unable to register block device\n"); goto out2; } if (!pktdev_major) @@ -2983,7 +2985,7 @@ static int __init pkt_init(void) ret = misc_register(&pkt_misc); if (ret) { - printk(DRIVER_NAME": Unable to register misc device\n"); + pr_err("unable to register misc device\n"); goto out_misc; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 39c51cc7fabc..b22a7d0fe5b7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5132,7 +5132,7 @@ static ssize_t rbd_remove(struct bus_type *bus, bool already = false; int ret; - ret = strict_strtoul(buf, 10, &ul); + ret = kstrtoul(buf, 10, &ul); if (ret) return ret; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 8ed6ccb748cf..b02d53a399f3 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -924,7 +924,6 @@ static int swim_probe(struct platform_device *dev) return 0; out_kfree: - platform_set_drvdata(dev, NULL); kfree(swd); out_iounmap: iounmap(swim_base); @@ -962,7 +961,6 @@ static int swim_remove(struct platform_device *dev) if (res) release_mem_region(res->start, resource_size(res)); - platform_set_drvdata(dev, NULL); kfree(swd); return 0; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index fe5c3cd10c34..c2014a0aa206 100644 --- 
a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -620,7 +620,7 @@ static void backend_changed(struct xenbus_watch *watch, } /* Front end dir is a number, which is used as the handle. */ - err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); if (err) return; diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 4519cb332987..5796d0157ce0 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -766,6 +766,25 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip) } #endif +#ifdef CONFIG_PM_SLEEP +static int tpm_tis_resume(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + int ret; + + if (chip->vendor.irq) + tpm_tis_reenable_interrupts(chip); + + ret = tpm_pm_resume(dev); + if (!ret) + tpm_do_selftest(chip); + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); + #ifdef CONFIG_PNP static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, const struct pnp_device_id *pnp_id) @@ -787,26 +806,6 @@ static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, return tpm_tis_init(&pnp_dev->dev, start, len, irq); } -static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg) -{ - return tpm_pm_suspend(&dev->dev); -} - -static int tpm_tis_pnp_resume(struct pnp_dev *dev) -{ - struct tpm_chip *chip = pnp_get_drvdata(dev); - int ret; - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - ret = tpm_pm_resume(&dev->dev); - if (!ret) - tpm_do_selftest(chip); - - return ret; -} - static struct pnp_device_id tpm_pnp_tbl[] = { {"PNP0C31", 0}, /* TPM */ {"ATM1200", 0}, /* Atmel */ @@ -835,9 +834,12 @@ static struct pnp_driver tis_pnp_driver = { .name = "tpm_tis", .id_table = tpm_pnp_tbl, .probe = tpm_tis_pnp_init, - .suspend = tpm_tis_pnp_suspend, - .resume = tpm_tis_pnp_resume, .remove = tpm_tis_pnp_remove, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = 
&tpm_tis_pm, + }, +#endif }; #define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2 @@ -846,20 +848,6 @@ module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id, MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe"); #endif -#ifdef CONFIG_PM_SLEEP -static int tpm_tis_resume(struct device *dev) -{ - struct tpm_chip *chip = dev_get_drvdata(dev); - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - return tpm_pm_resume(dev); -} -#endif - -static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); - static struct platform_driver tis_drv = { .driver = { .name = "tpm_tis", diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 232fa8fce26a..fa0affb699b4 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -14,7 +14,7 @@ * of and an antecedent to, SMBIOS, which stands for System * Management BIOS. See further: http://www.dmtf.org/standards */ -static char dmi_empty_string[] = " "; +static const char dmi_empty_string[] = " "; static u16 __initdata dmi_ver; /* @@ -49,7 +49,7 @@ static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) return ""; } -static char * __init dmi_string(const struct dmi_header *dm, u8 s) +static const char * __init dmi_string(const struct dmi_header *dm, u8 s) { const char *bp = dmi_string_nosave(dm, s); char *str; @@ -62,8 +62,6 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s) str = dmi_alloc(len); if (str != NULL) strcpy(str, bp); - else - printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len); return str; } @@ -133,17 +131,18 @@ static int __init dmi_checksum(const u8 *buf, u8 len) return sum == 0; } -static char *dmi_ident[DMI_STRING_MAX]; +static const char *dmi_ident[DMI_STRING_MAX]; static LIST_HEAD(dmi_devices); int dmi_available; /* * Save a DMI string */ -static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) +static void __init 
dmi_save_ident(const struct dmi_header *dm, int slot, + int string) { - const char *d = (const char*) dm; - char *p; + const char *d = (const char *) dm; + const char *p; if (dmi_ident[slot]) return; @@ -155,9 +154,10 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int str dmi_ident[slot] = p; } -static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; int is_ff = 1, is_00 = 1, i; @@ -188,12 +188,13 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde else sprintf(s, "%pUB", d); - dmi_ident[slot] = s; + dmi_ident[slot] = s; } -static void __init dmi_save_type(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_type(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; if (dmi_ident[slot]) @@ -216,10 +217,8 @@ static void __init dmi_save_one_device(int type, const char *name) return; dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); - if (!dev) { - printk(KERN_ERR "dmi_save_one_device: out of memory.\n"); + if (!dev) return; - } dev->type = type; strcpy((char *)(dev + 1), name); @@ -249,17 +248,14 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) struct dmi_device *dev; for (i = 1; i <= count; i++) { - char *devname = dmi_string(dm, i); + const char *devname = dmi_string(dm, i); if (devname == dmi_empty_string) continue; dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - printk(KERN_ERR - "dmi_save_oem_strings_devices: out of memory.\n"); + if (!dev) break; - } dev->type = DMI_DEV_TYPE_OEM_STRING; dev->name = devname; @@ -272,21 +268,17 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) static void __init dmi_save_ipmi_device(const struct dmi_header *dm) { struct 
dmi_device *dev; - void * data; + void *data; data = dmi_alloc(dm->length); - if (data == NULL) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + if (data == NULL) return; - } memcpy(data, dm, dm->length); dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + if (!dev) return; - } dev->type = DMI_DEV_TYPE_IPMI; dev->name = "IPMI controller"; @@ -301,10 +293,9 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, struct dmi_dev_onboard *onboard_dev; onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); - if (!onboard_dev) { - printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n"); + if (!onboard_dev) return; - } + onboard_dev->instance = instance; onboard_dev->segment = segment; onboard_dev->bus = bus; @@ -320,7 +311,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, static void __init dmi_save_extended_devices(const struct dmi_header *dm) { - const u8 *d = (u8*) dm + 5; + const u8 *d = (u8 *) dm + 5; /* Skip disabled device */ if ((*d & 0x80) == 0) @@ -338,7 +329,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm) */ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) { - switch(dm->type) { + switch (dm->type) { case 0: /* BIOS Information */ dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); @@ -502,13 +493,7 @@ void __init dmi_scan_machine(void) dmi_available = 1; goto out; } - } - else { - /* - * no iounmap() for that ioremap(); it would be a no-op, but - * it's so early in setup that sucker gets confused into doing - * what it shouldn't if we actually call it. 
- */ + } else { p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) goto error; @@ -533,7 +518,7 @@ void __init dmi_scan_machine(void) dmi_iounmap(p, 0x10000); } error: - printk(KERN_INFO "DMI not present or invalid.\n"); + pr_info("DMI not present or invalid.\n"); out: dmi_initialized = 1; } @@ -669,7 +654,7 @@ int dmi_name_in_serial(const char *str) /** * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name - * @str: Case sensitive Name + * @str: Case sensitive Name */ int dmi_name_in_vendors(const char *str) { @@ -696,13 +681,13 @@ EXPORT_SYMBOL(dmi_name_in_vendors); * A new search is initiated by passing %NULL as the @from argument. * If @from is not %NULL, searches continue from next device. */ -const struct dmi_device * dmi_find_device(int type, const char *name, +const struct dmi_device *dmi_find_device(int type, const char *name, const struct dmi_device *from) { const struct list_head *head = from ? &from->list : &dmi_devices; struct list_head *d; - for(d = head->next; d != &dmi_devices; d = d->next) { + for (d = head->next; d != &dmi_devices; d = d->next) { const struct dmi_device *dev = list_entry(d, struct dmi_device, list); diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index acba0b9f4406..6eb535ffeddc 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -525,7 +525,7 @@ static ssize_t gsmi_clear_eventlog_store(struct kobject *kobj, u32 data_type; } param; - rc = strict_strtoul(buf, 0, &val); + rc = kstrtoul(buf, 0, &val); if (rc) return rc; diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c index 0a1c9626aa9e..08ba4972da9d 100644 --- a/drivers/iommu/msm_iommu_dev.c +++ b/drivers/iommu/msm_iommu_dev.c @@ -282,7 +282,6 @@ static int msm_iommu_remove(struct platform_device *pdev) clk_put(drv->pclk); memset(drv, 0, sizeof(*drv)); kfree(drv); - platform_set_drvdata(pdev, NULL); } return 0; } @@ -366,7 +365,6 @@ static int msm_iommu_ctx_remove(struct 
platform_device *pdev) if (drv) { memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata)); kfree(drv); - platform_set_drvdata(pdev, NULL); } return 0; } diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 0ba3766240d5..bcd78a720630 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1008,8 +1008,6 @@ static int omap_iommu_remove(struct platform_device *pdev) struct resource *res; struct omap_iommu *obj = platform_get_drvdata(pdev); - platform_set_drvdata(pdev, NULL); - iopgtable_clear_entry_all(obj); irq = platform_get_irq(pdev, 0); diff --git a/drivers/memstick/core/Kconfig b/drivers/memstick/core/Kconfig index 95f1814b5368..1d389491d5fd 100644 --- a/drivers/memstick/core/Kconfig +++ b/drivers/memstick/core/Kconfig @@ -24,3 +24,15 @@ config MSPRO_BLOCK support. This provides a block device driver, which you can use to mount the filesystem. Almost everyone wishing MemoryStick support should say Y or M here. + +config MS_BLOCK + tristate "MemoryStick Standard device driver" + depends on BLOCK + help + Say Y here to enable the MemoryStick Standard device driver + support. This provides a block device driver, which you can use + to mount the filesystem. + This driver works with old (bulky) MemoryStick and MemoryStick Duo + but not PRO. Say Y if you have such card. 
+ Driver is new and not yet well tested, thus it can damage your card + (even permanently) diff --git a/drivers/memstick/core/Makefile b/drivers/memstick/core/Makefile index ecd029937738..0d7f90c0ff25 100644 --- a/drivers/memstick/core/Makefile +++ b/drivers/memstick/core/Makefile @@ -3,5 +3,5 @@ # obj-$(CONFIG_MEMSTICK) += memstick.o - +obj-$(CONFIG_MS_BLOCK) += ms_block.o obj-$(CONFIG_MSPRO_BLOCK) += mspro_block.o diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c new file mode 100644 index 000000000000..08e70232062f --- /dev/null +++ b/drivers/memstick/core/ms_block.c @@ -0,0 +1,2385 @@ +/* + * ms_block.c - Sony MemoryStick (legacy) storage support + + * Copyright (C) 2013 Maxim Levitsky <maximlevitsky@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Minor portions of the driver were copied from mspro_block.c which is + * Copyright (C) 2007 Alex Dubov <oakad@yahoo.com> + * + */ +#define DRIVER_NAME "ms_block" +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/memstick.h> +#include <linux/idr.h> +#include <linux/hdreg.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/bitmap.h> +#include <linux/scatterlist.h> +#include <linux/jiffies.h> +#include <linux/workqueue.h> +#include <linux/mutex.h> +#include "ms_block.h" + +static int debug; +static int cache_flush_timeout = 1000; +static bool verify_writes; + +/* + * Copies section of 'sg_from' starting from offset 'offset' and with length + * 'len' To another scatterlist of to_nents enties + */ +static size_t msb_sg_copy(struct scatterlist *sg_from, + struct scatterlist *sg_to, int to_nents, size_t offset, size_t len) +{ + size_t copied = 0; + + while (offset > 0) { + if (offset >= sg_from->length) { + if 
(sg_is_last(sg_from)) + return 0; + + offset -= sg_from->length; + sg_from = sg_next(sg_from); + continue; + } + + copied = min(len, sg_from->length - offset); + sg_set_page(sg_to, sg_page(sg_from), + copied, sg_from->offset + offset); + + len -= copied; + offset = 0; + + if (sg_is_last(sg_from) || !len) + goto out; + + sg_to = sg_next(sg_to); + to_nents--; + sg_from = sg_next(sg_from); + } + + while (len > sg_from->length && to_nents--) { + len -= sg_from->length; + copied += sg_from->length; + + sg_set_page(sg_to, sg_page(sg_from), + sg_from->length, sg_from->offset); + + if (sg_is_last(sg_from) || !len) + goto out; + + sg_from = sg_next(sg_from); + sg_to = sg_next(sg_to); + } + + if (len && to_nents) { + sg_set_page(sg_to, sg_page(sg_from), len, sg_from->offset); + copied += len; + } +out: + sg_mark_end(sg_to); + return copied; +} + +/* + * Compares section of 'sg' starting from offset 'offset' and with length 'len' + * to linear buffer of length 'len' at address 'buffer' + * Returns 0 if equal and -1 otherwice + */ +static int msb_sg_compare_to_buffer(struct scatterlist *sg, + size_t offset, u8 *buffer, size_t len) +{ + int retval = 0, cmplen; + struct sg_mapping_iter miter; + + sg_miter_start(&miter, sg, sg_nents(sg), + SG_MITER_ATOMIC | SG_MITER_FROM_SG); + + while (sg_miter_next(&miter) && len > 0) { + if (offset >= miter.length) { + offset -= miter.length; + continue; + } + + cmplen = min(miter.length - offset, len); + retval = memcmp(miter.addr + offset, buffer, cmplen) ? -1 : 0; + if (retval) + break; + + buffer += cmplen; + len -= cmplen; + offset = 0; + } + + if (!retval && len) + retval = -1; + + sg_miter_stop(&miter); + return retval; +} + + +/* Get zone at which block with logical address 'lba' lives + * Flash is broken into zones. + * Each zone consists of 512 eraseblocks, out of which in first + * zone 494 are used and 496 are for all following zones. + * Therefore zone #0 hosts blocks 0-493, zone #1 blocks 494-988, etc... 
+*/ +static int msb_get_zone_from_lba(int lba) +{ + if (lba < 494) + return 0; + return ((lba - 494) / 496) + 1; +} + +/* Get zone of physical block. Trivial */ +static int msb_get_zone_from_pba(int pba) +{ + return pba / MS_BLOCKS_IN_ZONE; +} + +/* Debug test to validate free block counts */ +static int msb_validate_used_block_bitmap(struct msb_data *msb) +{ + int total_free_blocks = 0; + int i; + + if (!debug) + return 0; + + for (i = 0; i < msb->zone_count; i++) + total_free_blocks += msb->free_block_count[i]; + + if (msb->block_count - bitmap_weight(msb->used_blocks_bitmap, + msb->block_count) == total_free_blocks) + return 0; + + pr_err("BUG: free block counts don't match the bitmap"); + msb->read_only = true; + return -EINVAL; +} + +/* Mark physical block as used */ +static void msb_mark_block_used(struct msb_data *msb, int pba) +{ + int zone = msb_get_zone_from_pba(pba); + + if (test_bit(pba, msb->used_blocks_bitmap)) { + pr_err( + "BUG: attempt to mark already used pba %d as used", pba); + msb->read_only = true; + return; + } + + if (msb_validate_used_block_bitmap(msb)) + return; + + /* No races because all IO is single threaded */ + __set_bit(pba, msb->used_blocks_bitmap); + msb->free_block_count[zone]--; +} + +/* Mark physical block as free */ +static void msb_mark_block_unused(struct msb_data *msb, int pba) +{ + int zone = msb_get_zone_from_pba(pba); + + if (!test_bit(pba, msb->used_blocks_bitmap)) { + pr_err("BUG: attempt to mark already unused pba %d as unused" , pba); + msb->read_only = true; + return; + } + + if (msb_validate_used_block_bitmap(msb)) + return; + + /* No races because all IO is single threaded */ + __clear_bit(pba, msb->used_blocks_bitmap); + msb->free_block_count[zone]++; +} + +/* Invalidate current register window */ +static void msb_invalidate_reg_window(struct msb_data *msb) +{ + msb->reg_addr.w_offset = offsetof(struct ms_register, id); + msb->reg_addr.w_length = sizeof(struct ms_id_register); + msb->reg_addr.r_offset = 
offsetof(struct ms_register, id); + msb->reg_addr.r_length = sizeof(struct ms_id_register); + msb->addr_valid = false; +} + +/* Start a state machine */ +static int msb_run_state_machine(struct msb_data *msb, int (*state_func) + (struct memstick_dev *card, struct memstick_request **req)) +{ + struct memstick_dev *card = msb->card; + + WARN_ON(msb->state != -1); + msb->int_polling = false; + msb->state = 0; + msb->exit_error = 0; + + memset(&card->current_mrq, 0, sizeof(card->current_mrq)); + + card->next_request = state_func; + memstick_new_req(card->host); + wait_for_completion(&card->mrq_complete); + + WARN_ON(msb->state != -1); + return msb->exit_error; +} + +/* State machines call that to exit */ +static int msb_exit_state_machine(struct msb_data *msb, int error) +{ + WARN_ON(msb->state == -1); + + msb->state = -1; + msb->exit_error = error; + msb->card->next_request = h_msb_default_bad; + + /* Invalidate reg window on errors */ + if (error) + msb_invalidate_reg_window(msb); + + complete(&msb->card->mrq_complete); + return -ENXIO; +} + +/* read INT register */ +static int msb_read_int_reg(struct msb_data *msb, long timeout) +{ + struct memstick_request *mrq = &msb->card->current_mrq; + + WARN_ON(msb->state == -1); + + if (!msb->int_polling) { + msb->int_timeout = jiffies + + msecs_to_jiffies(timeout == -1 ? 
500 : timeout); + msb->int_polling = true; + } else if (time_after(jiffies, msb->int_timeout)) { + mrq->data[0] = MEMSTICK_INT_CMDNAK; + return 0; + } + + if ((msb->caps & MEMSTICK_CAP_AUTO_GET_INT) && + mrq->need_card_int && !mrq->error) { + mrq->data[0] = mrq->int_reg; + mrq->need_card_int = false; + return 0; + } else { + memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1); + return 1; + } +} + +/* Read a register */ +static int msb_read_regs(struct msb_data *msb, int offset, int len) +{ + struct memstick_request *req = &msb->card->current_mrq; + + if (msb->reg_addr.r_offset != offset || + msb->reg_addr.r_length != len || !msb->addr_valid) { + + msb->reg_addr.r_offset = offset; + msb->reg_addr.r_length = len; + msb->addr_valid = true; + + memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS, + &msb->reg_addr, sizeof(msb->reg_addr)); + return 0; + } + + memstick_init_req(req, MS_TPC_READ_REG, NULL, len); + return 1; +} + +/* Write a card register */ +static int msb_write_regs(struct msb_data *msb, int offset, int len, void *buf) +{ + struct memstick_request *req = &msb->card->current_mrq; + + if (msb->reg_addr.w_offset != offset || + msb->reg_addr.w_length != len || !msb->addr_valid) { + + msb->reg_addr.w_offset = offset; + msb->reg_addr.w_length = len; + msb->addr_valid = true; + + memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS, + &msb->reg_addr, sizeof(msb->reg_addr)); + return 0; + } + + memstick_init_req(req, MS_TPC_WRITE_REG, buf, len); + return 1; +} + +/* Handler for absence of IO */ +static int h_msb_default_bad(struct memstick_dev *card, + struct memstick_request **mrq) +{ + return -ENXIO; +} + +/* + * This function is a handler for reads of one page from device. + * Writes output to msb->current_sg, takes sector address from msb->reg.param + * Can also be used to read extra data only. Set params accordintly. 
+ */ +static int h_msb_read_page(struct memstick_dev *card, + struct memstick_request **out_mrq) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_request *mrq = *out_mrq = &card->current_mrq; + struct scatterlist sg[2]; + u8 command, intreg; + + if (mrq->error) { + dbg("read_page, unknown error"); + return msb_exit_state_machine(msb, mrq->error); + } +again: + switch (msb->state) { + case MSB_RP_SEND_BLOCK_ADDRESS: + /* msb_write_regs sometimes "fails" because it needs to update + the reg window, and thus it returns request for that. + Then we stay in this state and retry */ + if (!msb_write_regs(msb, + offsetof(struct ms_register, param), + sizeof(struct ms_param_register), + (unsigned char *)&msb->regs.param)) + return 0; + + msb->state = MSB_RP_SEND_READ_COMMAND; + return 0; + + case MSB_RP_SEND_READ_COMMAND: + command = MS_CMD_BLOCK_READ; + memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1); + msb->state = MSB_RP_SEND_INT_REQ; + return 0; + + case MSB_RP_SEND_INT_REQ: + msb->state = MSB_RP_RECEIVE_INT_REQ_RESULT; + /* If dont actually need to send the int read request (only in + serial mode), then just fall through */ + if (msb_read_int_reg(msb, -1)) + return 0; + /* fallthrough */ + + case MSB_RP_RECEIVE_INT_REQ_RESULT: + intreg = mrq->data[0]; + msb->regs.status.interrupt = intreg; + + if (intreg & MEMSTICK_INT_CMDNAK) + return msb_exit_state_machine(msb, -EIO); + + if (!(intreg & MEMSTICK_INT_CED)) { + msb->state = MSB_RP_SEND_INT_REQ; + goto again; + } + + msb->int_polling = false; + msb->state = (intreg & MEMSTICK_INT_ERR) ? 
+ MSB_RP_SEND_READ_STATUS_REG : MSB_RP_SEND_OOB_READ; + goto again; + + case MSB_RP_SEND_READ_STATUS_REG: + /* read the status register to understand source of the INT_ERR */ + if (!msb_read_regs(msb, + offsetof(struct ms_register, status), + sizeof(struct ms_status_register))) + return 0; + + msb->state = MSB_RP_RECEIVE_OOB_READ; + return 0; + + case MSB_RP_RECIVE_STATUS_REG: + msb->regs.status = *(struct ms_status_register *)mrq->data; + msb->state = MSB_RP_SEND_OOB_READ; + /* fallthrough */ + + case MSB_RP_SEND_OOB_READ: + if (!msb_read_regs(msb, + offsetof(struct ms_register, extra_data), + sizeof(struct ms_extra_data_register))) + return 0; + + msb->state = MSB_RP_RECEIVE_OOB_READ; + return 0; + + case MSB_RP_RECEIVE_OOB_READ: + msb->regs.extra_data = + *(struct ms_extra_data_register *) mrq->data; + msb->state = MSB_RP_SEND_READ_DATA; + /* fallthrough */ + + case MSB_RP_SEND_READ_DATA: + /* Skip that state if we only read the oob */ + if (msb->regs.param.cp == MEMSTICK_CP_EXTRA) { + msb->state = MSB_RP_RECEIVE_READ_DATA; + goto again; + } + + sg_init_table(sg, ARRAY_SIZE(sg)); + msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg), + msb->current_sg_offset, + msb->page_size); + + memstick_init_req_sg(mrq, MS_TPC_READ_LONG_DATA, sg); + msb->state = MSB_RP_RECEIVE_READ_DATA; + return 0; + + case MSB_RP_RECEIVE_READ_DATA: + if (!(msb->regs.status.interrupt & MEMSTICK_INT_ERR)) { + msb->current_sg_offset += msb->page_size; + return msb_exit_state_machine(msb, 0); + } + + if (msb->regs.status.status1 & MEMSTICK_UNCORR_ERROR) { + dbg("read_page: uncorrectable error"); + return msb_exit_state_machine(msb, -EBADMSG); + } + + if (msb->regs.status.status1 & MEMSTICK_CORR_ERROR) { + dbg("read_page: correctable error"); + msb->current_sg_offset += msb->page_size; + return msb_exit_state_machine(msb, -EUCLEAN); + } else { + dbg("read_page: INT error, but no status error bits"); + return msb_exit_state_machine(msb, -EIO); + } + } + + BUG(); +} + +/* + * Handler of writes of 
exactly one block. + * Takes address from msb->regs.param. + * Writes same extra data to blocks, also taken + * from msb->regs.extra + * Returns -EBADMSG if write fails due to uncorrectable error, or -EIO if + * device refuses to take the command or something else + */ +static int h_msb_write_block(struct memstick_dev *card, + struct memstick_request **out_mrq) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_request *mrq = *out_mrq = &card->current_mrq; + struct scatterlist sg[2]; + u8 intreg, command; + + if (mrq->error) + return msb_exit_state_machine(msb, mrq->error); + +again: + switch (msb->state) { + + /* HACK: Jmicon handling of TPCs between 8 and + * sizeof(memstick_request.data) is broken due to hardware + * bug in PIO mode that is used for these TPCs + * Therefore split the write + */ + + case MSB_WB_SEND_WRITE_PARAMS: + if (!msb_write_regs(msb, + offsetof(struct ms_register, param), + sizeof(struct ms_param_register), + &msb->regs.param)) + return 0; + + msb->state = MSB_WB_SEND_WRITE_OOB; + return 0; + + case MSB_WB_SEND_WRITE_OOB: + if (!msb_write_regs(msb, + offsetof(struct ms_register, extra_data), + sizeof(struct ms_extra_data_register), + &msb->regs.extra_data)) + return 0; + msb->state = MSB_WB_SEND_WRITE_COMMAND; + return 0; + + + case MSB_WB_SEND_WRITE_COMMAND: + command = MS_CMD_BLOCK_WRITE; + memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1); + msb->state = MSB_WB_SEND_INT_REQ; + return 0; + + case MSB_WB_SEND_INT_REQ: + msb->state = MSB_WB_RECEIVE_INT_REQ; + if (msb_read_int_reg(msb, -1)) + return 0; + /* fallthrough */ + + case MSB_WB_RECEIVE_INT_REQ: + intreg = mrq->data[0]; + msb->regs.status.interrupt = intreg; + + /* errors mean out of here, and fast... 
*/ + if (intreg & (MEMSTICK_INT_CMDNAK)) + return msb_exit_state_machine(msb, -EIO); + + if (intreg & MEMSTICK_INT_ERR) + return msb_exit_state_machine(msb, -EBADMSG); + + + /* for last page we need to poll CED */ + if (msb->current_page == msb->pages_in_block) { + if (intreg & MEMSTICK_INT_CED) + return msb_exit_state_machine(msb, 0); + msb->state = MSB_WB_SEND_INT_REQ; + goto again; + + } + + /* for non-last page we need BREQ before writing next chunk */ + if (!(intreg & MEMSTICK_INT_BREQ)) { + msb->state = MSB_WB_SEND_INT_REQ; + goto again; + } + + msb->int_polling = false; + msb->state = MSB_WB_SEND_WRITE_DATA; + /* fallthrough */ + + case MSB_WB_SEND_WRITE_DATA: + sg_init_table(sg, ARRAY_SIZE(sg)); + + if (msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg), + msb->current_sg_offset, + msb->page_size) < msb->page_size) + return msb_exit_state_machine(msb, -EIO); + + memstick_init_req_sg(mrq, MS_TPC_WRITE_LONG_DATA, sg); + mrq->need_card_int = 1; + msb->state = MSB_WB_RECEIVE_WRITE_CONFIRMATION; + return 0; + + case MSB_WB_RECEIVE_WRITE_CONFIRMATION: + msb->current_page++; + msb->current_sg_offset += msb->page_size; + msb->state = MSB_WB_SEND_INT_REQ; + goto again; + default: + BUG(); + } + + return 0; +} + +/* + * This function is used to send simple IO requests to device that consist + * of register write + command + */ +static int h_msb_send_command(struct memstick_dev *card, + struct memstick_request **out_mrq) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_request *mrq = *out_mrq = &card->current_mrq; + u8 intreg; + + if (mrq->error) { + dbg("send_command: unknown error"); + return msb_exit_state_machine(msb, mrq->error); + } +again: + switch (msb->state) { + + /* HACK: see h_msb_write_block */ + case MSB_SC_SEND_WRITE_PARAMS: /* write param register*/ + if (!msb_write_regs(msb, + offsetof(struct ms_register, param), + sizeof(struct ms_param_register), + &msb->regs.param)) + return 0; + msb->state = MSB_SC_SEND_WRITE_OOB; + return 0; 
+ + case MSB_SC_SEND_WRITE_OOB: + if (!msb->command_need_oob) { + msb->state = MSB_SC_SEND_COMMAND; + goto again; + } + + if (!msb_write_regs(msb, + offsetof(struct ms_register, extra_data), + sizeof(struct ms_extra_data_register), + &msb->regs.extra_data)) + return 0; + + msb->state = MSB_SC_SEND_COMMAND; + return 0; + + case MSB_SC_SEND_COMMAND: + memstick_init_req(mrq, MS_TPC_SET_CMD, &msb->command_value, 1); + msb->state = MSB_SC_SEND_INT_REQ; + return 0; + + case MSB_SC_SEND_INT_REQ: + msb->state = MSB_SC_RECEIVE_INT_REQ; + if (msb_read_int_reg(msb, -1)) + return 0; + /* fallthrough */ + + case MSB_SC_RECEIVE_INT_REQ: + intreg = mrq->data[0]; + + if (intreg & MEMSTICK_INT_CMDNAK) + return msb_exit_state_machine(msb, -EIO); + if (intreg & MEMSTICK_INT_ERR) + return msb_exit_state_machine(msb, -EBADMSG); + + if (!(intreg & MEMSTICK_INT_CED)) { + msb->state = MSB_SC_SEND_INT_REQ; + goto again; + } + + return msb_exit_state_machine(msb, 0); + } + + BUG(); +} + +/* Small handler for card reset */ +static int h_msb_reset(struct memstick_dev *card, + struct memstick_request **out_mrq) +{ + u8 command = MS_CMD_RESET; + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_request *mrq = *out_mrq = &card->current_mrq; + + if (mrq->error) + return msb_exit_state_machine(msb, mrq->error); + + switch (msb->state) { + case MSB_RS_SEND: + memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1); + mrq->need_card_int = 0; + msb->state = MSB_RS_CONFIRM; + return 0; + case MSB_RS_CONFIRM: + return msb_exit_state_machine(msb, 0); + } + BUG(); +} + +/* This handler is used to do serial->parallel switch */ +static int h_msb_parallel_switch(struct memstick_dev *card, + struct memstick_request **out_mrq) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_request *mrq = *out_mrq = &card->current_mrq; + struct memstick_host *host = card->host; + + if (mrq->error) { + dbg("parallel_switch: error"); + msb->regs.param.system &= ~MEMSTICK_SYS_PAM; + 
return msb_exit_state_machine(msb, mrq->error); + } + + switch (msb->state) { + case MSB_PS_SEND_SWITCH_COMMAND: + /* Set the parallel interface on memstick side */ + msb->regs.param.system |= MEMSTICK_SYS_PAM; + + if (!msb_write_regs(msb, + offsetof(struct ms_register, param), + 1, + (unsigned char *)&msb->regs.param)) + return 0; + + msb->state = MSB_PS_SWICH_HOST; + return 0; + + case MSB_PS_SWICH_HOST: + /* Set parallel interface on our side + send a dummy request + to see if card responds */ + host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_PAR4); + memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1); + msb->state = MSB_PS_CONFIRM; + return 0; + + case MSB_PS_CONFIRM: + return msb_exit_state_machine(msb, 0); + } + + BUG(); +} + +static int msb_switch_to_parallel(struct msb_data *msb); + +/* Reset the card, to guard against hw errors beeing treated as bad blocks */ +static int msb_reset(struct msb_data *msb, bool full) +{ + + bool was_parallel = msb->regs.param.system & MEMSTICK_SYS_PAM; + struct memstick_dev *card = msb->card; + struct memstick_host *host = card->host; + int error; + + /* Reset the card */ + msb->regs.param.system = MEMSTICK_SYS_BAMD; + + if (full) { + error = host->set_param(host, + MEMSTICK_POWER, MEMSTICK_POWER_OFF); + if (error) + goto out_error; + + msb_invalidate_reg_window(msb); + + error = host->set_param(host, + MEMSTICK_POWER, MEMSTICK_POWER_ON); + if (error) + goto out_error; + + error = host->set_param(host, + MEMSTICK_INTERFACE, MEMSTICK_SERIAL); + if (error) { +out_error: + dbg("Failed to reset the host controller"); + msb->read_only = true; + return -EFAULT; + } + } + + error = msb_run_state_machine(msb, h_msb_reset); + if (error) { + dbg("Failed to reset the card"); + msb->read_only = true; + return -ENODEV; + } + + /* Set parallel mode */ + if (was_parallel) + msb_switch_to_parallel(msb); + return 0; +} + +/* Attempts to switch interface to parallel mode */ +static int msb_switch_to_parallel(struct msb_data *msb) +{ + int 
error; + + error = msb_run_state_machine(msb, h_msb_parallel_switch); + if (error) { + pr_err("Switch to parallel failed"); + msb->regs.param.system &= ~MEMSTICK_SYS_PAM; + msb_reset(msb, true); + return -EFAULT; + } + + msb->caps |= MEMSTICK_CAP_AUTO_GET_INT; + return 0; +} + +/* Changes overwrite flag on a page */ +static int msb_set_overwrite_flag(struct msb_data *msb, + u16 pba, u8 page, u8 flag) +{ + if (msb->read_only) + return -EROFS; + + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_OVERWRITE; + msb->regs.extra_data.overwrite_flag = flag; + msb->command_value = MS_CMD_BLOCK_WRITE; + msb->command_need_oob = true; + + dbg_verbose("changing overwrite flag to %02x for sector %d, page %d", + flag, pba, page); + return msb_run_state_machine(msb, h_msb_send_command); +} + +static int msb_mark_bad(struct msb_data *msb, int pba) +{ + pr_notice("marking pba %d as bad", pba); + msb_reset(msb, true); + return msb_set_overwrite_flag( + msb, pba, 0, 0xFF & ~MEMSTICK_OVERWRITE_BKST); +} + +static int msb_mark_page_bad(struct msb_data *msb, int pba, int page) +{ + dbg("marking page %d of pba %d as bad", page, pba); + msb_reset(msb, true); + return msb_set_overwrite_flag(msb, + pba, page, ~MEMSTICK_OVERWRITE_PGST0); +} + +/* Erases one physical block */ +static int msb_erase_block(struct msb_data *msb, u16 pba) +{ + int error, try; + if (msb->read_only) + return -EROFS; + + dbg_verbose("erasing pba %d", pba); + + for (try = 1; try < 3; try++) { + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = 0; + msb->regs.param.cp = MEMSTICK_CP_BLOCK; + msb->command_value = MS_CMD_BLOCK_ERASE; + msb->command_need_oob = false; + + + error = msb_run_state_machine(msb, h_msb_send_command); + if (!error || msb_reset(msb, true)) + break; + } + + if (error) { + pr_err("erase failed, marking pba %d as bad", pba); + msb_mark_bad(msb, pba); + } + + dbg_verbose("erase success, marking 
pba %d as unused", pba); + msb_mark_block_unused(msb, pba); + __set_bit(pba, msb->erased_blocks_bitmap); + return error; +} + +/* Reads one page from device */ +static int msb_read_page(struct msb_data *msb, + u16 pba, u8 page, struct ms_extra_data_register *extra, + struct scatterlist *sg, int offset) +{ + int try, error; + + if (pba == MS_BLOCK_INVALID) { + unsigned long flags; + struct sg_mapping_iter miter; + size_t len = msb->page_size; + + dbg_verbose("read unmapped sector. returning 0xFF"); + + local_irq_save(flags); + sg_miter_start(&miter, sg, sg_nents(sg), + SG_MITER_ATOMIC | SG_MITER_TO_SG); + + while (sg_miter_next(&miter) && len > 0) { + + int chunklen; + + if (offset && offset >= miter.length) { + offset -= miter.length; + continue; + } + + chunklen = min(miter.length - offset, len); + memset(miter.addr + offset, 0xFF, chunklen); + len -= chunklen; + offset = 0; + } + + sg_miter_stop(&miter); + local_irq_restore(flags); + + if (offset) + return -EFAULT; + + if (extra) + memset(extra, 0xFF, sizeof(*extra)); + return 0; + } + + if (pba >= msb->block_count) { + pr_err("BUG: attempt to read beyond the end of the card at pba %d", pba); + return -EINVAL; + } + + for (try = 1; try < 3; try++) { + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_PAGE; + + msb->current_sg = sg; + msb->current_sg_offset = offset; + error = msb_run_state_machine(msb, h_msb_read_page); + + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + error = 0; + } + + if (!error && extra) + *extra = msb->regs.extra_data; + + if (!error || msb_reset(msb, true)) + break; + + } + + /* Mark bad pages */ + if (error == -EBADMSG) { + pr_err("uncorrectable error on read of pba %d, page %d", + pba, page); + + if (msb->regs.extra_data.overwrite_flag & + MEMSTICK_OVERWRITE_PGST0) + msb_mark_page_bad(msb, pba, page); + return -EBADMSG; + } + + if (error) + pr_err("read of pba %d, 
page %d failed with error %d", + pba, page, error); + return error; +} + +/* Reads oob of page only */ +static int msb_read_oob(struct msb_data *msb, u16 pba, u16 page, + struct ms_extra_data_register *extra) +{ + int error; + + BUG_ON(!extra); + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_EXTRA; + + if (pba > msb->block_count) { + pr_err("BUG: attempt to read beyond the end of card at pba %d", pba); + return -EINVAL; + } + + error = msb_run_state_machine(msb, h_msb_read_page); + *extra = msb->regs.extra_data; + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + return 0; + } + + return error; +} + +/* Reads a block and compares it with data contained in scatterlist orig_sg */ +static int msb_verify_block(struct msb_data *msb, u16 pba, + struct scatterlist *orig_sg, int offset) +{ + struct scatterlist sg; + int page = 0, error; + + sg_init_one(&sg, msb->block_buffer, msb->block_size); + + while (page < msb->pages_in_block) { + + error = msb_read_page(msb, pba, page, + NULL, &sg, page * msb->page_size); + if (error) + return error; + page++; + } + + if (msb_sg_compare_to_buffer(orig_sg, offset, + msb->block_buffer, msb->block_size)) + return -EIO; + return 0; +} + +/* Writes exectly one block + oob */ +static int msb_write_block(struct msb_data *msb, + u16 pba, u32 lba, struct scatterlist *sg, int offset) +{ + int error, current_try = 1; + BUG_ON(sg->length < msb->page_size); + + if (msb->read_only) + return -EROFS; + + if (pba == MS_BLOCK_INVALID) { + pr_err( + "BUG: write: attempt to write MS_BLOCK_INVALID block"); + return -EINVAL; + } + + if (pba >= msb->block_count || lba >= msb->logical_block_count) { + pr_err( + "BUG: write: attempt to write beyond the end of device"); + return -EINVAL; + } + + if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) { + pr_err("BUG: write: lba zone mismatch"); + return -EINVAL; + } + + if (pba == 
msb->boot_block_locations[0] || + pba == msb->boot_block_locations[1]) { + pr_err("BUG: write: attempt to write to boot blocks!"); + return -EINVAL; + } + + while (1) { + + if (msb->read_only) + return -EROFS; + + msb->regs.param.cp = MEMSTICK_CP_BLOCK; + msb->regs.param.page_address = 0; + msb->regs.param.block_address = cpu_to_be16(pba); + + msb->regs.extra_data.management_flag = 0xFF; + msb->regs.extra_data.overwrite_flag = 0xF8; + msb->regs.extra_data.logical_address = cpu_to_be16(lba); + + msb->current_sg = sg; + msb->current_sg_offset = offset; + msb->current_page = 0; + + error = msb_run_state_machine(msb, h_msb_write_block); + + /* Sector we just wrote to is assumed erased since its pba + was erased. If it wasn't erased, write will succeed + and will just clear the bits that were set in the block + thus test that what we have written, + matches what we expect. + We do trust the blocks that we erased */ + if (!error && (verify_writes || + !test_bit(pba, msb->erased_blocks_bitmap))) + error = msb_verify_block(msb, pba, sg, offset); + + if (!error) + break; + + if (current_try > 1 || msb_reset(msb, true)) + break; + + pr_err("write failed, trying to erase the pba %d", pba); + error = msb_erase_block(msb, pba); + if (error) + break; + + current_try++; + } + return error; +} + +/* Finds a free block for write replacement */ +static u16 msb_get_free_block(struct msb_data *msb, int zone) +{ + u16 pos; + int pba = zone * MS_BLOCKS_IN_ZONE; + int i; + + get_random_bytes(&pos, sizeof(pos)); + + if (!msb->free_block_count[zone]) { + pr_err("NO free blocks in the zone %d, to use for a write, (media is WORN out) switching to RO mode", zone); + msb->read_only = true; + return MS_BLOCK_INVALID; + } + + pos %= msb->free_block_count[zone]; + + dbg_verbose("have %d choices for a free block, selected randomally: %d", + msb->free_block_count[zone], pos); + + pba = find_next_zero_bit(msb->used_blocks_bitmap, + msb->block_count, pba); + for (i = 0; i < pos; ++i) + pba = 
find_next_zero_bit(msb->used_blocks_bitmap, + msb->block_count, pba + 1); + + dbg_verbose("result of the free blocks scan: pba %d", pba); + + if (pba == msb->block_count || (msb_get_zone_from_pba(pba)) != zone) { + pr_err("BUG: cant get a free block"); + msb->read_only = true; + return MS_BLOCK_INVALID; + } + + msb_mark_block_used(msb, pba); + return pba; +} + +static int msb_update_block(struct msb_data *msb, u16 lba, + struct scatterlist *sg, int offset) +{ + u16 pba, new_pba; + int error, try; + + pba = msb->lba_to_pba_table[lba]; + dbg_verbose("start of a block update at lba %d, pba %d", lba, pba); + + if (pba != MS_BLOCK_INVALID) { + dbg_verbose("setting the update flag on the block"); + msb_set_overwrite_flag(msb, pba, 0, + 0xFF & ~MEMSTICK_OVERWRITE_UDST); + } + + for (try = 0; try < 3; try++) { + new_pba = msb_get_free_block(msb, + msb_get_zone_from_lba(lba)); + + if (new_pba == MS_BLOCK_INVALID) { + error = -EIO; + goto out; + } + + dbg_verbose("block update: writing updated block to the pba %d", + new_pba); + error = msb_write_block(msb, new_pba, lba, sg, offset); + if (error == -EBADMSG) { + msb_mark_bad(msb, new_pba); + continue; + } + + if (error) + goto out; + + dbg_verbose("block update: erasing the old block"); + msb_erase_block(msb, pba); + msb->lba_to_pba_table[lba] = new_pba; + return 0; + } +out: + if (error) { + pr_err("block update error after %d tries, switching to r/o mode", try); + msb->read_only = true; + } + return error; +} + +/* Converts endiannes in the boot block for easy use */ +static void msb_fix_boot_page_endianness(struct ms_boot_page *p) +{ + p->header.block_id = be16_to_cpu(p->header.block_id); + p->header.format_reserved = be16_to_cpu(p->header.format_reserved); + p->entry.disabled_block.start_addr + = be32_to_cpu(p->entry.disabled_block.start_addr); + p->entry.disabled_block.data_size + = be32_to_cpu(p->entry.disabled_block.data_size); + p->entry.cis_idi.start_addr + = be32_to_cpu(p->entry.cis_idi.start_addr); + 
p->entry.cis_idi.data_size + = be32_to_cpu(p->entry.cis_idi.data_size); + p->attr.block_size = be16_to_cpu(p->attr.block_size); + p->attr.number_of_blocks = be16_to_cpu(p->attr.number_of_blocks); + p->attr.number_of_effective_blocks + = be16_to_cpu(p->attr.number_of_effective_blocks); + p->attr.page_size = be16_to_cpu(p->attr.page_size); + p->attr.memory_manufacturer_code + = be16_to_cpu(p->attr.memory_manufacturer_code); + p->attr.memory_device_code = be16_to_cpu(p->attr.memory_device_code); + p->attr.implemented_capacity + = be16_to_cpu(p->attr.implemented_capacity); + p->attr.controller_number = be16_to_cpu(p->attr.controller_number); + p->attr.controller_function = be16_to_cpu(p->attr.controller_function); +} + +static int msb_read_boot_blocks(struct msb_data *msb) +{ + int pba = 0; + struct scatterlist sg; + struct ms_extra_data_register extra; + struct ms_boot_page *page; + + msb->boot_block_locations[0] = MS_BLOCK_INVALID; + msb->boot_block_locations[1] = MS_BLOCK_INVALID; + msb->boot_block_count = 0; + + dbg_verbose("Start of a scan for the boot blocks"); + + if (!msb->boot_page) { + page = kmalloc(sizeof(struct ms_boot_page)*2, GFP_KERNEL); + if (!page) + return -ENOMEM; + + msb->boot_page = page; + } else + page = msb->boot_page; + + msb->block_count = MS_BLOCK_MAX_BOOT_ADDR; + + for (pba = 0; pba < MS_BLOCK_MAX_BOOT_ADDR; pba++) { + + sg_init_one(&sg, page, sizeof(*page)); + if (msb_read_page(msb, pba, 0, &extra, &sg, 0)) { + dbg("boot scan: can't read pba %d", pba); + continue; + } + + if (extra.management_flag & MEMSTICK_MANAGEMENT_SYSFLG) { + dbg("managment flag doesn't indicate boot block %d", + pba); + continue; + } + + if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) { + dbg("the pba at %d doesn' contain boot block ID", pba); + continue; + } + + msb_fix_boot_page_endianness(page); + msb->boot_block_locations[msb->boot_block_count] = pba; + + page++; + msb->boot_block_count++; + + if (msb->boot_block_count == 2) + break; + } + + if 
(!msb->boot_block_count) { + pr_err("media doesn't contain master page, aborting"); + return -EIO; + } + + dbg_verbose("End of scan for boot blocks"); + return 0; +} + +static int msb_read_bad_block_table(struct msb_data *msb, int block_nr) +{ + struct ms_boot_page *boot_block; + struct scatterlist sg; + u16 *buffer = NULL; + int offset = 0; + int i, error = 0; + int data_size, data_offset, page, page_offset, size_to_read; + u16 pba; + + BUG_ON(block_nr > 1); + boot_block = &msb->boot_page[block_nr]; + pba = msb->boot_block_locations[block_nr]; + + if (msb->boot_block_locations[block_nr] == MS_BLOCK_INVALID) + return -EINVAL; + + data_size = boot_block->entry.disabled_block.data_size; + data_offset = sizeof(struct ms_boot_page) + + boot_block->entry.disabled_block.start_addr; + if (!data_size) + return 0; + + page = data_offset / msb->page_size; + page_offset = data_offset % msb->page_size; + size_to_read = + DIV_ROUND_UP(data_size + page_offset, msb->page_size) * + msb->page_size; + + dbg("reading bad block of boot block at pba %d, offset %d len %d", + pba, data_offset, data_size); + + buffer = kzalloc(size_to_read, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + /* Read the buffer */ + sg_init_one(&sg, buffer, size_to_read); + + while (offset < size_to_read) { + error = msb_read_page(msb, pba, page, NULL, &sg, offset); + if (error) + goto out; + + page++; + offset += msb->page_size; + + if (page == msb->pages_in_block) { + pr_err( + "bad block table extends beyond the boot block"); + break; + } + } + + /* Process the bad block table */ + for (i = page_offset; i < data_size / sizeof(u16); i++) { + + u16 bad_block = be16_to_cpu(buffer[i]); + + if (bad_block >= msb->block_count) { + dbg("bad block table contains invalid block %d", + bad_block); + continue; + } + + if (test_bit(bad_block, msb->used_blocks_bitmap)) { + dbg("duplicate bad block %d in the table", + bad_block); + continue; + } + + dbg("block %d is marked as factory bad", bad_block); + 
msb_mark_block_used(msb, bad_block); + } +out: + kfree(buffer); + return error; +} + +static int msb_ftl_initialize(struct msb_data *msb) +{ + int i; + + if (msb->ftl_initialized) + return 0; + + msb->zone_count = msb->block_count / MS_BLOCKS_IN_ZONE; + msb->logical_block_count = msb->zone_count * 496 - 2; + + msb->used_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->erased_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->lba_to_pba_table = + kmalloc(msb->logical_block_count * sizeof(u16), GFP_KERNEL); + + if (!msb->used_blocks_bitmap || !msb->lba_to_pba_table || + !msb->erased_blocks_bitmap) { + kfree(msb->used_blocks_bitmap); + kfree(msb->lba_to_pba_table); + kfree(msb->erased_blocks_bitmap); + return -ENOMEM; + } + + for (i = 0; i < msb->zone_count; i++) + msb->free_block_count[i] = MS_BLOCKS_IN_ZONE; + + memset(msb->lba_to_pba_table, MS_BLOCK_INVALID, + msb->logical_block_count * sizeof(u16)); + + dbg("initial FTL tables created. Zone count = %d, Logical block count = %d", + msb->zone_count, msb->logical_block_count); + + msb->ftl_initialized = true; + return 0; +} + +static int msb_ftl_scan(struct msb_data *msb) +{ + u16 pba, lba, other_block; + u8 overwrite_flag, managment_flag, other_overwrite_flag; + int error; + struct ms_extra_data_register extra; + u8 *overwrite_flags = kzalloc(msb->block_count, GFP_KERNEL); + + if (!overwrite_flags) + return -ENOMEM; + + dbg("Start of media scanning"); + for (pba = 0; pba < msb->block_count; pba++) { + + if (pba == msb->boot_block_locations[0] || + pba == msb->boot_block_locations[1]) { + dbg_verbose("pba %05d -> [boot block]", pba); + msb_mark_block_used(msb, pba); + continue; + } + + if (test_bit(pba, msb->used_blocks_bitmap)) { + dbg_verbose("pba %05d -> [factory bad]", pba); + continue; + } + + memset(&extra, 0, sizeof(extra)); + error = msb_read_oob(msb, pba, 0, &extra); + + /* can't trust the page if we can't read the oob */ + if (error == -EBADMSG) { + pr_notice( + "oob of pba 
%d damaged, will try to erase it", pba); + msb_mark_block_used(msb, pba); + msb_erase_block(msb, pba); + continue; + } else if (error) { + pr_err("unknown error %d on read of oob of pba %d - aborting", + error, pba); + + kfree(overwrite_flags); + return error; + } + + lba = be16_to_cpu(extra.logical_address); + managment_flag = extra.management_flag; + overwrite_flag = extra.overwrite_flag; + overwrite_flags[pba] = overwrite_flag; + + /* Skip bad blocks */ + if (!(overwrite_flag & MEMSTICK_OVERWRITE_BKST)) { + dbg("pba %05d -> [BAD]", pba); + msb_mark_block_used(msb, pba); + continue; + } + + /* Skip system/drm blocks */ + if ((managment_flag & MEMSTICK_MANAGMENT_FLAG_NORMAL) != + MEMSTICK_MANAGMENT_FLAG_NORMAL) { + dbg("pba %05d -> [reserved managment flag %02x]", + pba, managment_flag); + msb_mark_block_used(msb, pba); + continue; + } + + /* Erase temporary tables */ + if (!(managment_flag & MEMSTICK_MANAGEMENT_ATFLG)) { + dbg("pba %05d -> [temp table] - will erase", pba); + + msb_mark_block_used(msb, pba); + msb_erase_block(msb, pba); + continue; + } + + if (lba == MS_BLOCK_INVALID) { + dbg_verbose("pba %05d -> [free]", pba); + continue; + } + + msb_mark_block_used(msb, pba); + + /* Block has LBA not according to zoning*/ + if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) { + pr_notice("pba %05d -> [bad lba %05d] - will erase", + pba, lba); + msb_erase_block(msb, pba); + continue; + } + + /* No collisions - great */ + if (msb->lba_to_pba_table[lba] == MS_BLOCK_INVALID) { + dbg_verbose("pba %05d -> [lba %05d]", pba, lba); + msb->lba_to_pba_table[lba] = pba; + continue; + } + + other_block = msb->lba_to_pba_table[lba]; + other_overwrite_flag = overwrite_flags[other_block]; + + pr_notice("Collision between pba %d and pba %d", + pba, other_block); + + if (!(overwrite_flag & MEMSTICK_OVERWRITE_UDST)) { + pr_notice("pba %d is marked as stable, use it", pba); + msb_erase_block(msb, other_block); + msb->lba_to_pba_table[lba] = pba; + continue; + } + + if 
(!(other_overwrite_flag & MEMSTICK_OVERWRITE_UDST)) { + pr_notice("pba %d is marked as stable, use it", + other_block); + msb_erase_block(msb, pba); + continue; + } + + pr_notice("collision between blocks %d and %d, without stable flag set on both, erasing pba %d", + pba, other_block, other_block); + + msb_erase_block(msb, other_block); + msb->lba_to_pba_table[lba] = pba; + } + + dbg("End of media scanning"); + kfree(overwrite_flags); + return 0; +} + +static void msb_cache_flush_timer(unsigned long data) +{ + struct msb_data *msb = (struct msb_data *)data; + msb->need_flush_cache = true; + queue_work(msb->io_queue, &msb->io_work); +} + + +static void msb_cache_discard(struct msb_data *msb) +{ + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return; + + del_timer_sync(&msb->cache_flush_timer); + + dbg_verbose("Discarding the write cache"); + msb->cache_block_lba = MS_BLOCK_INVALID; + bitmap_zero(&msb->valid_cache_bitmap, msb->pages_in_block); +} + +static int msb_cache_init(struct msb_data *msb) +{ + setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer, + (unsigned long)msb); + + if (!msb->cache) + msb->cache = kzalloc(msb->block_size, GFP_KERNEL); + if (!msb->cache) + return -ENOMEM; + + msb_cache_discard(msb); + return 0; +} + +static int msb_cache_flush(struct msb_data *msb) +{ + struct scatterlist sg; + struct ms_extra_data_register extra; + int page, offset, error; + u16 pba, lba; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return 0; + + lba = msb->cache_block_lba; + pba = msb->lba_to_pba_table[lba]; + + dbg_verbose("Flushing the write cache of pba %d (LBA %d)", + pba, msb->cache_block_lba); + + sg_init_one(&sg, msb->cache , msb->block_size); + + /* Read all missing pages in cache */ + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + offset = page * msb->page_size; + + dbg_verbose("reading non-present sector %d of cache block %d", + 
page, lba); + error = msb_read_page(msb, pba, page, &extra, &sg, offset); + + /* Bad pages are copied with 00 page status */ + if (error == -EBADMSG) { + pr_err("read error on sector %d, contents probably damaged", page); + continue; + } + + if (error) + return error; + + if ((extra.overwrite_flag & MEMSTICK_OV_PG_NORMAL) != + MEMSTICK_OV_PG_NORMAL) { + dbg("page %d is marked as bad", page); + continue; + } + + set_bit(page, &msb->valid_cache_bitmap); + } + + /* Write the cache now */ + error = msb_update_block(msb, msb->cache_block_lba, &sg, 0); + pba = msb->lba_to_pba_table[msb->cache_block_lba]; + + /* Mark invalid pages */ + if (!error) { + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + dbg("marking page %d as containing damaged data", + page); + msb_set_overwrite_flag(msb, + pba , page, 0xFF & ~MEMSTICK_OV_PG_NORMAL); + } + } + + msb_cache_discard(msb); + return error; +} + +static int msb_cache_write(struct msb_data *msb, int lba, + int page, bool add_to_cache_only, struct scatterlist *sg, int offset) +{ + int error; + struct scatterlist sg_tmp[10]; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID || + lba != msb->cache_block_lba) + if (add_to_cache_only) + return 0; + + /* If we need to write different block */ + if (msb->cache_block_lba != MS_BLOCK_INVALID && + lba != msb->cache_block_lba) { + dbg_verbose("first flush the cache"); + error = msb_cache_flush(msb); + if (error) + return error; + } + + if (msb->cache_block_lba == MS_BLOCK_INVALID) { + msb->cache_block_lba = lba; + mod_timer(&msb->cache_flush_timer, + jiffies + msecs_to_jiffies(cache_flush_timeout)); + } + + dbg_verbose("Write of LBA %d page %d to cache ", lba, page); + + sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp)); + msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp), offset, msb->page_size); + + sg_copy_to_buffer(sg_tmp, sg_nents(sg_tmp), + msb->cache + page * msb->page_size, msb->page_size); 
+ + set_bit(page, &msb->valid_cache_bitmap); + return 0; +} + +static int msb_cache_read(struct msb_data *msb, int lba, + int page, struct scatterlist *sg, int offset) +{ + int pba = msb->lba_to_pba_table[lba]; + struct scatterlist sg_tmp[10]; + int error = 0; + + if (lba == msb->cache_block_lba && + test_bit(page, &msb->valid_cache_bitmap)) { + + dbg_verbose("Read of LBA %d (pba %d) sector %d from cache", + lba, pba, page); + + sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp)); + msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp), + offset, msb->page_size); + sg_copy_from_buffer(sg_tmp, sg_nents(sg_tmp), + msb->cache + msb->page_size * page, + msb->page_size); + } else { + dbg_verbose("Read of LBA %d (pba %d) sector %d from device", + lba, pba, page); + + error = msb_read_page(msb, pba, page, NULL, sg, offset); + if (error) + return error; + + msb_cache_write(msb, lba, page, true, sg, offset); + } + return error; +} + +/* Emulated geometry table + * This table content isn't that importaint, + * One could put here different values, providing that they still + * cover whole disk. 
+ * 64 MB entry is what windows reports for my 64M memstick */ + +static const struct chs_entry chs_table[] = { +/* size sectors cylynders heads */ + { 4, 16, 247, 2 }, + { 8, 16, 495, 2 }, + { 16, 16, 495, 4 }, + { 32, 16, 991, 4 }, + { 64, 16, 991, 8 }, + {128, 16, 991, 16 }, + { 0 } +}; + +/* Load information about the card */ +static int msb_init_card(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_host *host = card->host; + struct ms_boot_page *boot_block; + int error = 0, i, raw_size_in_megs; + + msb->caps = 0; + + if (card->id.class >= MEMSTICK_CLASS_ROM && + card->id.class <= MEMSTICK_CLASS_ROM) + msb->read_only = true; + + msb->state = -1; + error = msb_reset(msb, false); + if (error) + return error; + + /* Due to a bug in Jmicron driver written by Alex Dubov, + its serial mode barely works, + so we switch to parallel mode right away */ + if (host->caps & MEMSTICK_CAP_PAR4) + msb_switch_to_parallel(msb); + + msb->page_size = sizeof(struct ms_boot_page); + + /* Read the boot page */ + error = msb_read_boot_blocks(msb); + if (error) + return -EIO; + + boot_block = &msb->boot_page[0]; + + /* Save intersting attributes from boot page */ + msb->block_count = boot_block->attr.number_of_blocks; + msb->page_size = boot_block->attr.page_size; + + msb->pages_in_block = boot_block->attr.block_size * 2; + msb->block_size = msb->page_size * msb->pages_in_block; + + if (msb->page_size > PAGE_SIZE) { + /* this isn't supported by linux at all, anyway*/ + dbg("device page %d size isn't supported", msb->page_size); + return -EINVAL; + } + + msb->block_buffer = kzalloc(msb->block_size, GFP_KERNEL); + if (!msb->block_buffer) + return -ENOMEM; + + raw_size_in_megs = (msb->block_size * msb->block_count) >> 20; + + for (i = 0; chs_table[i].size; i++) { + + if (chs_table[i].size != raw_size_in_megs) + continue; + + msb->geometry.cylinders = chs_table[i].cyl; + msb->geometry.heads = chs_table[i].head; + msb->geometry.sectors = 
chs_table[i].sec; + break; + } + + if (boot_block->attr.transfer_supporting == 1) + msb->caps |= MEMSTICK_CAP_PAR4; + + if (boot_block->attr.device_type & 0x03) + msb->read_only = true; + + dbg("Total block count = %d", msb->block_count); + dbg("Each block consists of %d pages", msb->pages_in_block); + dbg("Page size = %d bytes", msb->page_size); + dbg("Parallel mode supported: %d", !!(msb->caps & MEMSTICK_CAP_PAR4)); + dbg("Read only: %d", msb->read_only); + +#if 0 + /* Now we can switch the interface */ + if (host->caps & msb->caps & MEMSTICK_CAP_PAR4) + msb_switch_to_parallel(msb); +#endif + + error = msb_cache_init(msb); + if (error) + return error; + + error = msb_ftl_initialize(msb); + if (error) + return error; + + + /* Read the bad block table */ + error = msb_read_bad_block_table(msb, 0); + + if (error && error != -ENOMEM) { + dbg("failed to read bad block table from primary boot block, trying from backup"); + error = msb_read_bad_block_table(msb, 1); + } + + if (error) + return error; + + /* *drum roll* Scan the media */ + error = msb_ftl_scan(msb); + if (error) { + pr_err("Scan of media failed"); + return error; + } + + return 0; + +} + +static int msb_do_write_request(struct msb_data *msb, int lba, + int page, struct scatterlist *sg, size_t len, int *sucessfuly_written) +{ + int error = 0; + off_t offset = 0; + *sucessfuly_written = 0; + + while (offset < len) { + if (page == 0 && len - offset >= msb->block_size) { + + if (msb->cache_block_lba == lba) + msb_cache_discard(msb); + + dbg_verbose("Writing whole lba %d", lba); + error = msb_update_block(msb, lba, sg, offset); + if (error) + return error; + + offset += msb->block_size; + *sucessfuly_written += msb->block_size; + lba++; + continue; + } + + error = msb_cache_write(msb, lba, page, false, sg, offset); + if (error) + return error; + + offset += msb->page_size; + *sucessfuly_written += msb->page_size; + + page++; + if (page == msb->pages_in_block) { + page = 0; + lba++; + } + } + return 0; +} + 
+static int msb_do_read_request(struct msb_data *msb, int lba, + int page, struct scatterlist *sg, int len, int *sucessfuly_read) +{ + int error = 0; + int offset = 0; + *sucessfuly_read = 0; + + while (offset < len) { + + error = msb_cache_read(msb, lba, page, sg, offset); + if (error) + return error; + + offset += msb->page_size; + *sucessfuly_read += msb->page_size; + + page++; + if (page == msb->pages_in_block) { + page = 0; + lba++; + } + } + return 0; +} + +static void msb_io_work(struct work_struct *work) +{ + struct msb_data *msb = container_of(work, struct msb_data, io_work); + int page, error, len; + sector_t lba; + unsigned long flags; + struct scatterlist *sg = msb->prealloc_sg; + + dbg_verbose("IO: work started"); + + while (1) { + spin_lock_irqsave(&msb->q_lock, flags); + + if (msb->need_flush_cache) { + msb->need_flush_cache = false; + spin_unlock_irqrestore(&msb->q_lock, flags); + msb_cache_flush(msb); + continue; + } + + if (!msb->req) { + msb->req = blk_fetch_request(msb->queue); + if (!msb->req) { + dbg_verbose("IO: no more requests exiting"); + spin_unlock_irqrestore(&msb->q_lock, flags); + return; + } + } + + spin_unlock_irqrestore(&msb->q_lock, flags); + + /* If card was removed meanwhile */ + if (!msb->req) + return; + + /* process the request */ + dbg_verbose("IO: processing new request"); + blk_rq_map_sg(msb->queue, msb->req, sg); + + lba = blk_rq_pos(msb->req); + + sector_div(lba, msb->page_size / 512); + page = do_div(lba, msb->pages_in_block); + + if (rq_data_dir(msb->req) == READ) + error = msb_do_read_request(msb, lba, page, sg, + blk_rq_bytes(msb->req), &len); + else + error = msb_do_write_request(msb, lba, page, sg, + blk_rq_bytes(msb->req), &len); + + spin_lock_irqsave(&msb->q_lock, flags); + + if (len) + if (!__blk_end_request(msb->req, 0, len)) + msb->req = NULL; + + if (error && msb->req) { + dbg_verbose("IO: ending one sector of the request with error"); + if (!__blk_end_request(msb->req, error, msb->page_size)) + msb->req = 
NULL; + } + + if (msb->req) + dbg_verbose("IO: request still pending"); + + spin_unlock_irqrestore(&msb->q_lock, flags); + } +} + +static DEFINE_IDR(msb_disk_idr); /*set of used disk numbers */ +static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */ + +static int msb_bd_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct msb_data *msb = disk->private_data; + + dbg_verbose("block device open"); + + mutex_lock(&msb_disk_lock); + + if (msb && msb->card) + msb->usage_count++; + + mutex_unlock(&msb_disk_lock); + return 0; +} + +static void msb_data_clear(struct msb_data *msb) +{ + kfree(msb->boot_page); + kfree(msb->used_blocks_bitmap); + kfree(msb->lba_to_pba_table); + kfree(msb->cache); + msb->card = NULL; +} + +static int msb_disk_release(struct gendisk *disk) +{ + struct msb_data *msb = disk->private_data; + + dbg_verbose("block device release"); + mutex_lock(&msb_disk_lock); + + if (msb) { + if (msb->usage_count) + msb->usage_count--; + + if (!msb->usage_count) { + disk->private_data = NULL; + idr_remove(&msb_disk_idr, msb->disk_id); + put_disk(disk); + kfree(msb); + } + } + mutex_unlock(&msb_disk_lock); + return 0; +} + +static void msb_bd_release(struct gendisk *disk, fmode_t mode) +{ + msb_disk_release(disk); +} + +static int msb_bd_getgeo(struct block_device *bdev, + struct hd_geometry *geo) +{ + struct msb_data *msb = bdev->bd_disk->private_data; + *geo = msb->geometry; + return 0; +} + +static int msb_prepare_req(struct request_queue *q, struct request *req) +{ + if (req->cmd_type != REQ_TYPE_FS && + req->cmd_type != REQ_TYPE_BLOCK_PC) { + blk_dump_rq_flags(req, "MS unsupported request"); + return BLKPREP_KILL; + } + req->cmd_flags |= REQ_DONTPREP; + return BLKPREP_OK; +} + +static void msb_submit_req(struct request_queue *q) +{ + struct memstick_dev *card = q->queuedata; + struct msb_data *msb = memstick_get_drvdata(card); + struct request *req = NULL; + + dbg_verbose("Submit request"); 
+ + if (msb->card_dead) { + dbg("Refusing requests on removed card"); + + WARN_ON(!msb->io_queue_stopped); + + while ((req = blk_fetch_request(q)) != NULL) + __blk_end_request_all(req, -ENODEV); + return; + } + + if (msb->req) + return; + + if (!msb->io_queue_stopped) + queue_work(msb->io_queue, &msb->io_work); +} + +static int msb_check_card(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + return (msb->card_dead == 0); +} + +static void msb_stop(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + unsigned long flags; + + dbg("Stopping all msblock IO"); + + spin_lock_irqsave(&msb->q_lock, flags); + blk_stop_queue(msb->queue); + msb->io_queue_stopped = true; + spin_unlock_irqrestore(&msb->q_lock, flags); + + del_timer_sync(&msb->cache_flush_timer); + flush_workqueue(msb->io_queue); + + if (msb->req) { + spin_lock_irqsave(&msb->q_lock, flags); + blk_requeue_request(msb->queue, msb->req); + msb->req = NULL; + spin_unlock_irqrestore(&msb->q_lock, flags); + } + +} + +static void msb_start(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + unsigned long flags; + + dbg("Resuming IO from msblock"); + + msb_invalidate_reg_window(msb); + + spin_lock_irqsave(&msb->q_lock, flags); + if (!msb->io_queue_stopped || msb->card_dead) { + spin_unlock_irqrestore(&msb->q_lock, flags); + return; + } + spin_unlock_irqrestore(&msb->q_lock, flags); + + /* Kick cache flush anyway, its harmless */ + msb->need_flush_cache = true; + msb->io_queue_stopped = false; + + spin_lock_irqsave(&msb->q_lock, flags); + blk_start_queue(msb->queue); + spin_unlock_irqrestore(&msb->q_lock, flags); + + queue_work(msb->io_queue, &msb->io_work); + +} + +static const struct block_device_operations msb_bdops = { + .open = msb_bd_open, + .release = msb_bd_release, + .getgeo = msb_bd_getgeo, + .owner = THIS_MODULE +}; + +/* Registers the block device */ +static int msb_init_disk(struct memstick_dev *card) +{ + 
struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_host *host = card->host; + int rc; + u64 limit = BLK_BOUNCE_HIGH; + unsigned long capacity; + + if (host->dev.dma_mask && *(host->dev.dma_mask)) + limit = *(host->dev.dma_mask); + + mutex_lock(&msb_disk_lock); + msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL); + mutex_unlock(&msb_disk_lock); + + if (msb->disk_id < 0) + return msb->disk_id; + + msb->disk = alloc_disk(0); + if (!msb->disk) { + rc = -ENOMEM; + goto out_release_id; + } + + msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); + if (!msb->queue) { + rc = -ENOMEM; + goto out_put_disk; + } + + msb->queue->queuedata = card; + blk_queue_prep_rq(msb->queue, msb_prepare_req); + + blk_queue_bounce_limit(msb->queue, limit); + blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); + blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); + blk_queue_max_segment_size(msb->queue, + MS_BLOCK_MAX_PAGES * msb->page_size); + blk_queue_logical_block_size(msb->queue, msb->page_size); + + sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); + msb->disk->fops = &msb_bdops; + msb->disk->private_data = msb; + msb->disk->queue = msb->queue; + msb->disk->driverfs_dev = &card->dev; + msb->disk->flags |= GENHD_FL_EXT_DEVT; + + capacity = msb->pages_in_block * msb->logical_block_count; + capacity *= (msb->page_size / 512); + set_capacity(msb->disk, capacity); + dbg("Set total disk size to %lu sectors", capacity); + + msb->usage_count = 1; + msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM); + INIT_WORK(&msb->io_work, msb_io_work); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb->read_only) + set_disk_ro(msb->disk, 1); + + msb_start(card); + add_disk(msb->disk); + dbg("Disk added"); + return 0; + +out_put_disk: + put_disk(msb->disk); +out_release_id: + mutex_lock(&msb_disk_lock); + idr_remove(&msb_disk_idr, msb->disk_id); + mutex_unlock(&msb_disk_lock); + return rc; +} + +static int 
msb_probe(struct memstick_dev *card) +{ + struct msb_data *msb; + int rc = 0; + + msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!msb) + return -ENOMEM; + memstick_set_drvdata(card, msb); + msb->card = card; + spin_lock_init(&msb->q_lock); + + rc = msb_init_card(card); + if (rc) + goto out_free; + + rc = msb_init_disk(card); + if (!rc) { + card->check = msb_check_card; + card->stop = msb_stop; + card->start = msb_start; + return 0; + } +out_free: + memstick_set_drvdata(card, NULL); + msb_data_clear(msb); + kfree(msb); + return rc; +} + +static void msb_remove(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + unsigned long flags; + + if (!msb->io_queue_stopped) + msb_stop(card); + + dbg("Removing the disk device"); + + /* Take care of unhandled + new requests from now on */ + spin_lock_irqsave(&msb->q_lock, flags); + msb->card_dead = true; + blk_start_queue(msb->queue); + spin_unlock_irqrestore(&msb->q_lock, flags); + + /* Remove the disk */ + del_gendisk(msb->disk); + blk_cleanup_queue(msb->queue); + msb->queue = NULL; + + mutex_lock(&msb_disk_lock); + msb_data_clear(msb); + mutex_unlock(&msb_disk_lock); + + msb_disk_release(msb->disk); + memstick_set_drvdata(card, NULL); +} + +#ifdef CONFIG_PM + +static int msb_suspend(struct memstick_dev *card, pm_message_t state) +{ + msb_stop(card); + return 0; +} + +static int msb_resume(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct msb_data *new_msb = NULL; + bool card_dead = true; + +#ifndef CONFIG_MEMSTICK_UNSAFE_RESUME + msb->card_dead = true; + return 0; +#endif + mutex_lock(&card->host->lock); + + new_msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!new_msb) + goto out; + + new_msb->card = card; + memstick_set_drvdata(card, new_msb); + spin_lock_init(&new_msb->q_lock); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb_init_card(card)) + goto out; + + if (msb->block_size != new_msb->block_size) + goto 
out; + + if (memcmp(msb->boot_page, new_msb->boot_page, + sizeof(struct ms_boot_page))) + goto out; + + if (msb->logical_block_count != new_msb->logical_block_count || + memcmp(msb->lba_to_pba_table, new_msb->lba_to_pba_table, + msb->logical_block_count)) + goto out; + + if (msb->block_count != new_msb->block_count || + memcmp(msb->used_blocks_bitmap, new_msb->used_blocks_bitmap, + msb->block_count / 8)) + goto out; + + card_dead = false; +out: + if (card_dead) + dbg("Card was removed/replaced during suspend"); + + msb->card_dead = card_dead; + memstick_set_drvdata(card, msb); + + if (new_msb) { + msb_data_clear(new_msb); + kfree(new_msb); + } + + msb_start(card); + mutex_unlock(&card->host->lock); + return 0; +} +#else + +#define msb_suspend NULL +#define msb_resume NULL + +#endif /* CONFIG_PM */ + +static struct memstick_device_id msb_id_tbl[] = { + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_FLASH}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_ROM}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_RO}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_WP}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_DUO, MEMSTICK_CATEGORY_STORAGE_DUO, + MEMSTICK_CLASS_DUO}, + {} +}; +MODULE_DEVICE_TABLE(memstick, msb_id_tbl); + + +static struct memstick_driver msb_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE + }, + .id_table = msb_id_tbl, + .probe = msb_probe, + .remove = msb_remove, + .suspend = msb_suspend, + .resume = msb_resume +}; + +static int major; + +static int __init msb_init(void) +{ + int rc = register_blkdev(0, DRIVER_NAME); + + if (rc < 0) { + pr_err("failed to register major (error %d)\n", rc); + return rc; + } + + major = rc; + rc = memstick_register_driver(&msb_driver); + if (rc) { + unregister_blkdev(major, DRIVER_NAME); + pr_err("failed to register memstick driver (error 
%d)\n", rc); + } + + return rc; +} + +static void __exit msb_exit(void) +{ + memstick_unregister_driver(&msb_driver); + unregister_blkdev(major, DRIVER_NAME); + idr_destroy(&msb_disk_idr); +} + +module_init(msb_init); +module_exit(msb_exit); + +module_param(cache_flush_timeout, int, S_IRUGO); +MODULE_PARM_DESC(cache_flush_timeout, + "Cache flush timeout in msec (1000 default)"); +module_param(debug, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug, "Debug level (0-2)"); + +module_param(verify_writes, bool, S_IRUGO); +MODULE_PARM_DESC(verify_writes, "Read back and check all data that is written"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Levitsky"); +MODULE_DESCRIPTION("Sony MemoryStick block device driver"); diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h new file mode 100644 index 000000000000..96e637550988 --- /dev/null +++ b/drivers/memstick/core/ms_block.h @@ -0,0 +1,290 @@ +/* + * ms_block.h - Sony MemoryStick (legacy) storage support + + * Copyright (C) 2013 Maxim Levitsky <maximlevitsky@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Minor portions of the driver are copied from mspro_block.c which is + * Copyright (C) 2007 Alex Dubov <oakad@yahoo.com> + * + * Also ms structures were copied from old broken driver by same author + * These probably come from MS spec + * + */ + +#ifndef MS_BLOCK_NEW_H +#define MS_BLOCK_NEW_H + +#define MS_BLOCK_MAX_SEGS 32 +#define MS_BLOCK_MAX_PAGES ((2 << 16) - 1) + +#define MS_BLOCK_MAX_BOOT_ADDR 0x000c +#define MS_BLOCK_BOOT_ID 0x0001 +#define MS_BLOCK_INVALID 0xffff +#define MS_MAX_ZONES 16 +#define MS_BLOCKS_IN_ZONE 512 + +#define MS_BLOCK_MAP_LINE_SZ 16 +#define MS_BLOCK_PART_SHIFT 3 + + +#define MEMSTICK_UNCORR_ERROR (MEMSTICK_STATUS1_UCFG | \ + MEMSTICK_STATUS1_UCEX | MEMSTICK_STATUS1_UCDT) + +#define MEMSTICK_CORR_ERROR (MEMSTICK_STATUS1_FGER | MEMSTICK_STATUS1_EXER | \ + MEMSTICK_STATUS1_DTER) + +#define MEMSTICK_INT_ERROR (MEMSTICK_INT_CMDNAK | MEMSTICK_INT_ERR) + +#define MEMSTICK_OVERWRITE_FLAG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | \ + MEMSTICK_OVERWRITE_PGST0 | \ + MEMSTICK_OVERWRITE_BKST) + +#define MEMSTICK_OV_PG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | MEMSTICK_OVERWRITE_PGST0) + +#define MEMSTICK_MANAGMENT_FLAG_NORMAL \ + (MEMSTICK_MANAGEMENT_SYSFLG | \ + MEMSTICK_MANAGEMENT_SCMS1 | \ + MEMSTICK_MANAGEMENT_SCMS0) \ + +struct ms_boot_header { + unsigned short block_id; + unsigned short format_reserved; + unsigned char reserved0[184]; + unsigned char data_entry; + unsigned char reserved1[179]; +} __packed; + + +struct ms_system_item { + unsigned int start_addr; + unsigned int data_size; + unsigned char data_type_id; + unsigned char reserved[3]; +} __packed; + +struct ms_system_entry { + struct ms_system_item disabled_block; + struct ms_system_item cis_idi; + unsigned char reserved[24]; +} __packed; + +struct ms_boot_attr_info { + unsigned char memorystick_class; + unsigned char format_unique_value1; + unsigned short block_size; + unsigned short number_of_blocks; + unsigned short number_of_effective_blocks; + unsigned short page_size; + 
unsigned char extra_data_size; + unsigned char format_unique_value2; + unsigned char assembly_time[8]; + unsigned char format_unique_value3; + unsigned char serial_number[3]; + unsigned char assembly_manufacturer_code; + unsigned char assembly_model_code[3]; + unsigned short memory_manufacturer_code; + unsigned short memory_device_code; + unsigned short implemented_capacity; + unsigned char format_unique_value4[2]; + unsigned char vcc; + unsigned char vpp; + unsigned short controller_number; + unsigned short controller_function; + unsigned char reserved0[9]; + unsigned char transfer_supporting; + unsigned short format_unique_value5; + unsigned char format_type; + unsigned char memorystick_application; + unsigned char device_type; + unsigned char reserved1[22]; + unsigned char format_uniqure_value6[2]; + unsigned char reserved2[15]; +} __packed; + +struct ms_cis_idi { + unsigned short general_config; + unsigned short logical_cylinders; + unsigned short reserved0; + unsigned short logical_heads; + unsigned short track_size; + unsigned short page_size; + unsigned short pages_per_track; + unsigned short msw; + unsigned short lsw; + unsigned short reserved1; + unsigned char serial_number[20]; + unsigned short buffer_type; + unsigned short buffer_size_increments; + unsigned short long_command_ecc; + unsigned char firmware_version[28]; + unsigned char model_name[18]; + unsigned short reserved2[5]; + unsigned short pio_mode_number; + unsigned short dma_mode_number; + unsigned short field_validity; + unsigned short current_logical_cylinders; + unsigned short current_logical_heads; + unsigned short current_pages_per_track; + unsigned int current_page_capacity; + unsigned short mutiple_page_setting; + unsigned int addressable_pages; + unsigned short single_word_dma; + unsigned short multi_word_dma; + unsigned char reserved3[128]; +} __packed; + + +struct ms_boot_page { + struct ms_boot_header header; + struct ms_system_entry entry; + struct ms_boot_attr_info attr; +} 
__packed; + +struct msb_data { + unsigned int usage_count; + struct memstick_dev *card; + struct gendisk *disk; + struct request_queue *queue; + spinlock_t q_lock; + struct hd_geometry geometry; + struct attribute_group attr_group; + struct request *req; + int caps; + int disk_id; + + /* IO */ + struct workqueue_struct *io_queue; + bool io_queue_stopped; + struct work_struct io_work; + bool card_dead; + + /* Media properties */ + struct ms_boot_page *boot_page; + u16 boot_block_locations[2]; + int boot_block_count; + + bool read_only; + unsigned short page_size; + int block_size; + int pages_in_block; + int zone_count; + int block_count; + int logical_block_count; + + /* FTL tables */ + unsigned long *used_blocks_bitmap; + unsigned long *erased_blocks_bitmap; + u16 *lba_to_pba_table; + int free_block_count[MS_MAX_ZONES]; + bool ftl_initialized; + + /* Cache */ + unsigned char *cache; + unsigned long valid_cache_bitmap; + int cache_block_lba; + bool need_flush_cache; + struct timer_list cache_flush_timer; + + /* Preallocated buffers */ + unsigned char *block_buffer; + struct scatterlist prealloc_sg[MS_BLOCK_MAX_SEGS+1]; + + + /* handler's local data */ + struct ms_register_addr reg_addr; + bool addr_valid; + + u8 command_value; + bool command_need_oob; + struct scatterlist *current_sg; + int current_sg_offset; + + struct ms_register regs; + int current_page; + + int state; + int exit_error; + bool int_polling; + unsigned long int_timeout; + +}; + +enum msb_readpage_states { + MSB_RP_SEND_BLOCK_ADDRESS = 0, + MSB_RP_SEND_READ_COMMAND, + + MSB_RP_SEND_INT_REQ, + MSB_RP_RECEIVE_INT_REQ_RESULT, + + MSB_RP_SEND_READ_STATUS_REG, + MSB_RP_RECIVE_STATUS_REG, + + MSB_RP_SEND_OOB_READ, + MSB_RP_RECEIVE_OOB_READ, + + MSB_RP_SEND_READ_DATA, + MSB_RP_RECEIVE_READ_DATA, +}; + +enum msb_write_block_states { + MSB_WB_SEND_WRITE_PARAMS = 0, + MSB_WB_SEND_WRITE_OOB, + MSB_WB_SEND_WRITE_COMMAND, + + MSB_WB_SEND_INT_REQ, + MSB_WB_RECEIVE_INT_REQ, + + MSB_WB_SEND_WRITE_DATA, + 
MSB_WB_RECEIVE_WRITE_CONFIRMATION, +}; + +enum msb_send_command_states { + MSB_SC_SEND_WRITE_PARAMS, + MSB_SC_SEND_WRITE_OOB, + MSB_SC_SEND_COMMAND, + + MSB_SC_SEND_INT_REQ, + MSB_SC_RECEIVE_INT_REQ, + +}; + +enum msb_reset_states { + MSB_RS_SEND, + MSB_RS_CONFIRM, +}; + +enum msb_par_switch_states { + MSB_PS_SEND_SWITCH_COMMAND, + MSB_PS_SWICH_HOST, + MSB_PS_CONFIRM, +}; + +struct chs_entry { + unsigned long size; + unsigned char sec; + unsigned short cyl; + unsigned char head; +}; + +static int msb_reset(struct msb_data *msb, bool full); + +static int h_msb_default_bad(struct memstick_dev *card, + struct memstick_request **mrq); + +#define __dbg(level, format, ...) \ + do { \ + if (debug >= level) \ + pr_err(format "\n", ## __VA_ARGS__); \ + } while (0) + + +#define dbg(format, ...) __dbg(1, format, ## __VA_ARGS__) +#define dbg_verbose(format, ...) __dbg(2, format, ## __VA_ARGS__) + +#endif diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c index cf8bd727dfc7..25f8f93decb6 100644 --- a/drivers/memstick/host/rtsx_pci_ms.c +++ b/drivers/memstick/host/rtsx_pci_ms.c @@ -612,8 +612,6 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev) memstick_remove_host(msh); memstick_free_host(msh); - platform_set_drvdata(pdev, NULL); - dev_dbg(&(pdev->dev), ": Realtek PCI-E Memstick controller has been removed\n"); diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index f74bfcbb7bad..8eea2efbbb6d 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -393,17 +393,21 @@ static void gmux_notify_handler(acpi_handle device, u32 value, void *context) complete(&gmux_data->powerchange_done); } -static int gmux_suspend(struct pnp_dev *pnp, pm_message_t state) +static int gmux_suspend(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_data->resume_client_id = gmux_active_client(gmux_data); 
gmux_disable_interrupts(gmux_data); return 0; } -static int gmux_resume(struct pnp_dev *pnp) +static int gmux_resume(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_enable_interrupts(gmux_data); gmux_switchto(gmux_data->resume_client_id); if (gmux_data->power_state == VGA_SWITCHEROO_OFF) @@ -605,13 +609,19 @@ static const struct pnp_device_id gmux_device_ids[] = { {"", 0} }; +static const struct dev_pm_ops gmux_dev_pm_ops = { + .suspend = gmux_suspend, + .resume = gmux_resume, +}; + static struct pnp_driver gmux_pnp_driver = { .name = "apple-gmux", .probe = gmux_probe, .remove = gmux_remove, .id_table = gmux_device_ids, - .suspend = gmux_suspend, - .resume = gmux_resume + .driver = { + .pm = &gmux_dev_pm_ops, + }, }; static int __init apple_gmux_init(void) diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 12adb43a0693..a39ee38a9414 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -163,6 +163,13 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state) if (!pnp_drv) return 0; + if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) { + error = pnp_drv->driver.pm->suspend(dev); + suspend_report_result(pnp_drv->driver.pm->suspend, error); + if (error) + return error; + } + if (pnp_drv->suspend) { error = pnp_drv->suspend(pnp_dev, state); if (error) @@ -211,6 +218,12 @@ static int pnp_bus_resume(struct device *dev) return error; } + if (pnp_drv->driver.pm && pnp_drv->driver.pm->resume) { + error = pnp_drv->driver.pm->resume(dev); + if (error) + return error; + } + if (pnp_drv->resume) { error = pnp_drv->resume(pnp_dev); if (error) diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c index eae0eda9ff39..9966124ad988 100644 --- a/drivers/pps/clients/pps-gpio.c +++ b/drivers/pps/clients/pps-gpio.c @@ -184,7 +184,6 @@ static int pps_gpio_remove(struct platform_device *pdev) { struct pps_gpio_device_data *data = 
platform_get_drvdata(pdev); - platform_set_drvdata(pdev, NULL); pps_unregister_source(data->pps); dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq); return 0; diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9e3498bf302b..9654aa3c05cb 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1249,6 +1249,15 @@ config RTC_DRV_SIRFSOC Say "yes" here to support the real time clock on SiRF SOC chips. This driver can also be built as a module called rtc-sirfsoc. +config RTC_DRV_MOXART + tristate "MOXA ART RTC" + help + If you say yes here you get support for the MOXA ART + RTC module. + + This driver can also be built as a module. If so, the module + will be called rtc-moxart + comment "HID Sensor RTC drivers" config RTC_DRV_HID_SENSOR_TIME diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index d3b4488f48f2..2dff3d2009b5 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -130,3 +130,4 @@ obj-$(CONFIG_RTC_DRV_WM831X) += rtc-wm831x.o obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o +obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index be06d7150de5..24e733c98f8b 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1018,23 +1018,6 @@ static void __exit cmos_pnp_remove(struct pnp_dev *pnp) cmos_do_remove(&pnp->dev); } -#ifdef CONFIG_PM - -static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg) -{ - return cmos_suspend(&pnp->dev); -} - -static int cmos_pnp_resume(struct pnp_dev *pnp) -{ - return cmos_resume(&pnp->dev); -} - -#else -#define cmos_pnp_suspend NULL -#define cmos_pnp_resume NULL -#endif - static void cmos_pnp_shutdown(struct pnp_dev *pnp) { if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev)) @@ -1060,8 +1043,11 @@ static struct pnp_driver cmos_pnp_driver = { /* flag ensures resume() gets called, and stops syslog spam */ .flags 
= PNP_DRIVER_RES_DO_NOT_CHANGE, - .suspend = cmos_pnp_suspend, - .resume = cmos_pnp_resume, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &cmos_pm_ops, + }, +#endif }; #endif /* CONFIG_PNP */ diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 308a8fefe76f..bc7b4fcf603c 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -89,7 +89,6 @@ enum ds1511reg { struct rtc_plat_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ - int size; /* amount of memory mapped */ int irq; unsigned int irqen; int alrm_sec; @@ -479,20 +478,14 @@ static int ds1511_rtc_probe(struct platform_device *pdev) struct rtc_plat_data *pdata; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ds1511_base = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ds1511_base) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ds1511_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ds1511_base)) + return PTR_ERR(ds1511_base); pdata->ioaddr = ds1511_base; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 8c6c952e90b1..fd31571941f5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -285,19 +285,14 @@ static int ds1553_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - 
return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index eccdc62ae1c0..17b73fdc3b6e 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -52,11 +52,9 @@ #define RTC_BATT_FLAG 0x80 struct rtc_plat_data { - struct rtc_device *rtc; void __iomem *ioaddr_nvram; void __iomem *ioaddr_rtc; size_t size_nvram; - size_t size; unsigned long last_jiffies; struct bin_attribute nvram_attr; }; @@ -117,11 +115,7 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm) /* year is 1900 + tm->tm_year */ tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; - if (rtc_valid_tm(tm) < 0) { - dev_err(dev, "retrieved date/time is not valid.\n"); - rtc_time_to_tm(0, tm); - } - return 0; + return rtc_valid_tm(tm); } static const struct rtc_class_ops ds1742_rtc_ops = { @@ -168,22 +162,17 @@ static int ds1742_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr_nvram = ioaddr; - pdata->size_nvram = pdata->size - RTC_SIZE; + pdata->size_nvram = resource_size(res) - RTC_SIZE; pdata->ioaddr_rtc = ioaddr + pdata->size_nvram; sysfs_bin_attr_init(&pdata->nvram_attr); @@ -212,7 +201,6 @@ static int ds1742_rtc_probe(struct 
platform_device *pdev) &ds1742_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); - pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 549b3c3792d2..580e7b56bde8 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -138,17 +138,9 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - ep93xx_rtc->mmio_base = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!ep93xx_rtc->mmio_base) - return -ENXIO; + ep93xx_rtc->mmio_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ep93xx_rtc->mmio_base)) + return PTR_ERR(ep93xx_rtc->mmio_base); pdev->dev.platform_data = ep93xx_rtc; platform_set_drvdata(pdev, ep93xx_rtc); diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 7273b0139e5c..4e2a81854f51 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -23,10 +23,6 @@ #include <linux/iio/iio.h> #include <linux/rtc.h> -/* Format: HID-SENSOR-usage_id_in_hex */ -/* Usage ID from spec for Time: 0x2000A0 */ -#define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */ - enum hid_time_channel { CHANNEL_SCAN_INDEX_YEAR, CHANNEL_SCAN_INDEX_MONTH, @@ -283,9 +279,11 @@ static int hid_time_probe(struct platform_device *pdev) "hid-sensor-time", &hid_time_rtc_ops, THIS_MODULE); - if (IS_ERR(time_state->rtc)) { + if (IS_ERR_OR_NULL(time_state->rtc)) { + ret = time_state->rtc ? 
PTR_ERR(time_state->rtc) : -ENODEV; + time_state->rtc = NULL; + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); dev_err(&pdev->dev, "rtc device register failed!\n"); - return PTR_ERR(time_state->rtc); } return ret; @@ -300,9 +298,19 @@ static int hid_time_remove(struct platform_device *pdev) return 0; } +static struct platform_device_id hid_time_ids[] = { + { + /* Format: HID-SENSOR-usage_id_in_hex_lowercase */ + .name = "HID-SENSOR-2000a0", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, hid_time_ids); + static struct platform_driver hid_time_platform_driver = { + .id_table = hid_time_ids, .driver = { - .name = DRIVER_NAME, + .name = KBUILD_MODNAME, .owner = THIS_MODULE, }, .probe = hid_time_probe, diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index d3a8c8e255de..abd7f9091f34 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -375,24 +375,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) struct imxdi_dev *imxdi; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL); if (!imxdi) return -ENOMEM; imxdi->pdev = pdev; - if (!devm_request_mem_region(&pdev->dev, res->start, resource_size(res), - pdev->name)) - return -EBUSY; - - imxdi->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (imxdi->ioaddr == NULL) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + imxdi->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(imxdi->ioaddr)) + return PTR_ERR(imxdi->ioaddr); spin_lock_init(&imxdi->irq_lock); diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 8276ae94a2a9..bfdbcb82d069 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -201,16 +201,9 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) { struct resource *res; struct lpc32xx_rtc *rtc; - resource_size_t size; int rtcirq; u32 
tmp; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "Can't get memory resource\n"); - return -ENOENT; - } - rtcirq = platform_get_irq(pdev, 0); if (rtcirq < 0 || rtcirq >= NR_IRQS) { dev_warn(&pdev->dev, "Can't get interrupt resource\n"); @@ -224,19 +217,10 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) } rtc->irq = rtcirq; - size = resource_size(res); - - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) { - dev_err(&pdev->dev, "RTC registers are not free\n"); - return -EBUSY; - } - - rtc->rtc_base = devm_ioremap(&pdev->dev, res->start, size); - if (!rtc->rtc_base) { - dev_err(&pdev->dev, "Can't map memory\n"); - return -ENOMEM; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rtc->rtc_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rtc->rtc_base)) + return PTR_ERR(rtc->rtc_base); spin_lock_init(&rtc->lock); diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 9915cb96014b..9efe118a28ba 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -240,9 +240,9 @@ static int max77686_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) } alrm->pending = 0; - ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS1, &val); + ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val); if (ret < 0) { - dev_err(info->dev, "%s:%d fail to read status1 reg(%d)\n", + dev_err(info->dev, "%s:%d fail to read status2 reg(%d)\n", __func__, __LINE__, ret); goto out; } diff --git a/drivers/rtc/rtc-moxart.c b/drivers/rtc/rtc-moxart.c new file mode 100644 index 000000000000..c29dee0946e6 --- /dev/null +++ b/drivers/rtc/rtc-moxart.c @@ -0,0 +1,330 @@ +/* + * MOXA ART RTC driver. + * + * Copyright (C) 2013 Jonas Jensen + * + * Jonas Jensen <jonas.jensen@gmail.com> + * + * Based on code from + * Moxa Technology Co., Ltd. 
<www.moxa.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/rtc.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/gpio.h> +#include <linux/of_gpio.h> + +#define GPIO_RTC_RESERVED 0x0C +#define GPIO_RTC_DATA_SET 0x10 +#define GPIO_RTC_DATA_CLEAR 0x14 +#define GPIO_RTC_PIN_PULL_ENABLE 0x18 +#define GPIO_RTC_PIN_PULL_TYPE 0x1C +#define GPIO_RTC_INT_ENABLE 0x20 +#define GPIO_RTC_INT_RAW_STATE 0x24 +#define GPIO_RTC_INT_MASKED_STATE 0x28 +#define GPIO_RTC_INT_MASK 0x2C +#define GPIO_RTC_INT_CLEAR 0x30 +#define GPIO_RTC_INT_TRIGGER 0x34 +#define GPIO_RTC_INT_BOTH 0x38 +#define GPIO_RTC_INT_RISE_NEG 0x3C +#define GPIO_RTC_BOUNCE_ENABLE 0x40 +#define GPIO_RTC_BOUNCE_PRE_SCALE 0x44 +#define GPIO_RTC_PROTECT_W 0x8E +#define GPIO_RTC_PROTECT_R 0x8F +#define GPIO_RTC_YEAR_W 0x8C +#define GPIO_RTC_YEAR_R 0x8D +#define GPIO_RTC_DAY_W 0x8A +#define GPIO_RTC_DAY_R 0x8B +#define GPIO_RTC_MONTH_W 0x88 +#define GPIO_RTC_MONTH_R 0x89 +#define GPIO_RTC_DATE_W 0x86 +#define GPIO_RTC_DATE_R 0x87 +#define GPIO_RTC_HOURS_W 0x84 +#define GPIO_RTC_HOURS_R 0x85 +#define GPIO_RTC_MINUTES_W 0x82 +#define GPIO_RTC_MINUTES_R 0x83 +#define GPIO_RTC_SECONDS_W 0x80 +#define GPIO_RTC_SECONDS_R 0x81 +#define GPIO_RTC_DELAY_TIME 8 + +struct moxart_rtc { + struct rtc_device *rtc; + spinlock_t rtc_lock; + int gpio_data, gpio_sclk, gpio_reset; +}; + +static int day_of_year[12] = { 0, 31, 59, 90, 120, 151, 181, + 212, 243, 273, 304, 334 }; + +static void moxart_rtc_write_byte(struct device *dev, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + + for (i = 0; i < 8; i++, data >>= 1) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_data, ((data & 1) == 1)); + 
udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + } +} + +static u8 moxart_rtc_read_byte(struct device *dev) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + u8 data = 0; + + for (i = 0; i < 8; i++) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + if (gpio_get_value(moxart_rtc->gpio_data)) + data |= (1 << i); + udelay(GPIO_RTC_DELAY_TIME); + } + return data; +} + +static u8 moxart_rtc_read_register(struct device *dev, u8 cmd) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + u8 data; + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + gpio_direction_input(moxart_rtc->gpio_data); + udelay(GPIO_RTC_DELAY_TIME); + data = moxart_rtc_read_byte(dev); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); + + return data; +} + +static void moxart_rtc_write_register(struct device *dev, u8 cmd, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + moxart_rtc_write_byte(dev, data); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); +} + +static int moxart_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + + spin_lock_irq(&moxart_rtc->rtc_lock); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0); + moxart_rtc_write_register(dev, 
GPIO_RTC_YEAR_W, + (((tm->tm_year - 100) / 10) << 4) | + ((tm->tm_year - 100) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MONTH_W, + (((tm->tm_mon + 1) / 10) << 4) | + ((tm->tm_mon + 1) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_DATE_W, + ((tm->tm_mday / 10) << 4) | + (tm->tm_mday % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_HOURS_W, + ((tm->tm_hour / 10) << 4) | + (tm->tm_hour % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MINUTES_W, + ((tm->tm_min / 10) << 4) | + (tm->tm_min % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_SECONDS_W, + ((tm->tm_sec / 10) << 4) | + (tm->tm_sec % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0x80); + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + dev_dbg(dev, "%s: success tm_year=%d tm_mon=%d\n" + "tm_mday=%d tm_hour=%d tm_min=%d tm_sec=%d\n", + __func__, tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + + return 0; +} + +static int moxart_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned char v; + + spin_lock_irq(&moxart_rtc->rtc_lock); + + v = moxart_rtc_read_register(dev, GPIO_RTC_SECONDS_R); + tm->tm_sec = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MINUTES_R); + tm->tm_min = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_HOURS_R); + if (v & 0x80) { /* 12-hour mode */ + tm->tm_hour = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + if (v & 0x20) { /* PM mode */ + tm->tm_hour += 12; + if (tm->tm_hour >= 24) + tm->tm_hour = 0; + } + } else { /* 24-hour mode */ + tm->tm_hour = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + } + + v = moxart_rtc_read_register(dev, GPIO_RTC_DATE_R); + tm->tm_mday = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MONTH_R); + tm->tm_mon = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + tm->tm_mon--; + + v = moxart_rtc_read_register(dev, GPIO_RTC_YEAR_R); + 
tm->tm_year = (((v & 0xF0) >> 4) * 10) + (v & 0x0F); + tm->tm_year += 100; + if (tm->tm_year <= 69) + tm->tm_year += 100; + + v = moxart_rtc_read_register(dev, GPIO_RTC_DAY_R); + tm->tm_wday = (v & 0x0f) - 1; + tm->tm_yday = day_of_year[tm->tm_mon]; + tm->tm_yday += (tm->tm_mday - 1); + if (tm->tm_mon >= 2) { + if (!(tm->tm_year % 4) && (tm->tm_year % 100)) + tm->tm_yday++; + } + + tm->tm_isdst = 0; + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + return 0; +} + +static const struct rtc_class_ops moxart_rtc_ops = { + .read_time = moxart_rtc_read_time, + .set_time = moxart_rtc_set_time, +}; + +static int moxart_rtc_probe(struct platform_device *pdev) +{ + struct moxart_rtc *moxart_rtc; + int ret = 0; + + moxart_rtc = devm_kzalloc(&pdev->dev, sizeof(*moxart_rtc), GFP_KERNEL); + if (!moxart_rtc) { + dev_err(&pdev->dev, "devm_kzalloc failed\n"); + return -ENOMEM; + } + + moxart_rtc->gpio_data = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-data", 0); + if (!gpio_is_valid(moxart_rtc->gpio_data)) { + dev_err(&pdev->dev, "invalid gpio (data): %d\n", + moxart_rtc->gpio_data); + return moxart_rtc->gpio_data; + } + + moxart_rtc->gpio_sclk = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-sclk", 0); + if (!gpio_is_valid(moxart_rtc->gpio_sclk)) { + dev_err(&pdev->dev, "invalid gpio (sclk): %d\n", + moxart_rtc->gpio_sclk); + return moxart_rtc->gpio_sclk; + } + + moxart_rtc->gpio_reset = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-reset", 0); + if (!gpio_is_valid(moxart_rtc->gpio_reset)) { + dev_err(&pdev->dev, "invalid gpio (reset): %d\n", + moxart_rtc->gpio_reset); + return moxart_rtc->gpio_reset; + } + + spin_lock_init(&moxart_rtc->rtc_lock); + platform_set_drvdata(pdev, moxart_rtc); + + ret = devm_gpio_request(&pdev->dev, moxart_rtc->gpio_data, "rtc_data"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_data gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_sclk, + GPIOF_DIR_OUT, "rtc_sclk"); + if (ret) { + 
dev_err(&pdev->dev, "can't get rtc_sclk gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_reset, + GPIOF_DIR_OUT, "rtc_reset"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_reset gpio\n"); + return ret; + } + + moxart_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &moxart_rtc_ops, + THIS_MODULE); + if (IS_ERR(moxart_rtc->rtc)) { + dev_err(&pdev->dev, "devm_rtc_device_register failed\n"); + return PTR_ERR(moxart_rtc->rtc); + } + + return 0; +} + +static const struct of_device_id moxart_rtc_match[] = { + { .compatible = "moxa,moxart-rtc" }, + { }, +}; + +static struct platform_driver moxart_rtc_driver = { + .probe = moxart_rtc_probe, + .driver = { + .name = "moxart-rtc", + .owner = THIS_MODULE, + .of_match_table = moxart_rtc_match, + }, +}; +module_platform_driver(moxart_rtc_driver); + +MODULE_DESCRIPTION("MOXART RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jonas Jensen <jonas.jensen@gmail.com>"); diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index baab802f2153..d536c5962c99 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -221,26 +221,17 @@ static int __init mv_rtc_probe(struct platform_device *pdev) { struct resource *res; struct rtc_plat_data *pdata; - resource_size_t size; u32 rtc_time; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, size); - if (!pdata->ioaddr) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); /* Not all SoCs require a clock.*/ diff --git 
a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index ab87bacb8f88..50c572645546 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -377,22 +377,16 @@ static int mxc_rtc_probe(struct platform_device *pdev) unsigned long rate; int ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; pdata->devtype = pdev->id_entry->driver_data; - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pdata->clk)) { diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index 22861c5e0c59..248653c74b80 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -99,7 +99,7 @@ static int *check_rtc_access_enable(struct nuc900_rtc *nuc900_rtc) if (!timeout) return ERR_PTR(-EPERM); - return 0; + return NULL; } static int nuc900_rtc_bcd2bin(unsigned int timereg, diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index c6ffbaec32a4..c7d97ee59327 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -70,6 +70,8 @@ #define OMAP_RTC_KICK0_REG 0x6c #define OMAP_RTC_KICK1_REG 0x70 +#define OMAP_RTC_IRQWAKEEN 0x7c + /* OMAP_RTC_CTRL_REG bit fields: */ #define OMAP_RTC_CTRL_SPLIT (1<<7) #define OMAP_RTC_CTRL_DISABLE (1<<6) @@ -94,12 +96,21 @@ #define OMAP_RTC_INTERRUPTS_IT_ALARM (1<<3) #define OMAP_RTC_INTERRUPTS_IT_TIMER (1<<2) +/* OMAP_RTC_IRQWAKEEN bit fields: */ +#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN (1<<1) + /* OMAP_RTC_KICKER values */ #define KICK0_VALUE 0x83e70b13 #define KICK1_VALUE 0x95a4f1e0 #define OMAP_RTC_HAS_KICKER 0x1 +/* + * Few RTC 
IP revisions has special WAKE-EN Register to enable Wakeup + * generation for event Alarm. + */ +#define OMAP_RTC_HAS_IRQWAKEEN 0x2 + static void __iomem *rtc_base; #define rtc_read(addr) readb(rtc_base + (addr)) @@ -299,12 +310,18 @@ static struct rtc_class_ops omap_rtc_ops = { static int omap_rtc_alarm; static int omap_rtc_timer; -#define OMAP_RTC_DATA_DA830_IDX 1 +#define OMAP_RTC_DATA_AM3352_IDX 1 +#define OMAP_RTC_DATA_DA830_IDX 2 static struct platform_device_id omap_rtc_devtype[] = { { .name = DRIVER_NAME, - }, { + }, + [OMAP_RTC_DATA_AM3352_IDX] = { + .name = "am3352-rtc", + .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN, + }, + [OMAP_RTC_DATA_DA830_IDX] = { .name = "da830-rtc", .driver_data = OMAP_RTC_HAS_KICKER, }, @@ -316,6 +333,9 @@ static const struct of_device_id omap_rtc_of_match[] = { { .compatible = "ti,da830-rtc", .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], }, + { .compatible = "ti,am3352-rtc", + .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX], + }, {}, }; MODULE_DEVICE_TABLE(of, omap_rtc_of_match); @@ -464,16 +484,28 @@ static u8 irqstat; static int omap_rtc_suspend(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); /* FIXME the RTC alarm is not currently acting as a wakeup event - * source, and in fact this enable() call is just saving a flag - * that's never used... + * source on some platforms, and in fact this enable() call is just + * saving a flag that's never used... 
*/ - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { enable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat |= OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(0, OMAP_RTC_INTERRUPTS_REG); + } /* Disable the clock/module */ pm_runtime_put_sync(dev); @@ -483,13 +515,25 @@ static int omap_rtc_suspend(struct device *dev) static int omap_rtc_resume(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + /* Enable the clock/module so that we can access the registers */ pm_runtime_get_sync(dev); - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { disable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); + } return 0; } #endif diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c index a1fecc8d97fc..fffb7d3449d7 100644 --- a/drivers/rtc/rtc-palmas.c +++ b/drivers/rtc/rtc-palmas.c @@ -238,6 +238,15 @@ static int palmas_rtc_probe(struct platform_device *pdev) struct palmas *palmas = dev_get_drvdata(pdev->dev.parent); struct palmas_rtc *palmas_rtc = NULL; int ret; + bool enable_bb_charging = false; + bool high_bb_charging; + + if (pdev->dev.of_node) { + enable_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-chargeable"); + high_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-charge-high-current"); + } palmas_rtc = devm_kzalloc(&pdev->dev, sizeof(struct palmas_rtc), GFP_KERNEL); @@ -254,6 +263,32 @@ static int palmas_rtc_probe(struct platform_device *pdev) 
palmas_rtc->dev = &pdev->dev; platform_set_drvdata(pdev, palmas_rtc); + if (enable_bb_charging) { + unsigned reg = PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG; + + if (high_bb_charging) + reg = 0; + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG, reg); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + } + /* Start RTC */ ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG, PALMAS_RTC_CTRL_REG_STOP_RTC, diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 205b9f7da1b8..1ee514a3972c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -203,11 +203,6 @@ static int pcf2127_probe(struct i2c_client *client, return 0; } -static int pcf2127_remove(struct i2c_client *client) -{ - return 0; -} - static const struct i2c_device_id pcf2127_id[] = { { "pcf2127", 0 }, { } @@ -229,7 +224,6 @@ static struct i2c_driver pcf2127_driver = { .of_match_table = of_match_ptr(pcf2127_of_match), }, .probe = pcf2127_probe, - .remove = pcf2127_remove, .id_table = pcf2127_id, }; diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index aa7ed4b5f7f0..63460cf80f1b 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -44,6 +44,7 @@ struct sirfsoc_rtc_drv { struct rtc_device *rtc; u32 rtc_base; u32 irq; + unsigned irq_wake; /* Overflow for every 8 years extra time */ u32 overflow_rtc; #ifdef CONFIG_PM @@ -355,8 +356,8 @@ static int sirfsoc_rtc_suspend(struct device *dev) rtcdrv->saved_counter = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); rtcdrv->saved_overflow_rtc = 
rtcdrv->overflow_rtc; - if (device_may_wakeup(&pdev->dev)) - enable_irq_wake(rtcdrv->irq); + if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq)) + rtcdrv->irq_wake = 1; return 0; } @@ -423,8 +424,10 @@ static int sirfsoc_rtc_resume(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); sirfsoc_rtc_thaw(dev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } @@ -434,8 +437,10 @@ static int sirfsoc_rtc_restore(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index af5e97e3f272..a176ba614683 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -294,19 +294,14 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index f9a0677e4e3b..4f87234e0dee 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -244,9 
+244,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) struct resource *res; int irq, ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; irq = platform_get_irq(pdev, 0); if (irq < 0) return -ENODEV; @@ -255,13 +252,10 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, pdata); - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - pdata->rtcreg = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!pdata->rtcreg) - return -EBUSY; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->rtcreg = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->rtcreg)) + return PTR_ERR(pdata->rtcreg); spin_lock_init(&pdata->lock); tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 9e5e14686e75..794820a123d0 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -30,8 +30,8 @@ #define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x) -#define TO_USER 0 -#define TO_KERNEL 1 +#define TO_USER 1 +#define TO_KERNEL 0 #define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ enum arch_id { @@ -73,7 +73,7 @@ static struct ipl_parameter_block *ipl_block; * @count: Size of buffer, which should be copied * @mode: Either TO_KERNEL or TO_USER */ -static int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) { int offs, blk_num; static char buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index 6488a7351a60..7e8346ec9cdc 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -38,14 +38,6 @@ #include "acornfb.h" /* - * VIDC machines can't do 16 or 32BPP modes. 
- */ -#ifdef HAS_VIDC -#undef FBCON_HAS_CFB16 -#undef FBCON_HAS_CFB32 -#endif - -/* * Default resolution. * NOTE that it has to be supported in the table towards * the end of this file. @@ -106,238 +98,6 @@ static struct vidc_timing current_vidc; extern unsigned int vram_size; /* set by setup.c */ -#ifdef HAS_VIDC - -#define MAX_SIZE 480*1024 - -/* CTL VIDC Actual - * 24.000 0 8.000 - * 25.175 0 8.392 - * 36.000 0 12.000 - * 24.000 1 12.000 - * 25.175 1 12.588 - * 24.000 2 16.000 - * 25.175 2 16.783 - * 36.000 1 18.000 - * 24.000 3 24.000 - * 36.000 2 24.000 - * 25.175 3 25.175 - * 36.000 3 36.000 - */ -struct pixclock { - u_long min_clock; - u_long max_clock; - u_int vidc_ctl; - u_int vid_ctl; -}; - -static struct pixclock arc_clocks[] = { - /* we allow +/-1% on these */ - { 123750, 126250, VIDC_CTRL_DIV3, VID_CTL_24MHz }, /* 8.000MHz */ - { 82500, 84167, VIDC_CTRL_DIV2, VID_CTL_24MHz }, /* 12.000MHz */ - { 61875, 63125, VIDC_CTRL_DIV1_5, VID_CTL_24MHz }, /* 16.000MHz */ - { 41250, 42083, VIDC_CTRL_DIV1, VID_CTL_24MHz }, /* 24.000MHz */ -}; - -static struct pixclock * -acornfb_valid_pixrate(struct fb_var_screeninfo *var) -{ - u_long pixclock = var->pixclock; - u_int i; - - if (!var->pixclock) - return NULL; - - for (i = 0; i < ARRAY_SIZE(arc_clocks); i++) - if (pixclock > arc_clocks[i].min_clock && - pixclock < arc_clocks[i].max_clock) - return arc_clocks + i; - - return NULL; -} - -/* VIDC Rules: - * hcr : must be even (interlace, hcr/2 must be even) - * hswr : must be even - * hdsr : must be odd - * hder : must be odd - * - * vcr : must be odd - * vswr : >= 1 - * vdsr : >= 1 - * vder : >= vdsr - * if interlaced, then hcr/2 must be even - */ -static void -acornfb_set_timing(struct fb_var_screeninfo *var) -{ - struct pixclock *pclk; - struct vidc_timing vidc; - u_int horiz_correction; - u_int sync_len, display_start, display_end, cycle; - u_int is_interlaced; - u_int vid_ctl, vidc_ctl; - u_int bandwidth; - - memset(&vidc, 0, sizeof(vidc)); - - pclk = 
acornfb_valid_pixrate(var); - vidc_ctl = pclk->vidc_ctl; - vid_ctl = pclk->vid_ctl; - - bandwidth = var->pixclock * 8 / var->bits_per_pixel; - /* 25.175, 4bpp = 79.444ns per byte, 317.776ns per word: fifo = 2,6 */ - if (bandwidth > 143500) - vidc_ctl |= VIDC_CTRL_FIFO_3_7; - else if (bandwidth > 71750) - vidc_ctl |= VIDC_CTRL_FIFO_2_6; - else if (bandwidth > 35875) - vidc_ctl |= VIDC_CTRL_FIFO_1_5; - else - vidc_ctl |= VIDC_CTRL_FIFO_0_4; - - switch (var->bits_per_pixel) { - case 1: - horiz_correction = 19; - vidc_ctl |= VIDC_CTRL_1BPP; - break; - - case 2: - horiz_correction = 11; - vidc_ctl |= VIDC_CTRL_2BPP; - break; - - case 4: - horiz_correction = 7; - vidc_ctl |= VIDC_CTRL_4BPP; - break; - - default: - case 8: - horiz_correction = 5; - vidc_ctl |= VIDC_CTRL_8BPP; - break; - } - - if (var->sync & FB_SYNC_COMP_HIGH_ACT) /* should be FB_SYNC_COMP */ - vidc_ctl |= VIDC_CTRL_CSYNC; - else { - if (!(var->sync & FB_SYNC_HOR_HIGH_ACT)) - vid_ctl |= VID_CTL_HS_NHSYNC; - - if (!(var->sync & FB_SYNC_VERT_HIGH_ACT)) - vid_ctl |= VID_CTL_VS_NVSYNC; - } - - sync_len = var->hsync_len; - display_start = sync_len + var->left_margin; - display_end = display_start + var->xres; - cycle = display_end + var->right_margin; - - /* if interlaced, then hcr/2 must be even */ - is_interlaced = (var->vmode & FB_VMODE_MASK) == FB_VMODE_INTERLACED; - - if (is_interlaced) { - vidc_ctl |= VIDC_CTRL_INTERLACE; - if (cycle & 2) { - cycle += 2; - var->right_margin += 2; - } - } - - vidc.h_cycle = (cycle - 2) / 2; - vidc.h_sync_width = (sync_len - 2) / 2; - vidc.h_border_start = (display_start - 1) / 2; - vidc.h_display_start = (display_start - horiz_correction) / 2; - vidc.h_display_end = (display_end - horiz_correction) / 2; - vidc.h_border_end = (display_end - 1) / 2; - vidc.h_interlace = (vidc.h_cycle + 1) / 2; - - sync_len = var->vsync_len; - display_start = sync_len + var->upper_margin; - display_end = display_start + var->yres; - cycle = display_end + var->lower_margin; - - if 
(is_interlaced) - cycle = (cycle - 3) / 2; - else - cycle = cycle - 1; - - vidc.v_cycle = cycle; - vidc.v_sync_width = sync_len - 1; - vidc.v_border_start = display_start - 1; - vidc.v_display_start = vidc.v_border_start; - vidc.v_display_end = display_end - 1; - vidc.v_border_end = vidc.v_display_end; - - if (machine_is_a5k()) - __raw_writeb(vid_ctl, IOEB_VID_CTL); - - if (memcmp(&current_vidc, &vidc, sizeof(vidc))) { - current_vidc = vidc; - - vidc_writel(0xe0000000 | vidc_ctl); - vidc_writel(0x80000000 | (vidc.h_cycle << 14)); - vidc_writel(0x84000000 | (vidc.h_sync_width << 14)); - vidc_writel(0x88000000 | (vidc.h_border_start << 14)); - vidc_writel(0x8c000000 | (vidc.h_display_start << 14)); - vidc_writel(0x90000000 | (vidc.h_display_end << 14)); - vidc_writel(0x94000000 | (vidc.h_border_end << 14)); - vidc_writel(0x98000000); - vidc_writel(0x9c000000 | (vidc.h_interlace << 14)); - vidc_writel(0xa0000000 | (vidc.v_cycle << 14)); - vidc_writel(0xa4000000 | (vidc.v_sync_width << 14)); - vidc_writel(0xa8000000 | (vidc.v_border_start << 14)); - vidc_writel(0xac000000 | (vidc.v_display_start << 14)); - vidc_writel(0xb0000000 | (vidc.v_display_end << 14)); - vidc_writel(0xb4000000 | (vidc.v_border_end << 14)); - vidc_writel(0xb8000000); - vidc_writel(0xbc000000); - } -#ifdef DEBUG_MODE_SELECTION - printk(KERN_DEBUG "VIDC registers for %dx%dx%d:\n", var->xres, - var->yres, var->bits_per_pixel); - printk(KERN_DEBUG " H-cycle : %d\n", vidc.h_cycle); - printk(KERN_DEBUG " H-sync-width : %d\n", vidc.h_sync_width); - printk(KERN_DEBUG " H-border-start : %d\n", vidc.h_border_start); - printk(KERN_DEBUG " H-display-start : %d\n", vidc.h_display_start); - printk(KERN_DEBUG " H-display-end : %d\n", vidc.h_display_end); - printk(KERN_DEBUG " H-border-end : %d\n", vidc.h_border_end); - printk(KERN_DEBUG " H-interlace : %d\n", vidc.h_interlace); - printk(KERN_DEBUG " V-cycle : %d\n", vidc.v_cycle); - printk(KERN_DEBUG " V-sync-width : %d\n", vidc.v_sync_width); - printk(KERN_DEBUG "
V-border-start : %d\n", vidc.v_border_start); - printk(KERN_DEBUG " V-display-start : %d\n", vidc.v_display_start); - printk(KERN_DEBUG " V-display-end : %d\n", vidc.v_display_end); - printk(KERN_DEBUG " V-border-end : %d\n", vidc.v_border_end); - printk(KERN_DEBUG " VIDC Ctrl (E) : 0x%08X\n", vidc_ctl); - printk(KERN_DEBUG " IOEB Ctrl : 0x%08X\n", vid_ctl); -#endif -} - -static int -acornfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, - u_int trans, struct fb_info *info) -{ - union palette pal; - - if (regno >= current_par.palette_size) - return 1; - - pal.p = 0; - pal.vidc.reg = regno; - pal.vidc.red = red >> 12; - pal.vidc.green = green >> 12; - pal.vidc.blue = blue >> 12; - - current_par.palette[regno] = pal; - - vidc_writel(pal.p); - - return 0; -} -#endif - #ifdef HAS_VIDC20 #include <mach/acornfb.h> @@ -634,16 +394,7 @@ acornfb_adjust_timing(struct fb_info *info, struct fb_var_screeninfo *var, u_int /* hsync_len must be even */ var->hsync_len = (var->hsync_len + 1) & ~1; -#ifdef HAS_VIDC - /* left_margin must be odd */ - if ((var->left_margin & 1) == 0) { - var->left_margin -= 1; - var->right_margin += 1; - } - - /* right_margin must be odd */ - var->right_margin |= 1; -#elif defined(HAS_VIDC20) +#if defined(HAS_VIDC20) /* left_margin must be even */ if (var->left_margin & 1) { var->left_margin += 1; @@ -787,11 +538,7 @@ static int acornfb_set_par(struct fb_info *info) break; case 8: current_par.palette_size = VIDC_PALETTE_SIZE; -#ifdef HAS_VIDC - info->fix.visual = FB_VISUAL_STATIC_PSEUDOCOLOR; -#else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; -#endif break; #ifdef HAS_VIDC20 case 16: @@ -971,9 +718,6 @@ static void acornfb_init_fbinfo(void) #if defined(HAS_VIDC20) fb_info.var.red.length = 8; fb_info.var.transp.length = 4; -#elif defined(HAS_VIDC) - fb_info.var.red.length = 4; - fb_info.var.transp.length = 1; #endif fb_info.var.green = fb_info.var.red; fb_info.var.blue = fb_info.var.red; @@ -1310,14 +1054,6 @@ static int acornfb_probe(struct 
platform_device *dev) fb_info.fix.smem_start = handle; } #endif -#if defined(HAS_VIDC) - /* - * Archimedes/A5000 machines use a fixed address for their - * framebuffers. Free unused pages - */ - free_unused_pages(PAGE_OFFSET + size, PAGE_OFFSET + MAX_SIZE); -#endif - fb_info.fix.smem_len = size; current_par.palette_size = VIDC_PALETTE_SIZE; diff --git a/drivers/video/acornfb.h b/drivers/video/acornfb.h index fb2a7fffe506..175c8ff3367c 100644 --- a/drivers/video/acornfb.h +++ b/drivers/video/acornfb.h @@ -13,10 +13,6 @@ #include <asm/hardware/iomd.h> #define VIDC_PALETTE_SIZE 256 #define VIDC_NAME "VIDC20" -#elif defined(HAS_VIDC) -#include <asm/hardware/memc.h> -#define VIDC_PALETTE_SIZE 16 -#define VIDC_NAME "VIDC" #endif #define EXTEND8(x) ((x)|(x)<<8) @@ -101,31 +97,6 @@ struct modex_params { const struct modey_params *modey; }; -#ifdef HAS_VIDC - -#define VID_CTL_VS_NVSYNC (1 << 3) -#define VID_CTL_HS_NHSYNC (1 << 2) -#define VID_CTL_24MHz (0) -#define VID_CTL_25MHz (1) -#define VID_CTL_36MHz (2) - -#define VIDC_CTRL_CSYNC (1 << 7) -#define VIDC_CTRL_INTERLACE (1 << 6) -#define VIDC_CTRL_FIFO_0_4 (0 << 4) -#define VIDC_CTRL_FIFO_1_5 (1 << 4) -#define VIDC_CTRL_FIFO_2_6 (2 << 4) -#define VIDC_CTRL_FIFO_3_7 (3 << 4) -#define VIDC_CTRL_1BPP (0 << 2) -#define VIDC_CTRL_2BPP (1 << 2) -#define VIDC_CTRL_4BPP (2 << 2) -#define VIDC_CTRL_8BPP (3 << 2) -#define VIDC_CTRL_DIV3 (0 << 0) -#define VIDC_CTRL_DIV2 (1 << 0) -#define VIDC_CTRL_DIV1_5 (2 << 0) -#define VIDC_CTRL_DIV1 (3 << 0) - -#endif - #ifdef HAS_VIDC20 /* * VIDC20 registers diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c index 47e12cfc2a57..15c7251b0556 100644 --- a/drivers/w1/masters/mxc_w1.c +++ b/drivers/w1/masters/mxc_w1.c @@ -152,8 +152,6 @@ static int mxc_w1_remove(struct platform_device *pdev) clk_disable_unprepare(mdev->clk); - platform_set_drvdata(pdev, NULL); - return 0; } diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index 22013ca2119c..c7c64f18773d 100644 --- 
a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -234,9 +234,11 @@ static ssize_t w1_master_attribute_store_search(struct device * dev, { long tmp; struct w1_master *md = dev_to_w1_master(dev); + int ret; - if (strict_strtol(buf, 0, &tmp) == -EINVAL) - return -EINVAL; + ret = kstrtol(buf, 0, &tmp); + if (ret) + return ret; mutex_lock(&md->mutex); md->search_count = tmp; @@ -266,9 +268,11 @@ static ssize_t w1_master_attribute_store_pullup(struct device *dev, { long tmp; struct w1_master *md = dev_to_w1_master(dev); + int ret; - if (strict_strtol(buf, 0, &tmp) == -EINVAL) - return -EINVAL; + ret = kstrtol(buf, 0, &tmp); + if (ret) + return ret; mutex_lock(&md->mutex); md->enable_pullup = tmp; diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index de7e4f497222..5be5e3d14f79 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -162,7 +162,8 @@ extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs, #define HPWDT_ARCH 32 asm(".text \n\t" - ".align 4 \n" + ".align 4 \n\t" + ".globl asminline_call \n" "asminline_call: \n\t" "pushl %ebp \n\t" "movl %esp, %ebp \n\t" @@ -352,7 +353,8 @@ static int detect_cru_service(void) #define HPWDT_ARCH 64 asm(".text \n\t" - ".align 4 \n" + ".align 4 \n\t" + ".globl asminline_call \n" "asminline_call: \n\t" "pushq %rbp \n\t" "movq %rsp, %rbp \n\t" diff --git a/fs/affs/file.c b/fs/affs/file.c index af3261b78102..776e3935a758 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -836,7 +836,7 @@ affs_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; int res; res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8fb42916d8a2..60250847929f 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -716,13 +716,14 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) return 0; 
bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_integrity_pool) + if (!bs->bio_integrity_pool) return -1; - if (!bs->bio_integrity_pool) + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) { + mempool_destroy(bs->bio_integrity_pool); return -1; + } return 0; } diff --git a/fs/coredump.c b/fs/coredump.c index 72f816d6cad9..9bdeca12ae0e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) err = cn_printf(cn, "%d", task_tgid_vnr(current)); break; + /* global pid */ + case 'P': + err = cn_printf(cn, "%d", + task_tgid_nr(current)); + break; /* uid */ case 'u': err = cn_printf(cn, "%d", cred->uid); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 293f86741ddb..473e09da7d02 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep) epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); + cond_resched(); } /* @@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep) while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); + cond_resched(); } mutex_unlock(&ep->mtx); diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..8875dd10ae7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; write_lock(&binfmt_lock); insert ? 
list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -266,7 +268,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); @@ -1365,18 +1367,18 @@ out: } EXPORT_SYMBOL(remove_arg_zero); +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ int search_binary_handler(struct linux_binprm *bprm) { - unsigned int depth = bprm->recursion_depth; - int try,retval; + bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; - pid_t old_pid, old_vpid; + int retval; /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ - if (depth > 5) + if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); @@ -1387,71 +1389,67 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; + retval = -ENOENT; + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + if (retval >= 0 || retval != -ENOEXEC || + bprm->mm == NULL || bprm->file == NULL) { + put_binfmt(fmt); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + } + read_unlock(&binfmt_lock); + + if (need_retry && retval == -ENOEXEC) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; + need_retry = false; + goto retry; + } + + return retval; +} +EXPORT_SYMBOL(search_binary_handler); + +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); - retval = -ENOENT; - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *) = fmt->load_binary; - if (!fn) - continue; - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - bprm->recursion_depth = depth + 1; - retval = fn(bprm); - bprm->recursion_depth = depth; - if (retval >= 0) { - if (depth == 0) { - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - } - put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; - current->did_exec = 1; - 
proc_exec_connector(current); - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } + ret = search_binary_handler(bprm); + if (ret >= 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + current->did_exec = 1; + proc_exec_connector(current); + + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; /* to catch use-after-free */ } - read_unlock(&binfmt_lock); -#ifdef CONFIG_MODULES - if (retval != -ENOEXEC || bprm->mm == NULL) { - break; - } else { -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - if (try) - break; /* -ENOEXEC */ - request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); - } -#else - break; -#endif } - return retval; -} -EXPORT_SYMBOL(search_binary_handler); + return ret; +} /* * sys_execve() executes a new program. @@ -1541,7 +1539,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/file_table.c b/fs/file_table.c index 322cd37626cb..abdd15ad13c9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -311,8 +311,7 @@ void fput(struct file *file) return; /* * After this task has run exit_task_work(), - * task_work_add() will fail. free_ipc_ns()-> - * shm_destroy() can do this. Fall through to delayed + * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. 
*/ } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68851ff2fd41..30f6f27d5a59 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -723,7 +723,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, +static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { @@ -1049,10 +1049,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; - if (!nr_pages) { - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - } + if (!nr_pages) + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { @@ -1173,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) bool wakeup_bdi = false; bdi = inode_to_bdi(inode); + spin_unlock(&inode->i_lock); + spin_lock(&bdi->wb.list_lock); if (bdi_cap_writeback_dirty(bdi)) { WARN(!test_bit(BDI_registered, &bdi->state), "bdi-%s not registered\n", bdi->name); @@ -1187,8 +1187,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 8702b732109a..73899c1c3449 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); - ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) goto nomem_free; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e0fe703ee3d6..84434594e80e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct 
super_block *sb) fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; err = bdi_init(&fc->bdi); if (err) diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index a63371815aab..24bc20fd42f7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -11,3 +11,21 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. + +config HFSPLUS_FS_POSIX_ACL + bool "HFS+ POSIX Access Control Lists" + depends on HFSPLUS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + It needs to understand that POSIX ACLs are treated only under + Linux. POSIX ACLs doesn't mean something under Mac OS X. + Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, + which are part of the NFSv4 standard. 
+ + If you don't know what Access Control Lists are, say N diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 09d278bb7b91..683fca2e5e65 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o + +hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h new file mode 100644 index 000000000000..07c0d4947527 --- /dev/null +++ b/fs/hfsplus/acl.h @@ -0,0 +1,30 @@ +/* + * linux/fs/hfsplus/acl.h + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for Posix Access Control Lists (ACLs) support. + */ + +#include <linux/posix_acl_xattr.h> + +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + +/* posix_acl.c */ +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); +extern int hfsplus_posix_acl_chmod(struct inode *); +extern int hfsplus_init_posix_acl(struct inode *, struct inode *); + +#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ +#define hfsplus_get_posix_acl NULL + +static inline int hfsplus_posix_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d8ce4bd17fc5..4a4fea002673 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -16,6 +16,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -529,6 +530,9 @@ const struct inode_operations hfsplus_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + 
.get_acl = hfsplus_get_posix_acl, +#endif }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index ede79317cfb8..2b9cd01696e2 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -30,6 +30,7 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 #define DBG_ATTR_MOD 0x00000080 +#define DBG_ACL_MOD 0x00000100 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f833d35630ab..4d2edaea891c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -19,6 +19,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -316,6 +317,13 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) { + error = hfsplus_posix_acl_chmod(inode); + if (unlikely(error)) + return error; + } + return 0; } @@ -383,6 +391,9 @@ static const struct inode_operations hfsplus_file_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, +#endif }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c new file mode 100644 index 000000000000..b609cc14c72e --- /dev/null +++ b/fs/hfsplus/posix_acl.c @@ -0,0 +1,274 @@ +/* + * linux/fs/hfsplus/posix_acl.c + * + * Vyacheslav Dubeyko <slava@dubeyko.com> + * + * Handler for Posix Access Control Lists (ACLs) support. 
+ */ + +#include "hfsplus_fs.h" +#include "xattr.h" +#include "acl.h" + +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *xattr_name; + char *value = NULL; + ssize_t size; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); + + if (size > 0) { + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return ERR_PTR(-ENOMEM); + size = __hfsplus_getxattr(inode, xattr_name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +static int hfsplus_set_posix_acl(struct inode *inode, + int type, + struct posix_acl *acl) +{ + int err; + char *xattr_name; + size_t size = 0; + char *value = NULL; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + if (err < 0) + return err; + } + err = 0; + break; + + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) + return -ENOMEM; + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (unlikely(err < 0)) + goto end_set_acl; + } + + err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); + +end_set_acl: + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!err) + set_cached_acl(inode, type, acl); + + return err; +} + +int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + int err = 0; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, dir->ino %lu\n", + __func__, inode->i_ino, dir->i_ino); + + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + if (S_ISDIR(inode->i_mode)) { + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_DEFAULT, + acl); + if (unlikely(err)) + goto init_acl_cleanup; + } + + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (unlikely(err < 0)) + return err; + + if (err > 0) + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_ACCESS, + acl); + } else + inode->i_mode &= ~current_umask(); + +init_acl_cleanup: + posix_acl_release(acl); + return err; +} + +int hfsplus_posix_acl_chmod(struct inode *inode) +{ + int err; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (unlikely(err)) + return err; + + err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + return err; +} + +static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, + const char 
*name, + void *buffer, + size_t size, + int type) +{ + int err = 0; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", + __func__, dentry->d_inode->i_ino, buffer, size, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = hfsplus_get_posix_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, + const char *name, + const void *value, + size_t size, + int flags, + int type) +{ + int err = 0; + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", + __func__, inode->i_ino, value, size, flags, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + err = posix_acl_valid(acl); + if (err) + goto end_xattr_set_acl; + } + } + + err = hfsplus_set_posix_acl(inode, type, acl); + +end_xattr_set_acl: + posix_acl_release(acl); + return err; +} + +static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, + char *list, + size_t list_size, + const char *name, + size_t name_len, + int type) +{ + /* + * This method is not used. + * It is used hfsplus_listxattr() instead of generic_listxattr(). 
+ */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; + +const struct xattr_handler hfsplus_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index f66346155df5..bd8471fb9a6a 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,11 +8,16 @@ #include "hfsplus_fs.h" #include "xattr.h" +#include "acl.h" const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + &hfsplus_xattr_acl_access_handler, + &hfsplus_xattr_acl_default_handler, +#endif &hfsplus_xattr_security_handler, NULL }; @@ -46,11 +51,58 @@ static inline int is_known_namespace(const char *name) return true; } +static int can_set_system_xattr(struct inode *inode, const char *name, + const void *value, size_t size) +{ +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + struct posix_acl *acl; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* + * POSIX_ACL_XATTR_ACCESS is tied to i_mode + */ + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + posix_acl_release(acl); + if (err < 0) + return err; + mark_inode_dirty(inode); + } + /* + * We're changing the ACL. 
Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + return 0; + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + posix_acl_release(acl); + + /* + * We're changing the default ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + return 0; + } +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ + return -EOPNOTSUPP; +} + static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return -EOPNOTSUPP; /* TODO: implement ACL support */ + return can_set_system_xattr(inode, name, value, value_len); if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { /* @@ -253,11 +305,10 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, +static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, void *value, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; u16 entry_type; u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); @@ -304,10 +355,9 @@ end_getxattr_finder_info: return res; } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; hfsplus_attr_entry *entry; __be32 xattr_record_type; @@ -333,7 +383,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, } if (!strcmp_xattr_finder_info(name)) - return hfsplus_getxattr_finder_info(dentry, value, size); + return hfsplus_getxattr_finder_info(inode, value, size); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 
847b695b984d..841b5698c0fc 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,8 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ -/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler hfsplus_xattr_acl_access_handler; +extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; @@ -29,9 +29,17 @@ static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size); +static inline ssize_t hfsplus_getxattr(struct dentry *dentry, + const char *name, + void *value, + size_t size) +{ + return __hfsplus_getxattr(dentry->d_inode, name, value, size); +} + ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_removexattr(struct dentry *dentry, const char *name); @@ -39,22 +47,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name); int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) -{ - /*TODO: implement*/ - return 0; -} - -static inline int hfsplus_init_inode_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = hfsplus_init_acl(inode, dir); - if (!err) - err = hfsplus_init_security(inode, dir, qstr); - return err; -} +int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, + const struct qstr 
*qstr); #endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 83b842f113c5..00722765ea79 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -9,6 +9,7 @@ #include <linux/security.h> #include "hfsplus_fs.h" #include "xattr.h" +#include "acl.h" static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) @@ -96,6 +97,18 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir, &hfsplus_initxattrs, NULL); } +int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_posix_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = hfsplus_security_listxattr, diff --git a/fs/namespace.c b/fs/namespace.c index 25845d1b300b..da5c49483430 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,7 +17,7 @@ #include <linux/security.h> #include <linux/idr.h> #include <linux/acct.h> /* acct_auto_close_mnt */ -#include <linux/ramfs.h> /* init_rootfs */ +#include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. 
*/ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/uaccess.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 8a404576fb26..b4f788e0ca31 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -51,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) return ERR_PTR(-EINVAL); count = size / sizeof(struct posix_acl_entry); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 94417a85ce6e..f37d3c0e2053 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2044,7 +2044,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, out_write_size: pos += copied; - if (pos > inode->i_size) { + if (pos > i_size_read(inode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5c1c864e81cc..363f0dcc924f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -628,11 +628,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, struct o2nm_node *node, int idx) { - struct list_head *iter; struct o2hb_callback_func *f; - list_for_each(iter, &hbcall->list) { - f = list_entry(iter, struct o2hb_callback_func, hc_item); + list_for_each_entry(f, &hbcall->list, hc_item) { mlog(ML_HEARTBEAT, "calling funcs %p\n", f); (f->hc_func)(node, idx, f->hc_data); } @@ -641,16 +639,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, /* Will run the list in order until we process the passed event */ static void o2hb_run_event_list(struct o2hb_node_event *queued_event) { - int empty; struct o2hb_callback *hbcall; struct o2hb_node_event *event; - spin_lock(&o2hb_live_lock); - empty = list_empty(&queued_event->hn_item); - spin_unlock(&o2hb_live_lock); - if (empty) - return; - /* Holding callback sem assures we don't alter the callback * lists when doing this, and serializes ourselves 
with other * processes wanting callbacks. */ @@ -709,6 +700,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) struct o2hb_node_event event = { .hn_item = LIST_HEAD_INIT(event.hn_item), }; struct o2nm_node *node; + int queued = 0; node = o2nm_get_node_by_num(slot->ds_node_num); if (!node) @@ -726,11 +718,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, slot->ds_node_num); + queued = 1; } } spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); o2nm_node_put(node); } @@ -790,6 +784,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; int tmp; + int queued = 0; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -883,6 +878,7 @@ fire_callbacks: slot->ds_node_num); changed = 1; + queued = 1; } list_add_tail(&slot->ds_live_item, @@ -934,6 +930,7 @@ fire_callbacks: node, slot->ds_node_num); changed = 1; + queued = 1; } /* We don't clear this because the node is still @@ -949,7 +946,8 @@ fire_callbacks: out: spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); if (node) o2nm_node_put(node); @@ -2516,8 +2514,7 @@ unlock: int o2hb_register_callback(const char *region_uuid, struct o2hb_callback_func *hc) { - struct o2hb_callback_func *tmp; - struct list_head *iter; + struct o2hb_callback_func *f; struct o2hb_callback *hbcall; int ret; @@ -2540,10 +2537,9 @@ int o2hb_register_callback(const char *region_uuid, down_write(&o2hb_callback_sem); - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct o2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); break; } } diff --git 
a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d644dc611425..2cd2406b4140 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -543,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: No longer connected to " - SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -765,32 +766,32 @@ static struct o2net_msg_handler * o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, struct rb_node **ret_parent) { - struct rb_node **p = &o2net_handler_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; struct o2net_msg_handler *nmh, *ret = NULL; int cmp; - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); cmp = o2net_handler_cmp(nmh, msg_type, key); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { ret = nmh; - break; + break; } - } + } - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; - return ret; + return ret; } static void o2net_handler_kref_release(struct kref *kref) @@ -1695,13 +1696,12 @@ static void o2net_start_connect(struct work_struct *work) ret = 0; out: - if (ret) { + if (ret && sc) { printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ - if (sc) - o2net_ensure_shutdown(nn, sc, 0); + o2net_ensure_shutdown(nn, sc, 0); } if (sc) 
sc_put(sc); @@ -1873,12 +1873,16 @@ static int o2net_accept_one(struct socket *sock) if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fbec0be62326..b46278f9ae44 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -292,7 +292,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; - struct list_head *iter, *head=NULL; + struct list_head *head = NULL; __be64 cookie; u32 flags; u8 node; @@ -373,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } @@ -385,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, else head = &res->granted; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } diff --git a/fs/ocfs2/dlm/dlmcommon.h 
b/fs/ocfs2/dlm/dlmcommon.h index de854cca12a2..e0517762fcc0 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1079,11 +1079,9 @@ static inline int dlm_lock_compatible(int existing, int request) static inline int dlm_lock_on_list(struct list_head *head, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, head) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, head, list) { if (tmplock == lock) return 1; } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 29a886d1e82c..e36d63ff1783 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -123,7 +123,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, int *kick_thread) { enum dlm_status status = DLM_NORMAL; - struct list_head *iter; struct dlm_lock *tmplock=NULL; assert_spin_locked(&res->spinlock); @@ -185,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, /* upconvert from here on */ status = DLM_NORMAL; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->granted, list) { if (tmplock == lock) continue; if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; /* existing conversion requests take precedence */ @@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; + struct dlm_lock *tmp_lock; struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; @@ -471,14 +468,13 @@ int 
dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_error(status); goto leave; } - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.cookie == cnv->cookie && - lock->ml.node == cnv->node_idx) { + list_for_each_entry(tmp_lock, &res->granted, list) { + if (tmp_lock->ml.cookie == cnv->cookie && + tmp_lock->ml.node == cnv->node_idx) { + lock = tmp_lock; dlm_lock_get(lock); break; } - lock = NULL; } spin_unlock(&res->spinlock); if (!lock) { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 0e28e242226d..e33cd7a3c582 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -96,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock) void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { - struct list_head *iter2; struct dlm_lock *lock; char buf[DLM_LOCKID_NAME_MAX]; @@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); printk(" granted queue:\n"); - list_for_each(iter2, &res->granted) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { __dlm_print_lock(lock); } printk(" converting queue:\n"); - list_for_each(iter2, &res->converting) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { __dlm_print_lock(lock); } printk(" blocked queue:\n"); - list_for_each(iter2, &res->blocked) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->blocked, list) { __dlm_print_lock(lock); } } @@ -446,7 +442,6 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; - struct hlist_node *list; int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; @@ -456,9 +451,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char 
*buf, int len) spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(mle, bucket, master_hash_node) { ++total; ++bucket_count; if (len - out < 200) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index dbb17c07656a..8b3382abf840 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -193,7 +193,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, unsigned int hash) { struct hlist_head *bucket; - struct hlist_node *list; + struct dlm_lock_resource *res; mlog(0, "%.*s\n", len, name); @@ -201,9 +201,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, bucket = dlm_lockres_hash(dlm, hash); - hlist_for_each(list, bucket) { - struct dlm_lock_resource *res = hlist_entry(list, - struct dlm_lock_resource, hash_node); + hlist_for_each_entry(res, bucket, hash_node) { if (res->lockname.name[0] != name[0]) continue; if (unlikely(res->lockname.len != len)) @@ -262,22 +260,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) { - struct dlm_ctxt *tmp = NULL; - struct list_head *iter; + struct dlm_ctxt *tmp; assert_spin_locked(&dlm_domain_lock); /* tmp->name here is always NULL terminated, * but domain may not be! */ - list_for_each(iter, &dlm_domains) { - tmp = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(tmp, &dlm_domains, list) { if (strlen(tmp->name) == len && memcmp(tmp->name, domain, len)==0) - break; - tmp = NULL; + return tmp; } - return tmp; + return NULL; } /* For null terminated domain strings ONLY */ @@ -366,25 +361,22 @@ static void __dlm_get(struct dlm_ctxt *dlm) * you shouldn't trust your pointer. 
*/ struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) { - struct list_head *iter; - struct dlm_ctxt *target = NULL; + struct dlm_ctxt *target; + struct dlm_ctxt *ret = NULL; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - target = list_entry (iter, struct dlm_ctxt, list); - + list_for_each_entry(target, &dlm_domains, list) { if (target == dlm) { __dlm_get(target); + ret = target; break; } - - target = NULL; } spin_unlock(&dlm_domain_lock); - return target; + return ret; } int dlm_domain_fully_joined(struct dlm_ctxt *dlm) @@ -2296,13 +2288,10 @@ static DECLARE_RWSEM(dlm_callback_sem); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num) { - struct list_head *iter; struct dlm_eviction_cb *cb; down_read(&dlm_callback_sem); - list_for_each(iter, &dlm->dlm_eviction_callbacks) { - cb = list_entry(iter, struct dlm_eviction_cb, ec_item); - + list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) { cb->ec_func(node_num, cb->ec_data); } up_read(&dlm_callback_sem); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 47e67c2d228f..5d32f7511f74 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -91,19 +91,14 @@ void dlm_destroy_lock_cache(void) static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->granted, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; if (!dlm_lock_compatible(tmplock->ml.convert_type, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 33ecbe0e6734..cf0f103963b1 100644 --- 
a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, { struct dlm_master_list_entry *tmpmle; struct hlist_head *bucket; - struct hlist_node *list; unsigned int hash; assert_spin_locked(&dlm->master_lock); hash = dlm_lockid_hash(name, namelen); bucket = dlm_master_hash(dlm, hash); - hlist_for_each(list, bucket) { - tmpmle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(tmpmle, bucket, master_hash_node) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -3183,7 +3180,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; struct hlist_head *bucket; - struct hlist_node *list; + struct hlist_node *tmp; unsigned int i; mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); @@ -3194,10 +3191,7 @@ top: spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); - + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); @@ -3378,7 +3372,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) int i; struct hlist_head *bucket; struct dlm_master_list_entry *mle; - struct hlist_node *tmp, *list; + struct hlist_node *tmp; /* * We notified all other nodes that we are exiting the domain and @@ -3394,9 +3388,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each_safe(list, tmp, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { if (mle->type != DLM_MLE_BLOCK) { mlog(ML_ERROR, "bad mle: %p\n", mle); 
dlm_print_one_mle(mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 773bd32bfd8c..0b5adca1b178 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -787,6 +787,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, { struct dlm_lock_request lr; int ret; + int status; mlog(0, "\n"); @@ -800,13 +801,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, // send message ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, - &lr, sizeof(lr), request_from, NULL); + &lr, sizeof(lr), request_from, &status); /* negative status is handled by caller */ if (ret < 0) mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " "to recover dead node %u\n", dlm->name, ret, request_from, dead_node); + else + ret = status; // return from here, then // sleep until all received or error return ret; @@ -2328,6 +2331,14 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } } spin_unlock(&res->spinlock); } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index e73c833fc2a1..9db869de829d 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -286,8 +286,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct dlm_lock *lock, *target; - struct list_head *iter; - struct list_head *head; int can_grant = 1; /* @@ -314,9 +312,7 @@ converting: dlm->name, res->lockname.len, res->lockname.name); BUG(); } - head = &res->granted; - list_for_each(iter, head) { - 
lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -333,9 +329,8 @@ converting: target->ml.convert_type; } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -384,9 +379,7 @@ blocked: goto leave; target = list_entry(res->blocked.next, struct dlm_lock, list); - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { @@ -400,9 +393,7 @@ blocked: } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 850aa7e87537..5698b52cf5c9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -388,7 +388,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; enum dlm_status status = DLM_NORMAL; int found = 0, i; @@ -458,8 +457,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { if (lock->ml.cookie == unlock->cookie && lock->ml.node == unlock->node_idx) { dlm_lock_get(lock); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 
12bafb7265ce..efa2b3d339e3 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -401,11 +401,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); umode_t mode = S_IFDIR | 0755; - struct dlmfs_inode_private *ip; if (inode) { - ip = DLMFS_I(inode); - inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2487116d0d33..767370b656ca 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, cpos = map_start >> osb->s_clustersize_bits; mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, map_start + map_len); - mapping_end -= cpos; is_last = 0; while (cpos < mapping_end && !is_last) { u32 fe_flags; @@ -852,20 +851,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) down_read(&OCFS2_I(inode)->ip_alloc_sem); - if (*offset >= inode->i_size) { + if (*offset >= i_size_read(inode)) { ret = -ENXIO; goto out_unlock; } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (whence == SEEK_HOLE) - *offset = inode->i_size; + *offset = i_size_read(inode); goto out_unlock; } clen = 0; cpos = *offset >> cs_bits; - cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); while (cpos < cend && !is_last) { ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, @@ -904,8 +903,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) extlen = clen; extlen <<= cs_bits; - if ((extoff + extlen) > inode->i_size) - extlen = inode->i_size - extoff; + if ((extoff + extlen) > i_size_read(inode)) + extlen = i_size_read(inode) - extoff; extoff += extlen; if (extoff > *offset) *offset = extoff; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 
3261d71319ee..4f8197caa487 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -671,11 +671,7 @@ restarted_transaction: } else { BUG_ON(why != RESTART_TRANS); - /* TODO: This can be more intelligent. */ - credits = ocfs2_calc_extend_credits(osb->sb, - &fe->id2.i_list, - clusters_to_add); - status = ocfs2_extend_trans(handle, credits); + status = ocfs2_allocate_extend_trans(handle, 1); if (status < 0) { /* handle still has to be committed at * this point. */ @@ -1800,6 +1796,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); out: + ocfs2_free_path(path); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 0c60ef2d8056..fa32ce9b455d 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -303,7 +303,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, if (o2info_from_user(oij, req)) goto bail; - oij.ij_journal_size = osb->journal->j_inode->i_size; + oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 242170d83971..44fc3e530c3d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -455,6 +455,41 @@ bail: return status; } +/* + * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for metadata modifications. 
+ * Taken from Ext4: extend_or_restart_transaction() + */ +int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) +{ + int status, old_nblks; + + BUG_ON(!handle); + + old_nblks = handle->h_buffer_credits; + trace_ocfs2_allocate_extend_trans(old_nblks, thresh); + + if (old_nblks < thresh) + return 0; + + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) + mlog_errno(status); + } + +bail: + return status; +} + + struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; @@ -801,14 +836,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) inode_lock = 1; di = (struct ocfs2_dinode *)bh->b_data; - if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) { mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", - inode->i_size); + i_size_read(inode)); status = -EINVAL; goto done; } - trace_ocfs2_journal_init(inode->i_size, + trace_ocfs2_journal_init(i_size_read(inode), (unsigned long long)inode->i_blocks, OCFS2_I(inode)->ip_clusters); @@ -1096,7 +1131,7 @@ static int ocfs2_force_read_journal(struct inode *inode) memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); - num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0a992737dcaf..0b479bab3671 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_allocate_extend_trans(handle_t *handle, + int 
thresh); + +/* + * Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * fallocate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. + */ +#define OCFS2_MAX_TRANS_DATA 64U /* * Create access is for when we get a newly created buffer and we're diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index aebeacd807c3..cd5496b7a0a3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1082,7 +1082,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, } retry_enospc: - (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); if (status == -ENOSPC) { if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == @@ -1154,7 +1154,7 @@ retry_enospc: OCFS2_LA_DISABLED) goto bail; - ac->ac_bits_wanted = osb->local_alloc_default_bits; + ac->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 452068b45749..3d3f3c83065c 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -152,6 +152,7 @@ static int __ocfs2_move_extent(handle_t *handle, } out: + ocfs2_free_path(path); return ret; } @@ -845,7 +846,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, struct ocfs2_move_extents *range = context->range; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if ((inode->i_size == 0) || (range->me_len == 0)) + if ((i_size_read(inode) == 0) || (range->me_len == 0)) return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 3b481f490633..1b60c62aa9d6 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2579,6 
+2579,8 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); + DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 332a281f217e..aaa50611ec66 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; } - if (gqinode->i_size < off + len) { + if (i_size_read(gqinode) < off + len) { loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); @@ -778,8 +778,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) */ WARN_ON(journal_current_handle()); status = ocfs2_extend_no_holes(gqinode, NULL, - gqinode->i_size + (need_alloc << sb->s_blocksize_bits), - gqinode->i_size); + i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits), + i_size_read(gqinode)); if (status < 0) goto out_dq; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 27fe7ee4874c..2e4344be3b96 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -982,14 +982,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + 2 * sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + 2 * sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + 2 * sb->s_blocksize); + i_size_read(lqinode) + 2 * sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; @@ -1125,14 +1125,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - 
lqinode->i_size + sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + sb->s_blocksize); + i_size_read(lqinode) + sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a70d604593b6..bf4dfc14bb2c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3854,7 +3854,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, while (cpos < clusters) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto unlock; + } if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_tree->rf_ci, @@ -4025,7 +4028,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode, while (cpos < clusters) { ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto out; + } if (p_cluster) { ret = ocfs2_add_refcounted_extent(t_inode, &et, ref_ci, ref_root_bh, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 317ef0abccbb..6ce0686eab72 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3505,7 +3505,7 @@ int ocfs2_xattr_set(struct inode *inode, int ret, credits, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; - struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; struct ocfs2_xattr_info xi = { @@ -3609,13 +3609,14 @@ int ocfs2_xattr_set(struct inode *inode, if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); - goto cleanup; + goto out_free_ac; } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 
ocfs2_commit_trans(osb, ctxt.handle); +out_free_ac: if (ctxt.data_ac) ocfs2_free_alloc_context(ctxt.data_ac); if (ctxt.meta_ac) @@ -5881,6 +5882,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode, while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, el, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } cpos += num_clusters; if ((ext_flags & OCFS2_EXT_REFCOUNTED)) @@ -6797,7 +6802,7 @@ out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); - meta_ac = NULL; + *meta_ac = NULL; } } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 0ff80f9b930f..985ea881b5bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -286,7 +286,7 @@ int proc_fd_permission(struct inode *inode, int mask) int rv = generic_permission(inode, mask); if (rv == 0) return 0; - if (task_pid(current) == proc_pid(inode)) + if (task_tgid(current) == proc_pid(inode)) rv = 0; return rv; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 107d026f5d6e..7366e9d63cee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 |= __PM_SOFT_DIRTY; + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if (pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); @@ -974,7 +979,7 @@ static void 
thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, @@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { int pmd_flags2; - pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) + pmd_flags2 = __PM_SOFT_DIRTY; + else + pmd_flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; @@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; for (; addr != end; addr += PAGE_SIZE) { + int flags2; /* check to see if we've left 'vma' behind * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); } /* check that 'vma' actually covers this address, @@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #ifdef CONFIG_HUGETLB_PAGE static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset) + pte_t pte, int offset, int flags2) { if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | + PM_STATUS2(pm->v2, flags2) | + PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | + 
PM_STATUS2(pm->v2, flags2)); } /* This function walks within one hugetlb entry in the single call */ @@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { struct pagemapread *pm = walk->private; + struct vm_area_struct *vma; int err = 0; + int flags2; pagemap_entry_t pme; + vma = find_vma(walk->mm, addr); + WARN_ON_ONCE(!vma); + + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1376,8 +1402,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); + n = mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); + if (n < 0) + return n; seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a1a16eb97c7b..9100d6959886 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -21,6 +21,7 @@ #include <linux/crash_dump.h> #include <linux/list.h> #include <linux/vmalloc.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/io.h> #include "internal.h" @@ -123,11 +124,65 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return 
read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Copy to either kernel or user space + */ +static int copy_to(void *target, void *src, size_t size, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *) target, src, size)) + return -EFAULT; + } else { + memcpy(target, src, size); + } + return 0; +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, + int userbuf) { ssize_t acc = 0, tmp; size_t tsz; @@ -144,7 +199,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -162,7 +217,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; - if (copy_to_user(buffer, kaddr, tsz)) + if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -178,7 +233,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, if (*fpos < m->offset + m->size) { tsz = min_t(size_t, m->offset + m->size 
- *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, 1); + tmp = read_from_oldmem(buffer, tsz, &start, userbuf); if (tmp < 0) return tmp; buflen -= tsz; @@ -195,6 +250,55 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + return __read_vmcore((__force char *) buffer, buflen, fpos, 1); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __vmcore_read() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct page *page; + loff_t offset; + char *buf; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_CACHE_SHIFT; + buf = __va((page_to_pfn(page) << PAGE_SHIFT)); + rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + if (rc < 0) { + unlock_page(page); + page_cache_release(page); + return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /** * alloc_elfnotes_buf - allocate buffer for ELF note segment in * vmalloc memory @@ -223,7 +327,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * regions in the 1st kernel pointed to by PT_LOAD entries) into * virtually contiguous user-space in ELF layout. 
*/ -#if defined(CONFIG_MMU) && !defined(CONFIG_S390) +#ifdef CONFIG_MMU static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; @@ -241,6 +345,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; len = 0; @@ -282,9 +387,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (remap_oldmem_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; @@ -357,7 +462,7 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; @@ -444,7 +549,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type != PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -536,7 +642,7 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; @@ -623,7 +729,8 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type 
!= PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -810,7 +917,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -837,7 +944,7 @@ static int __init parse_crash_elf64_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -866,7 +973,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -892,7 +999,7 @@ static int __init parse_crash_elf32_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -919,7 +1026,7 @@ static int __init parse_crash_elf_headers(void) int rc=0; addr = elfcorehdr_addr; - rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { @@ -952,7 +1059,14 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, 
+ * then capture the dump. + */ if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); @@ -960,6 +1074,8 @@ static int __init vmcore_init(void) pr_warn("Kdump: vmcore not initialized\n"); return rc; } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); if (proc_vmcore) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c24f1e10b946..39d14659a8d3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -244,12 +244,6 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type, return mount_nodev(fs_type, flags, data, ramfs_fill_super); } -static struct dentry *rootfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); -} - static void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); @@ -262,29 +256,23 @@ static struct file_system_type ramfs_fs_type = { .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -static struct file_system_type rootfs_fs_type = { - .name = "rootfs", - .mount = rootfs_mount, - .kill_sb = kill_litter_super, -}; -static int __init init_ramfs_fs(void) -{ - return register_filesystem(&ramfs_fs_type); -} -module_init(init_ramfs_fs) - -int __init init_rootfs(void) +int __init init_ramfs_fs(void) { + static unsigned long once; int err; + if (test_and_set_bit(0, &once)) + return 0; + err = bdi_init(&ramfs_backing_dev_info); if (err) return err; - err = register_filesystem(&rootfs_fs_type); + err = register_filesystem(&ramfs_fs_type); if (err) bdi_destroy(&ramfs_backing_dev_info); return err; } +module_init(init_ramfs_fs) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3881553f7d1..5f66d519a726 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * 
BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. + * + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 +#define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 70cf138690e9..e8112ae50531 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,7 +31,7 @@ struct linux_binprm { #ifdef __alpha__ unsigned int taso:1; #endif - unsigned int recursion_depth; + unsigned int recursion_depth; /* only for search_binary_handler() */ struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h new file mode 100644 index 000000000000..98e892ef6d5a --- /dev/null +++ b/include/linux/cmdline-parser.h @@ -0,0 +1,43 @@ +/* + * Parsing command line, get the partitions information. 
+ * + * Written by Cai Zhiyong <caizhiyong@huawei.com> + * + */ +#ifndef CMDLINEPARSEH +#define CMDLINEPARSEH + +#include <linux/blkdev.h> + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +void cmdline_parts_free(struct cmdline_parts **parts); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev); + +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param); + +#endif /* CMDLINEPARSEH */ diff --git a/include/linux/compat.h b/include/linux/compat.h index ec1aee4aec9c..345da00a86e0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -43,6 +43,7 @@ #define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8da7cdf..fe68a5a98583 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -12,6 +12,15 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern int __weak elfcorehdr_alloc(unsigned long long *addr, + unsigned long long *size); +extern void __weak elfcorehdr_free(unsigned long long addr); +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); + extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 661d374aeb2d..f8d41cb1cbe0 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -66,8 +66,8 @@ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ atomic_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ - unsigned long start_addr; /* starting address of memory chunk */ - unsigned long end_addr; /* ending address of memory chunk */ + unsigned long start_addr; /* start address of memory chunk */ + unsigned long end_addr; /* end address of memory chunk (inclusive) */ unsigned long bits[0]; /* bitmap for allocating memory chunk */ }; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c2b1801a160b..0393270466c3 100644 --- a/include/linux/hugetlb.h +++ 
b/include/linux/hugetlb.h @@ -66,6 +66,9 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); +bool isolate_huge_page(struct page *page, struct list_head *list); +void putback_active_hugepage(struct page *page); +bool is_hugepage_active(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -134,6 +137,9 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) return 0; } +#define isolate_huge_page(p, l) false +#define putback_active_hugepage(p) do {} while (0) +#define is_hugepage_active(x) false static inline void copy_huge_page(struct page *dst, struct page *src) { } @@ -261,6 +267,8 @@ struct huge_bootmem_page { }; struct page *alloc_huge_page_node(struct hstate *h, int nid); +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -371,9 +379,23 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern void dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); +int pmd_huge_support(void); +/* + * Currently hugepage migration is enabled only for pmd-based hugepage. + * This function will be updated when hugepage migration is more widely + * supported. 
+ */ +static inline int hugepage_migration_support(struct hstate *h) +{ + return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL +#define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL @@ -396,6 +418,9 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } +#define dissolve_free_huge_pages(s, e) do {} while (0) +#define pmd_huge_support() 0 +#define hugepage_migration_support(h) 0 #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/init.h b/include/linux/init.h index e73f2b708525..f1c27a71d03c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -153,6 +153,7 @@ extern unsigned int reset_devices; void setup_arch(char **); void prepare_namespace(void); void __init load_default_modules(void); +int __init init_rootfs(void); extern void (*late_time_init)(void); diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c4d870b0d5e6..19c19a5eee29 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -22,7 +22,7 @@ struct ipc_ids { int in_use; unsigned short seq; unsigned short seq_max; - struct rw_semaphore rw_mutex; + struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; }; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ca1d27a0d6a6..925eaf28fca9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -264,10 +264,36 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); -extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +struct kprobe_insn_cache { + struct mutex mutex; 
+ void *(*alloc)(void); /* allocate insn page */ + void (*free)(void *); /* free insn page */ + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); +extern void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty); + +#define DEFINE_INSN_CACHE_OPS(__name) \ +extern struct kprobe_insn_cache kprobe_##__name##_slots; \ + \ +static inline kprobe_opcode_t *get_##__name##_slot(void) \ +{ \ + return __get_insn_slot(&kprobe_##__name##_slots); \ +} \ + \ +static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ +{ \ + __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ +} \ + +DEFINE_INSN_CACHE_OPS(insn); + #ifdef CONFIG_OPTPROBES /* * Internal structure for direct jump optimized probe @@ -287,13 +313,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); -extern kprobe_opcode_t *get_optinsn_slot(void); -extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); +DEFINE_INSN_CACHE_OPS(optinsn); + #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, diff --git a/include/linux/lz4.h b/include/linux/lz4.h index d21c13f10a64..4356686b0a39 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, * note : Destination buffer must be already allocated. 
* slightly faster than lz4_decompress_unknownoutputsize() */ -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() @@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, * Error if return (< 0) * note : Destination buffer must be already allocated. */ -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len); +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); #endif diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..31e95acddb4d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -60,6 +60,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, + unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0d7df39a5885..da6716b9e3fe 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) } #define vma_policy(vma) ((vma)->vm_policy) -#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) static inline void mpol_get(struct mempolicy *pol) { @@ -126,6 +125,7 @@ struct shared_policy { spinlock_t lock; }; +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, @@ -173,7 +173,7 @@ extern int 
mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot @@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) } #define vma_policy(vma) NULL -#define vma_set_policy(vma, pol) do {} while(0) + +static inline int +vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} static inline void numa_policy_init(void) { diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a405d3dc0f61..6fe521420631 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,8 +41,6 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int migrate_huge_page(struct page *, new_page_t x, - unsigned long private, enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -62,9 +60,6 @@ static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } -static inline int migrate_huge_page(struct page *page, new_page_t x, - unsigned long private, enum migrate_mode mode) - { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/include/linux/mm.h b/include/linux/mm.h index d2d59b4149d0..caf543c7eaa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* 
Architecture-specific flag */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ +#ifdef CONFIG_MEM_SOFT_DIRTY +# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#else +# define VM_SOFTDIRTY 0 +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ @@ -489,20 +495,6 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } -static inline int compound_trans_order(struct page *page) -{ - int order; - unsigned long flags; - - if (!PageHead(page)) - return 0; - - flags = compound_lock_irqsave(page); - order = compound_order(page); - compound_unlock_irqrestore(page, flags); - return order; -} - static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; @@ -637,12 +629,12 @@ static inline enum zone_type page_zonenum(const struct page *page) #endif /* - * The identification function is only used by the buddy allocator for - * determining if two pages could be buddies. We are not really - * identifying a zone since we could be using a the section number - * id if we have not node id available in page flags. - * We guarantee only that it will return the same value for two - * combinable pages in a zone. + * The identification function is mainly used by the buddy allocator for + * determining if two pages could be buddies. We are not really identifying + * the zone since we could be using the section number id if we do not have + * node id available in page flags. + * We only guarantee that it will return the same value for two combinable + * pages in a zone. 
*/ static inline int page_zone_id(struct page *page) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1397ccf81e91..cf55945c83fb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #define LINUX_MM_INLINE_H #include <linux/huge_mm.h> +#include <linux/swap.h> /** * page_is_file_cache - should the page be on a file LRU or anon LRU? diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index af4a3b77a8de..bd791e452ad7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -105,6 +105,7 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_ALLOC_BATCH, NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ @@ -352,7 +353,6 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38b0a..403940787be1 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 69e37c2d1ea5..753207c8ce20 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -25,7 +25,7 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations ramfs_file_operations; extern 
const struct vm_operations_struct generic_file_vm_ops; -extern int __init init_rootfs(void); +extern int __init init_ramfs_fs(void); int ramfs_fill_super(struct super_block *sb, void *data, int silent); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..aa870a4ddf54 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -68,6 +68,10 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); @@ -81,4 +85,22 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +/** + * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of + * given type safe against removal of rb_node entry + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. 
+ */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry(rb_first_postorder(root), typeof(*pos), field),\ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field); \ + &pos->field; \ + pos = n, \ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field)) + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ce1e1c0aaa33..45f254dddafc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p) * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline bool has_group_leader_pid(struct task_struct *p) { - return p->pid == p->tgid; + return task_pid(p) == p->signal->leader_pid; } static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { - return p1->tgid == p2->tgid; + return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) diff --git a/include/linux/smp.h b/include/linux/smp.h index c181399f2c20..cfb7ca094b38 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -28,6 +28,27 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on all processors + */ +int on_each_cpu(smp_call_func_t func, void *info, int wait); + +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. 
+ */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + #ifdef CONFIG_SMP #include <linux/preempt.h> @@ -95,27 +116,6 @@ static inline void call_function_init(void) { } #endif /* - * Call a function on all processors - */ -int on_each_cpu(smp_call_func_t func, void *info, int wait); - -/* - * Call a function on processors specified by mask, which might include - * the local one. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); - -/* - * Call a function on each processor for which the supplied function - * cond_func returns a positive value. This may include the local - * processor. - */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags); - -/* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ @@ -139,43 +139,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func, info, wait) \ - ({ \ - unsigned long __flags; \ - local_irq_save(__flags); \ - func(info); \ - local_irq_restore(__flags); \ - 0; \ - }) -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { \ - if (cpumask_test_cpu(0, (mask))) { \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - } \ - } while (0) -/* - * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. 
- */ -#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ - do { \ - void *__info = (info); \ - preempt_disable(); \ - if ((cond_func)(0, __info)) { \ - local_irq_disable(); \ - (func)(__info); \ - local_irq_enable(); \ - } \ - preempt_enable(); \ - } while (0) static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5e257d..c03c139219c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -182,6 +182,33 @@ enum { #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ /* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The data field stores next cluster if the cluster is free or cluster usage + * counter otherwise. The flags field determines if a cluster is free. This is + * protected by swap_info_struct.lock. + */ +struct swap_cluster_info { + unsigned int data:24; + unsigned int flags:8; +}; +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + +/* + * We assign a cluster to each CPU, so each CPU can allocate swap entry from + * its own cluster and swapout sequentially. The purpose is to optimize swapout + * throughput. + */ +struct percpu_cluster { + struct swap_cluster_info index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + +/* * The in-memory structure used to track swap areas. */ struct swap_info_struct { @@ -191,14 +218,16 @@ struct swap_info_struct { signed char next; /* next type on the swap list */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ + struct swap_cluster_info free_cluster_head; /* free cluster list head */ + struct swap_cluster_info free_cluster_tail; /* free cluster list tail */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int lowest_alloc; /* while preparing discard cluster */ - unsigned int highest_alloc; /* while preparing discard cluster */ + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ @@ -212,14 +241,18 @@ struct swap_info_struct { * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc and - * highest_alloc. other fields are only - * changed at swapon/swapoff, so are - * protected by swap_lock. changing - * flags need hold this lock and - * swap_lock. If both locks need hold, - * hold swap_lock first. + * cluster_nr, lowest_alloc, + * highest_alloc, free/discard cluster + * list. other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first. 
*/ + struct work_struct discard_work; /* discard worker */ + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; struct swap_list_t { @@ -414,6 +447,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #else /* CONFIG_SWAP */ +#define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84662ecc7b51..7fac04e7ff6e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -186,6 +186,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..1855f0a22add 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_SMP + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +#endif + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fef..e4b948080d20 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } extern unsigned long global_reclaimable_pages(void); -extern unsigned long 
zone_reclaimable_pages(struct zone *zone); #ifdef CONFIG_NUMA /* @@ -198,7 +197,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); @@ -255,6 +254,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } +static inline void cpu_vm_stats_fold(int cpu) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4e198ca1f685..021b8a319b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -98,8 +98,6 @@ int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 6bc943ecb841..d0c613476620 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -268,11 +268,13 @@ TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype), + int alloc_migratetype, int fallback_migratetype, + int change_ownership), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype), + alloc_migratetype, fallback_migratetype, + change_ownership), 
TP_STRUCT__entry( __field( struct page *, page ) @@ -280,6 +282,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) + __field( int, change_ownership ) ), TP_fast_assign( @@ -288,6 +291,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; + __entry->change_ownership = change_ownership; ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", @@ -299,7 +303,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, - __entry->alloc_migratetype == __entry->fallback_migratetype) + __entry->change_ownership) ); #endif /* _TRACE_KMEM_H */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 816014c4627e..a51cddc2ff8c 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -26,6 +26,8 @@ #include <linux/async.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/ramfs.h> +#include <linux/shmem_fs.h> #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> @@ -588,3 +590,46 @@ out: sys_mount(".", "/", NULL, MS_MOVE, NULL); sys_chroot("."); } + +static bool is_tmpfs; +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static unsigned long once; + void *fill = ramfs_fill_super; + + if (test_and_set_bit(0, &once)) + return ERR_PTR(-ENODEV); + + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + fill = shmem_fill_super; + + return mount_nodev(fs_type, flags, data, fill); +} + +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + 
if (err) + return err; + + if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] && + (!root_fs_names || strstr(root_fs_names, "tmpfs"))) { + err = shmem_init(); + is_tmpfs = true; + } else { + err = init_ramfs_fs(); + } + + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} diff --git a/ipc/msg.c b/ipc/msg.c index b65fdf1a09dd..b0d541d42677 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -70,8 +70,6 @@ struct msg_sender { #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) - static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -172,7 +170,7 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { @@ -259,8 +257,8 @@ static void expunge_all(struct msg_queue *msq, int res) * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit. 
*/ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -270,7 +268,8 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); @@ -282,7 +281,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked. */ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) { @@ -386,9 +385,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) } /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. 
*/ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct msqid_ds __user *buf, int version) @@ -403,7 +402,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - down_write(&msg_ids(ns).rw_mutex); + down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, @@ -459,7 +458,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&msg_ids(ns).rw_mutex); + up_write(&msg_ids(ns).rwsem); return err; } @@ -494,7 +493,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - down_read(&msg_ids(ns).rw_mutex); + down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -505,7 +504,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); - up_read(&msg_ids(ns).rw_mutex); + up_read(&msg_ids(ns).rwsem); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 
0 : max_id; diff --git a/ipc/namespace.c b/ipc/namespace.c index 4be6581d3b7f..59451c1e214d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, int next_id; int total, in_use; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); in_use = ids->in_use; @@ -89,11 +89,12 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; - ipc_lock_by_ptr(perm); + rcu_read_lock(); + ipc_lock_object(perm); free(ns, perm); total++; } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) diff --git a/ipc/sem.c b/ipc/sem.c index 41088899783d..69b6a21f3844 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -322,7 +322,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) } /* - * sem_lock_(check_) routines are called in the paths where the rw_mutex + * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. @@ -426,7 +426,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * - * Called with sem_ids.rw_mutex held (as a writer) + * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) @@ -492,7 +492,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) { @@ -503,7 +503,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) } /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. 
*/ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -994,8 +994,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked - * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex +/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) @@ -1116,7 +1116,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - down_read(&sem_ids(ns).rw_mutex); + down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -1125,7 +1125,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semaem = SEMAEM; } max_id = ipc_get_maxid(&sem_ids(ns)); - up_read(&sem_ids(ns).rw_mutex); + up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -1431,9 +1431,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) } /* - * This function handles some semctl commands which require the rw_mutex + * This function handles some semctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. 
*/ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) @@ -1448,7 +1448,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - down_write(&sem_ids(ns).rw_mutex); + down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, @@ -1487,7 +1487,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&sem_ids(ns).rw_mutex); + up_write(&sem_ids(ns).rwsem); return err; } diff --git a/ipc/shm.c b/ipc/shm.c index c6b4ad5ce3b7..2821cdf93adb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> @@ -80,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns) } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. 
*/ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -124,8 +127,28 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex + * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) @@ -144,17 +167,6 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { ipc_rmid(&shm_ids(ns), &s->shm_perm); @@ -182,7 +194,7 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. 
*/ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -230,7 +242,7 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); @@ -241,10 +253,10 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_current(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -275,7 +287,7 @@ static int shm_try_destroy_current(int id, void *p, void *data) return 0; } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -286,7 +298,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * We want to destroy segments without users and with already * exit'ed originating process. * - * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + * As shp->* are changed under rwsem, it's safe to skip shp locking. 
*/ if (shp->shm_creator != NULL) return 0; @@ -300,10 +312,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } @@ -315,10 +327,10 @@ void exit_shm(struct task_struct *task) return; /* Destroy all already created segments, but not mapped yet */ - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -452,7 +464,7 @@ static const struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) @@ -560,7 +572,7 @@ no_file: } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -571,7 +583,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -684,7 +696,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf /* * Calculate and add used RSS and swap pages of a shm. 
- * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) @@ -711,7 +723,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } /* - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -740,9 +752,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, } /* - * This function handles some shmctl commands which require the rw_mutex + * This function handles some shmctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid_ds __user *buf, int version) @@ -757,14 +769,13 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, - &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -772,14 +783,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) goto out_unlock0; @@ -787,6 +800,7 @@ static int shmctl_down(struct ipc_namespace *ns, 
int shmid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: @@ -794,33 +808,28 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; } -SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) { + int err; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { + err = security_shm_shmctl(NULL, cmd); + if (err) + return err; } - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; - - switch (cmd) { /* replace with proc interface ? */ + switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; shminfo.shmmax = ns->shm_ctlmax; @@ -830,9 +839,9 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if(err<0) err = 0; @@ -842,19 +851,15 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shm_info, 0, sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - 
up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -869,27 +874,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) struct shmid64_ds tbuf; int result; + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } + err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -899,43 +908,76 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; goto out; } + default: + return -EINVAL; + } + +out_unlock: + rcu_read_unlock(); +out: + return err; +} + +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; + + if (cmd < 0 || shmid < 0) + return -EINVAL; + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); + case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); case SHM_LOCK: case SHM_UNLOCK: { struct file *shm_file; - shp = shm_lock_check(ns, shmid); + 
rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) - goto out_unlock; + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); @@ -944,32 +986,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } - goto out_unlock; + goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) - goto out_unlock; + goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; get_file(shm_file); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); + fput(shm_file); - goto out; - } - case IPC_RMID: - case IPC_SET: - err = shmctl_down(ns, shmid, cmd, buf, version); return err; + } default: return -EINVAL; } -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } @@ -1037,10 +1078,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, * additional creator id... 
*/ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; @@ -1051,24 +1093,31 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, if (err) goto out_unlock; + ipc_lock_object(&shp->shm_perm); path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; + if (!sfd) { + path_put(&path); + goto out_nattch; + } file = alloc_file(&path, f_mode, is_file_hugepages(shp->shm_file) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); - if (IS_ERR(file)) - goto out_free; + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -1094,7 +1143,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); *raddr = addr; err = 0; @@ -1109,7 +1158,7 @@ out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; @@ -1117,20 +1166,13 @@ out_nattch: shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); - -out: + up_write(&shm_ids(ns).rwsem); return err; out_unlock: - shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - path_put(&path); - goto out_nattch; + rcu_read_unlock(); +out: + return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) @@ -1235,8 +1277,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) #else /* CONFIG_MMU */ /* under 
NOMMU conditions, the exact address to be destroyed must be * given */ - retval = -EINVAL; - if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); retval = 0; } diff --git a/ipc/util.c b/ipc/util.c index 4704223bfad4..e829da9ed01f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -15,6 +15,14 @@ * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> + * + * General sysv ipc locking scheme: + * when doing ipc id lookups, take the ids->rwsem + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) + * perform security, capabilities, auditing and permission checks, etc. + * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() + * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) */ #include <linux/mm.h> @@ -119,7 +127,7 @@ __initcall(ipc_init); void ipc_init_ids(struct ipc_ids *ids) { - init_rwsem(&ids->rw_mutex); + init_rwsem(&ids->rwsem); ids->in_use = 0; ids->seq = 0; @@ -174,7 +182,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.rw_mutex locked. + * Requires ipc_ids.rwsem locked. * Returns the LOCKED pointer to the ipc structure if found or NULL * if not. * If key is found ipc points to the owning ipc structure @@ -197,7 +205,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) continue; } - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } @@ -208,7 +217,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * ipc_get_maxid - get the last assigned id * @ids: IPC identifier set * - * Called with ipc_ids.rw_mutex held. + * Called with ipc_ids.rwsem held. */ int ipc_get_maxid(struct ipc_ids *ids) @@ -246,7 +255,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. 
The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with writer ipc_ids.rw_mutex held. + * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { @@ -312,9 +321,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, { int err; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); err = ops->getnew(ns, params); - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -331,7 +340,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, * * On success, the IPC id is returned. * - * It is called with ipc_ids.rw_mutex and ipcp->lock held. + * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, @@ -376,7 +385,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ @@ -402,7 +411,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, } ipc_unlock(ipcp); } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -413,7 +422,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * @ids: IPC identifier set * @ipcp: ipc perm structure containing the identifier to remove * - * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. 
*/ @@ -621,7 +630,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) } /** - * ipc_lock - Lock an ipc structure without rw_mutex held + * ipc_lock - Lock an ipc structure without rwsem held * @ids: IPC identifier set * @id: ipc id to look for * @@ -677,22 +686,6 @@ out: return out; } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - /** * ipcget - Common sys_*get() code * @ns : namsepace @@ -733,7 +726,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd * @ns: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -746,29 +739,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with the ipc lock held in case of success - * or an err-code without any lock held otherwise. + * - returns a pointer to the ipc object or otherwise, the corresponding error. * - * Call holding the both the rw_mutex and the rcu read lock. + * Call holding the both the rwsem and the rcu read lock. 
*/ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) -{ - struct kern_ipc_perm *ipcp; - - ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); - if (IS_ERR(ipcp)) - goto out; - - spin_lock(&ipcp->lock); -out: - return ipcp; -} - struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; @@ -846,7 +823,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } } @@ -884,7 +862,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). 
*/ - down_read(&ids->rw_mutex); + down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) @@ -911,7 +889,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - up_read(&ids->rw_mutex); + up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index b6a6a88f3002..c5f3338ba1fa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -94,10 +94,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) #define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -/* must be called with ids->rw_mutex acquired for writing */ +/* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rw_mutex acquired for reading */ +/* must be called with ids->rwsem acquired for reading */ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ @@ -131,9 +131,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. 
*/ @@ -174,19 +171,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) assert_spin_locked(&perm->lock); } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - ipc_lock_object(perm); -} - static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) { + if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } diff --git a/kernel/fork.c b/kernel/fork.c index c9eaf2013002..81ccb4f010c2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; - struct mempolicy *pol; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); @@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) + retval = vma_dup_policy(mpnt, tmp); + if (retval) goto fail_nomem_policy; - vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; @@ -472,7 +469,7 @@ out: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: - mpol_put(pol); + 
mpol_put(vma_policy(tmp)); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: @@ -1173,13 +1170,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); /* - * If the new process will be in a different pid namespace - * don't allow the creation of threads. + * If the new process will be in a different pid or user namespace + * do not allow it to share a thread group or signal handlers or + * parent with the forking task. */ - if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) - return ERR_PTR(-EINVAL); + if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } retval = security_task_create(clone_flags); if (retval) @@ -1576,15 +1576,6 @@ long do_fork(unsigned long clone_flags, long nr; /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) - return -EINVAL; - } - - /* * Determine whether and which event to report to ptracer. 
When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, if (first_colon && (!first_space || first_colon < first_space)) return parse_crashkernel_mem(ck_cmdline, system_ram, crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - return 0; + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); } /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ + struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; @@ -121,12 +122,6 @@ struct kprobe_insn_page { (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -138,8 +133,20 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { +static void *alloc_insn_page(void) +{ + return module_alloc(PAGE_SIZE); +} + +static void free_insn_page(void *page) +{ + module_free(NULL, page); +} + +struct kprobe_insn_cache kprobe_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 
.insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; + kprobe_opcode_t *slot = NULL; + mutex_lock(&c->mutex); retry: list_for_each_entry(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { @@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * c->insn_size); + slot = kip->insns + (i * c->insn_size); + goto out; } } /* kip->nused is broken. Fix it. */ @@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) - return NULL; + goto out; /* * Use module_alloc so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * so x86_64 can correctly handle the %rip-relative fixups. 
*/ - kip->insns = module_alloc(PAGE_SIZE); + kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); - return NULL; + goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + kip->cache = c; list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; + slot = kip->insns; +out: + mutex_unlock(&c->mutex); + return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ @@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) */ if (!list_is_singular(&kip->list)) { list_del(&kip->list); - module_free(NULL, kip->insns); + kip->cache->free(kip->insns); kfree(kip); } return 1; @@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + mutex_lock(&c->mutex); list_for_each_entry(kip, &c->pages, list) { long idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); @@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, collect_garbage_slots(c); } else collect_one_slot(kip, idx); - return; + goto out; } } /* Could not free this slot. 
*/ WARN_ON(1); +out: + mutex_unlock(&c->mutex); } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { +struct kprobe_insn_cache kprobe_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} #endif #endif diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -18,14 +18,14 @@ struct key *modsign_keyring; -extern __initdata const u8 modsign_certificate_list[]; -extern __initdata const u8 modsign_certificate_list_end[]; +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice * if modsign.pub changes. 
*/ -static __initdata const char annoy_ccache[] = __TIME__ "foo"; +static __initconst const char annoy_ccache[] = __TIME__ "foo"; /* * Load the compiled-in keys diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -123,10 +123,14 @@ void panic(const char *fmt, ...) */ smp_send_stop(); - kmsg_dump(KMSG_DUMP_PANIC); - + /* + * Run any panic handlers, including those that might need to + * add information to the kmsg dump output. + */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); if (!panic_blink) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..358a146fd4da 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -884,7 +884,7 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; @@ -948,7 +948,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -1041,7 +1041,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < 
max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1093,7 +1093,7 @@ void swsusp_free(void) unsigned long pfn, max_zone_pfn; for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1755,7 +1755,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Clear page flags */ for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) */ int dumpable = 0; /* Don't let security modules deny introspection */ - if (task == current) + if (same_thread_group(task, current)) return 0; rcu_read_lock(); tcred = __task_cred(task); diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; sigset_from_compat(&new_ka.sa.sa_mask, &mask); @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef 
__ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); diff --git a/kernel/smp.c b/kernel/smp.c index 449b707fc20d..0564571dcdf7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { + free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } @@ -572,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); * * If @wait is true, then returns once @func has returned. * - * You must not call this function with disabled interrupts or - * from a hardware interrupt handler or from a bottom half handler. + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. */ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -582,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, smp_call_function_many(mask, func, info, wait); if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); + unsigned long flags; + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); } put_cpu(); } diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -34,6 +34,20 @@ #else #define raw_read_can_lock(l) read_can_lock(l) #define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. 
+ */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..dc69093a8ec4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = { .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, + .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -4,6 +4,23 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/** + * task_work_add - ask the @task to execute @work->func() + * @task: the task which should run the callback + * @work: the callback to run + * @notify: send the notification if true + * + * Queue @work for task_work_run() below and notify the @task if @notify. + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task returns from kernel + * mode or exits. + * + * This is like the signal handler which runs in kernel mode, but it doesn't + * try to wake up the @task. + * + * RETURNS: + * 0 if succeeds or -ESRCH. 
+ */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { @@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) return 0; } +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @func: identifies the work to remove + * + * Find the last queued pending work with ->func == @func and remove + * it from queue. + * + * RETURNS: + * The found work or NULL if not found. + */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; - struct callback_head *work = NULL; + struct callback_head *work; unsigned long flags; /* * If cmpxchg() fails we continue without updating pprev. @@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { - read_barrier_depends(); + smp_read_barrier_depends(); if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) return work; } +/** + * task_work_run - execute the works added by task_work_add() + * + * Flush the pending works. Should be used by the core kernel code. + * Called before the task returns to the user-mode or stops, or when + * it exits. In the latter case task_work_add() can no longer add the + * new work after task_work_run() returns. 
+ */ void task_work_run(void) { struct task_struct *task = current; diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -10,12 +10,64 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { + unsigned long flags; + WARN_ON(cpu != 0); - local_irq_disable(); - (func)(info); - local_irq_enable(); + local_irq_save(flags); + func(info); + local_irq_restore(flags); return 0; } EXPORT_SYMBOL(smp_call_function_single); + +int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} +EXPORT_SYMBOL(on_each_cpu); + +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + unsigned long flags; + + if (cpumask_test_cpu(0, mask)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + unsigned long flags; + + preempt_disable(); + if (cond_func(0, info)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + preempt_enable(); +} +EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 652bea9054f0..c9eef36739a9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1461,7 +1461,7 @@ config BACKTRACE_SELF_TEST config RBTREE_TEST tristate "Red-Black tree test" - depends on m && DEBUG_KERNEL + depends on DEBUG_KERNEL help A benchmark measuring the performance of the rbtree library. 
Also includes rbtree invariant checks. diff --git a/lib/crc32.c b/lib/crc32.c index 072fbd8234d5..410093dbe51c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -131,11 +131,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #endif /** - * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 - * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for - * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC is run + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p + * @tab: little-endian Ethernet table + * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], @@ -201,11 +204,13 @@ EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); /** - * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. 
- * @p: pointer to buffer over which CRC is run + * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p + * @tab: big-endian Ethernet table + * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 19ff89e34eec..d619b28c456f 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -48,7 +48,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, out_len = 0x8000; /* 32 K */ out_buf = malloc(out_len); } else { - out_len = 0x7fffffff; /* no limit */ + out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */ } if (!out_buf) { error("Out of memory while allocating output buffer"); diff --git a/lib/genalloc.c b/lib/genalloc.c index b35cfa9bc3d4..26cf20be72b7 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -37,6 +37,11 @@ #include <linux/of_address.h> #include <linux/of_device.h> +static inline size_t chunk_size(const struct gen_pool_chunk *chunk) +{ + return chunk->end_addr - chunk->start_addr + 1; +} + static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { unsigned long val, nval; @@ -182,13 +187,13 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy int nbytes = sizeof(struct gen_pool_chunk) + BITS_TO_LONGS(nbits) * sizeof(long); - chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid); + chunk = kzalloc_node(nbytes, GFP_KERNEL, nid); if (unlikely(chunk == NULL)) return -ENOMEM; chunk->phys_addr = phys; chunk->start_addr = virt; - chunk->end_addr = virt + size; + chunk->end_addr = virt + size - 1; atomic_set(&chunk->avail, size); spin_lock(&pool->lock); @@ -213,7 +218,7 @@ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { + if (addr >= 
chunk->start_addr && addr <= chunk->end_addr) { paddr = chunk->phys_addr + (addr - chunk->start_addr); break; } @@ -242,7 +247,7 @@ void gen_pool_destroy(struct gen_pool *pool) chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); list_del(&chunk->next_chunk); - end_bit = (chunk->end_addr - chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; bit = find_next_bit(chunk->bits, end_bit, 0); BUG_ON(bit < end_bit); @@ -283,7 +288,7 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) if (size > atomic_read(&chunk->avail)) continue; - end_bit = (chunk->end_addr - chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; retry: start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, pool->data); @@ -330,8 +335,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { - BUG_ON(addr + size > chunk->end_addr); + if (addr >= chunk->start_addr && addr <= chunk->end_addr) { + BUG_ON(addr + size - 1 > chunk->end_addr); start_bit = (addr - chunk->start_addr) >> order; remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); @@ -400,7 +405,7 @@ size_t gen_pool_size(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - size += chunk->end_addr - chunk->start_addr; + size += chunk_size(chunk); rcu_read_unlock(); return size; } @@ -519,7 +524,6 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, /** * dev_get_gen_pool - Obtain the gen_pool (if any) for a device * @dev: device to retrieve the gen_pool from - * @name: Optional name for the gen_pool, usually NULL * * Returns the gen_pool for the device if one is present, or NULL. 
*/ diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 411be80ddb46..df6839e3ce08 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -283,8 +283,8 @@ _output_error: return (int) (-(((char *) ip) - source)); } -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len) +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; @@ -302,8 +302,8 @@ exit_0: EXPORT_SYMBOL(lz4_decompress); #endif -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len) +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) { int ret = -1; int out_len = 0; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e7964296fd50..7811ed3b4e70 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -32,6 +32,7 @@ #include <linux/string.h> #include <linux/bitops.h> #include <linux/rcupdate.h> +#include <linux/hardirq.h> /* in_interrupt() */ #ifdef __KERNEL__ @@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root) struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - if (!(gfp_mask & __GFP_WAIT)) { + /* + * Preload code isn't irq safe and it doesn't make sence to use + * preloading in the interrupt anyway as all the allocations have to + * be atomic. So just do normal allocation when in interrupt. + */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). 
*/ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -288,9 +294,40 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); /* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + +/* * Return the maximum key which can be store into a * radix tree with height HEIGHT. 
*/ diff --git a/lib/rbtree.c b/lib/rbtree.c index c0e31fe2fabf..65f4effd117f 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -518,3 +518,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, *new = *victim; } EXPORT_SYMBOL(rb_replace_node); + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} +EXPORT_SYMBOL(rb_next_postorder); + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} +EXPORT_SYMBOL(rb_first_postorder); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 122f02f9941b..31dd4ccd3baa 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb) return count; } +static void check_postorder(int nr_nodes) +{ + struct rb_node *rb; + int count = 0; + for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb)) + count++; + + WARN_ON_ONCE(count != nr_nodes); +} + static void check(int nr_nodes) { struct rb_node *rb; @@ -136,6 +146,8 @@ static void check(int nr_nodes) WARN_ON_ONCE(count != nr_nodes); WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); + + check_postorder(nr_nodes); } static void 
check_augmented(int nr_nodes) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 37d9edcd14cf..ce682f7a4f29 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, { char kbuf[] = "0\n"; - if (*ppos) { + if (*ppos || *lenp < sizeof(kbuf)) { *lenp = 0; return 0; } diff --git a/mm/compaction.c b/mm/compaction.c index 05ccb4cc0bdb..c43789388cd8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order) .sync = false, }; + if (!order) + return; + __compact_pgdat(pgdat, &cc); } diff --git a/mm/filemap.c b/mm/filemap.c index 731a2c24532d..e607728db4a8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (error) goto out; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a92012a71702..963e14c0486f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -417,7 +417,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -444,7 +444,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -470,7 +470,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long pages; - err = strict_strtoul(buf, 10, &pages); + err = kstrtoul(buf, 10, &pages); if (err || !pages || pages > UINT_MAX) return -EINVAL; @@ -538,7 +538,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, int err; unsigned 
long max_ptes_none; - err = strict_strtoul(buf, 10, &max_ptes_none); + err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR-1) return -EINVAL; @@ -2296,6 +2296,8 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b60f33080a28..b49579c7f2a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/page-isolation.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -33,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. */ DEFINE_SPINLOCK(hugetlb_lock); @@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * across the pages in a mapping. * * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantion_mutex. To access or modify a region the caller + * and the hugetlb_instantiation_mutex. 
To access or modify a region the caller * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation mutex: + * the hugetlb_instantiation_mutex: * * down_write(&mm->mmap_sem); * or @@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct hstate *h, - struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_NORESERVE) - return; - - if (vma->vm_flags & VM_MAYSHARE) { - /* Shared mappings always use reserves */ - h->resv_huge_pages--; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - h->resv_huge_pages--; - } -} - /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { @@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma, long chg) { + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here. + */ + if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } + + /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings. 
+ */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; + return 0; } @@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; @@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. */ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve) + unsigned long address, int avoid_reserve, + long chg) { struct page *page = NULL; struct mempolicy *mpol; @@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zoneref *z; unsigned int cpuset_mems_cookie; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are * not "stolen". 
The child may still get SIGKILLed */ - if (!vma_has_reserves(vma) && + if (!vma_has_reserves(vma, chg) && h->free_huge_pages - h->resv_huge_pages == 0) goto err; @@ -556,13 +573,23 @@ retry_cpuset: if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + SetPagePrivate(page); + h->resv_huge_pages--; break; } } @@ -574,7 +601,6 @@ retry_cpuset: return page; err: - mpol_cond_put(mpol); return NULL; } @@ -620,15 +646,20 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -715,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -772,33 +803,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, return nid; } -static 
int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) -{ - struct page *page; - int start_nid; - int next_nid; - int ret = 0; - - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - next_nid = start_nid; - - do { - page = alloc_fresh_huge_page_node(h, next_nid); - if (page) { - ret = 1; - break; - } - next_nid = hstate_next_node_to_alloc(h, nodes_allowed); - } while (next_nid != start_nid); - - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - - return ret; -} - /* * helper for free_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the @@ -817,6 +821,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + /* * Free huge page from pool from next node to free. 
* Attempt to keep persistent huge pages more or less @@ -826,40 +864,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { - int start_nid; - int next_nid; + int nr_nodes, node; int ret = 0; - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && - !list_empty(&h->hugepage_freelists[next_nid])) { + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { struct page *page = - list_entry(h->hugepage_freelists[next_nid].next, + list_entry(h->hugepage_freelists[node].next, struct page, lru); list_del(&page->lru); h->free_huge_pages--; - h->free_huge_pages_node[next_nid]--; + h->free_huge_pages_node[node]--; if (acct_surplus) { h->surplus_huge_pages--; - h->surplus_huge_pages_node[next_nid]--; + h->surplus_huge_pages_node[node]--; } update_and_free_page(h, page); ret = 1; break; } - next_nid = hstate_next_node_to_free(h, nodes_allowed); - } while (next_nid != start_nid); + } return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. 
+ * Note that start_pfn should aligned with (minimum) hugepage size. + */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -902,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -944,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_node(h, nid); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page) @@ -1035,11 +1107,8 @@ free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ - if (!list_empty(&surplus_list)) { - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - put_page(page); - } - } + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); spin_lock(&hugetlb_lock); return ret; @@ -1106,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h, } else { long err; pgoff_t 
idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&reservations->regions, idx, idx + 1); + err = region_chg(&resv->regions, idx, idx + 1); if (err < 0) return err; return 0; @@ -1126,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h, } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. */ - region_add(&reservations->regions, idx, idx + 1); + region_add(&resv->regions, idx, idx + 1); } } @@ -1155,38 +1224,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-ENOMEM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) { - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - if (page) { - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); - spin_unlock(&hugetlb_lock); - } else { + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); list_move(&page->lru, 
&h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + /* Fall through */ } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); @@ -1194,17 +1260,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_MEMORY]); + int nr_nodes, node; - while (nr_nodes) { + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_MEMORY])), + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1216,7 +1294,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - nr_nodes--; } return 0; @@ -1355,48 +1432,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { - int start_nid, next_nid; - int ret = 0; + int nr_nodes, node; VM_BUG_ON(delta != -1 && delta != 1); - if (delta < 0) - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - else - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { - int nid = next_nid; - if (delta < 0) { - /* - * To shrink on this node, there must be a surplus page - */ - if (!h->surplus_huge_pages_node[nid]) { - 
next_nid = hstate_next_node_to_alloc(h, - nodes_allowed); - continue; - } + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; } - if (delta > 0) { - /* - * Surplus cannot exceed the total number of pages - */ - if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h, - nodes_allowed); - continue; - } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; } + } + return 0; - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (next_nid != start_nid); - - return ret; +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) @@ -1526,7 +1583,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &count); + err = kstrtoul(buf, 10, &count); if (err) goto out; @@ -1617,7 +1674,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (h->order >= MAX_ORDER) return -EINVAL; - err = strict_strtoul(buf, 10, &input); + err = kstrtoul(buf, 10, &input); if (err) return err; @@ -2068,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t 
*ppos) @@ -2207,7 +2252,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2217,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (reservations) - kref_get(&reservations->refs); + if (resv) + kref_get(&resv->refs); } static void resv_map_put(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - if (!reservations) + if (!resv) return; - kref_put(&reservations->refs, resv_map_release); + kref_put(&resv->refs, resv_map_release); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; - if (reservations) { + if (resv) { start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&reservations->regions, start, end); + region_count(&resv->regions, start, end); resv_map_put(vma); @@ -2557,7 +2602,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int avoidcopy; int outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2567,10 +2611,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_mapcount(old_page) == 
1); - if (avoidcopy) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2584,8 +2626,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2657,6 +2698,8 @@ retry_avoidcopy: spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -2668,10 +2711,11 @@ retry_avoidcopy: } spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return 0; } @@ -2767,6 +2811,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2813,8 +2858,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) @@ -3431,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON(!PageHead(page)); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + 
spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepage (1GB + * for x86_64,) a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * Refcount of a hwpoisoned hugepages is 1, but they are not active, + * so we should return false for them. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +} diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 3a61efc518d5..afc2daa91c60 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -88,12 +88,12 @@ static int pfn_inject_init(void) * hardware status change, hence do not require hardware support. * They are mainly for testing hwpoison in software level. 
*/ - dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, NULL, &hwpoison_fops); if (!dentry) goto fail; - dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, NULL, &unpoison_fops); if (!dentry) goto fail; diff --git a/mm/internal.h b/mm/internal.h index 4390ac6c106e..684f7aa9692a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern unsigned long zone_reclaimable_pages(struct zone *zone); +extern bool zone_reclaimable(struct zone *zone); /* * in mm/rmap.c: diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c8d7f3110fd0..e126b0ef9ad2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - ret = strict_strtoul(buf + 5, 0, &secs); + ret = kstrtoul(buf + 5, 0, &secs); if (ret < 0) goto out; stop_scan_thread(); @@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long nr_pages; - err = strict_strtoul(buf, 10, &nr_pages); + err = kstrtoul(buf, 10, &nr_pages); if (err || nr_pages > UINT_MAX) return -EINVAL; @@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, int err; unsigned long flags; - err = strict_strtoul(buf, 10, &flags); + err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; if (flags > KSM_RUN_UNMERGE) diff --git a/mm/madvise.c b/mm/madvise.c index 
7055883e6e25..6975bc812542 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior) * We can potentially split a vm area into separate * areas, each area with its own behavior. */ -static long madvise_behavior(struct vm_area_struct * vma, +static long madvise_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; int error = 0; pgoff_t pgoff; unsigned long new_flags = vma->vm_flags; @@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; @@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
*/ -static long madvise_dontneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; @@ -343,29 +343,34 @@ static long madvise_remove(struct vm_area_struct *vma, */ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) { - int ret = 0; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += PAGE_SIZE) { struct page *p; - int ret = get_user_pages_fast(start, 1, 0, &p); + int ret; + + ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + + if (PageHWPoison(p)) { + put_page(p); + continue; + } if (bhv == MADV_SOFT_OFFLINE) { - printk(KERN_INFO "Soft offlining page %lx at %lx\n", + pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); ret = soft_offline_page(p, MF_COUNT_INCREASED); if (ret) - break; + return ret; continue; } - printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); /* Ignore return value for now */ memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } - return ret; + return 0; } #endif @@ -459,7 +464,7 @@ madvise_behavior_valid(int behavior) SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; - struct vm_area_struct * vma, *prev; + struct vm_area_struct *vma, *prev; int unmapped_error = 0; int error = -EINVAL; int write; diff --git a/mm/memblock.c b/mm/memblock.c index a847bfe6f3ba..0ac412a0a7ee 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + 
int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + + if (mid == -1) + return -1; + + *start_pfn = type->regions[mid].base >> PAGE_SHIFT; + *end_pfn = (type->regions[mid].base + type->regions[mid].size) + >> PAGE_SHIFT; + + return type->regions[mid].nid; +} +#endif + /** * memblock_is_region_memory - check if a region is a subset of memory * @base: base of region to check diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b83957b6439..c6bd28edd533 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); - size += sizeof(struct memcg_cache_params); + size += offsetof(struct memcg_cache_params, memcg_caches); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) { @@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { - size_t size = sizeof(struct memcg_cache_params); + size_t size; if (!memcg_kmem_enabled()) return 0; - if (!memcg) + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) @@ -5588,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b) const struct mem_cgroup_threshold *_a = a; const struct mem_cgroup_threshold *_b = b; - return _a->threshold - _b->threshold; + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; } static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d84c5e5331bb..d472e14c6808 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -206,7 +206,7 @@ static int 
kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t == current) { si.si_code = BUS_MCEERR_AR; @@ -983,7 +983,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -991,7 +991,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -1204,6 +1204,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) for (ps = error_states;; ps++) if ((p->flags & ps->mask) == ps->res) break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + if (!ps->mask) for (ps = error_states;; ps++) if ((page_flags & ps->mask) == ps->res) @@ -1339,7 +1342,17 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_trans_order(page); + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. 
+ */ + if (PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { /* @@ -1353,7 +1366,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); + atomic_long_dec(&num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1375,7 +1388,7 @@ int unpoison_memory(unsigned long pfn) unlock_page(page); put_page(page); - if (freeit) + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); return 0; @@ -1416,7 +1429,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * was free. This flag should be kept set until the source page * is freed and PG_hwpoison on it is set. */ - set_migratetype_isolate(p, true); + if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) + set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. @@ -1470,6 +1484,7 @@ static int soft_offline_huge_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); /* * This double-check of PageHWPoison is to avoid the race with @@ -1485,86 +1500,29 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC); - put_page(hpage); + list_move(&hpage->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. 
+ */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; } else { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } return ret; } -static int __soft_offline_page(struct page *page, int flags); - -/** - * soft_offline_page - Soft offline a page. - * @page: page to offline - * @flags: flags. Same as memory_failure(). - * - * Returns 0 on success, otherwise negated errno. - * - * Soft offline a page, by migration or invalidation, - * without killing anything. This is for the case when - * a page is not corrupted yet (so it's still valid to access), - * but has had a number of corrected errors and is better taken - * out. - * - * The actual policy on when to do that is maintained by - * user space. - * - * This should never impact any application or cause data loss, - * however it might take some time. - * - * This is not a 100% solution for all memory, but tries to be - * ``good enough'' for the majority of memory. 
- */ -int soft_offline_page(struct page *page, int flags) -{ - int ret; - unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); - - if (PageHWPoison(page)) { - pr_info("soft offline: %#lx page already poisoned\n", pfn); - return -EBUSY; - } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - return -EBUSY; - } - } - - ret = get_any_page(page, pfn, flags); - if (ret < 0) - return ret; - if (ret) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), - &num_poisoned_pages); - } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); - } - } - unset_migratetype_isolate(page, MIGRATE_MOVABLE); - return ret; -} - static int __soft_offline_page(struct page *page, int flags) { int ret; @@ -1651,3 +1609,67 @@ static int __soft_offline_page(struct page *page, int flags) } return ret; } + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. 
+ */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + goto unset; + if (ret) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } +unset: + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; +} diff --git a/mm/memory.c b/mm/memory.c index b3c6bf9a398e..2b73dbde2274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -373,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ /* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ @@ -1505,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); + if (flags & FOLL_GET) + goto out; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } @@ -1516,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } goto out; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..0eb1a1df649d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -30,6 +30,7 @@ #include <linux/mm_inline.h> #include <linux/firmware-map.h> #include <linux/stop_machine.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> @@ -194,7 +195,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) zone = &pgdat->node_zones[0]; for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone->wait_table) { + if (zone_is_initialized(zone)) { nr_pages = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; @@ -229,8 +230,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) + old_zone_end_pfn = zone_end_pfn(zone); + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = 
max(old_zone_end_pfn, end_pfn) - @@ -305,7 +306,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, goto out_fail; /* use start_pfn for z1's start_pfn if z1 is empty */ - if (z1->spanned_pages) + if (!zone_is_empty(z1)) z1_start_pfn = z1->zone_start_pfn; else z1_start_pfn = start_pfn; @@ -347,7 +348,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, goto out_fail; /* use end_pfn for z2's end_pfn if z2 is empty */ - if (z2->spanned_pages) + if (!zone_is_empty(z2)) z2_end_pfn = zone_end_pfn(z2); else z2_end_pfn = end_pfn; @@ -514,8 +515,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone, static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; unsigned long pfn; struct mem_section *ms; int nid = zone_to_nid(zone); @@ -1069,6 +1071,23 @@ out: return ret; } +static int check_hotplug_memory_range(u64 start, u64 size) +{ + u64 start_pfn = start >> PAGE_SHIFT; + u64 nr_pages = size >> PAGE_SHIFT; + + /* Memory range must be aligned with section */ + if ((start_pfn & ~PAGE_SECTION_MASK) || + (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { + pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", + (unsigned long long)start, + (unsigned long long)size); + return -EINVAL; + } + + return 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -1078,6 +1097,10 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; + lock_memory_hotplug(); res = 
register_memory_resource(start, size); @@ -1208,10 +1231,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) } /* - * Scanning pfn is much easier than scanning lru list. - * Scan pfn from start to end and Find LRU page. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages + * and hugepages). We scan pfn because it's much easier than scanning over + * linked list. This function returns the pfn of the first found movable + * page if it's found, otherwise 0. */ -static unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -1220,6 +1245,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (PageHuge(page)) { + if (is_hugepage_active(page)) + return pfn; + else + pfn = round_up(pfn + 1, + 1 << compound_order(page)) - 1; + } } } return 0; @@ -1240,6 +1272,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); + + if (PageHuge(page)) { + struct page *head = compound_head(page); + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; + if (compound_order(head) > PFN_SECTION_SHIFT) { + ret = -EBUSY; + break; + } + if (isolate_huge_page(page, &source)) + move_pages -= 1 << compound_order(head); + continue; + } + if (!get_page_unless_zero(page)) continue; /* @@ -1272,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } if (!list_empty(&source)) { if (not_managed) { - putback_lru_pages(&source); + putback_movable_pages(&source); goto out; } @@ -1283,7 +1328,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migrate_target, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) - putback_lru_pages(&source); + putback_movable_pages(&source); } out: return 
ret; @@ -1472,7 +1517,6 @@ static int __ref __offline_pages(unsigned long start_pfn, struct zone *zone; struct memory_notify arg; - BUG_ON(start_pfn >= end_pfn); /* at least, alignment against pageblock is necessary */ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) return -EINVAL; @@ -1527,8 +1571,8 @@ repeat: drain_all_pages(); } - pfn = scan_lru_pages(start_pfn, end_pfn); - if (pfn) { /* We have page on LRU */ + pfn = scan_movable_pages(start_pfn, end_pfn); + if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); if (!ret) { drain = 1; @@ -1547,6 +1591,11 @@ repeat: yield(); /* drain pcp pages, this is synchronous. */ drain_all_pages(); + /* + * dissolve free hugepages in the memory block before doing offlining + * actually in order to make hugetlbfs's object counting consistent. + */ + dissolve_free_huge_pages(start_pfn, end_pfn); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { @@ -1674,9 +1723,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) return ret; } -static int check_cpu_on_node(void *data) +static int check_cpu_on_node(pg_data_t *pgdat) { - struct pglist_data *pgdat = data; int cpu; for_each_present_cpu(cpu) { @@ -1691,10 +1739,9 @@ static int check_cpu_on_node(void *data) return 0; } -static void unmap_cpu_on_node(void *data) +static void unmap_cpu_on_node(pg_data_t *pgdat) { #ifdef CONFIG_ACPI_NUMA - struct pglist_data *pgdat = data; int cpu; for_each_possible_cpu(cpu) @@ -1703,10 +1750,11 @@ static void unmap_cpu_on_node(void *data) #endif } -static int check_and_unmap_cpu_on_node(void *data) +static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) { - int ret = check_cpu_on_node(data); + int ret; + ret = check_cpu_on_node(pgdat); if (ret) return ret; @@ -1715,11 +1763,18 @@ static int check_and_unmap_cpu_on_node(void *data) * the cpu_to_node() now. 
*/ - unmap_cpu_on_node(data); + unmap_cpu_on_node(pgdat); return 0; } -/* offline the node if all memory sections of this node are removed */ +/** + * try_offline_node + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -1745,7 +1800,7 @@ void try_offline_node(int nid) return; } - if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) + if (check_and_unmap_cpu_on_node(pgdat)) return; /* @@ -1782,10 +1837,19 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +/** + * remove_memory + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ void __ref remove_memory(int nid, u64 start, u64 size) { int ret; + BUG_ON(check_hotplug_memory_range(start, size)); + lock_memory_hotplug(); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..04729647f359 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; static struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; - int node; if (!pol) { - node = numa_node_id(); - if (node != NUMA_NO_NODE) - pol = &preferred_node_policy[node]; + int node = numa_node_id(); - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* + * preferred_node_policy is not initialised early in + * boot + */ + if (!pol->mode) + pol = NULL; + } } return pol; @@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan 
through pages checking if pages follow certain conditions. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -512,7 +518,31 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, + pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, + void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE + int nid; + struct page *page; + + spin_lock(&vma->vm_mm->page_table_lock); + page = pte_page(huge_ptep_get((pte_t *)pmd)); + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, private); +unlock: + spin_unlock(&vma->vm_mm->page_table_lock); +#else + BUG(); +#endif +} + +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -523,17 +553,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + continue; + if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { + queue_pages_hugetlb_pmd_range(vma, pmd, nodes, + flags, private); + continue; + } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes, + if (queue_pages_pte_range(vma, pmd, addr, next, nodes, flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -544,16 +581,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) + continue; if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes, + if (queue_pages_pmd_range(vma, pud, addr, next, nodes, flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -566,7 +605,7 @@ static inline int 
check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes, + if (queue_pages_pud_range(vma, pgd, addr, next, nodes, flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); @@ -604,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) */ static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { int err; @@ -635,9 +676,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); } - if (is_vm_hugetlb_page(vma)) - goto next; - if (flags & MPOL_MF_LAZY) { change_prot_numa(vma, start, endvma); goto next; @@ -647,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && vma_migratable(vma))) { - err = check_pgd_range(vma, start, endvma, nodes, + err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { first = ERR_PTR(err); @@ -990,7 +1028,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else 
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1013,14 +1055,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; @@ -1154,10 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } - /* - * if !vma, alloc_page_vma() will use task or system default policy + * queue_pages_range() confirms that @page belongs to some vma, + * so vma shouldn't be NULL. */ + BUG_ON(!vma); + + if (PageHuge(page)) + return alloc_huge_page_noerr(vma, address, 1); return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else @@ -1249,7 +1295,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = check_range(mm, start, end, nmask, + vma = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... 
*/ @@ -1265,7 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } if (nr_failed && (flags & MPOL_MF_STRICT)) @@ -2065,6 +2111,16 @@ retry_cpuset: } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() diff --git a/mm/mempool.c b/mm/mempool.c index 54990476c049..659aa42bad16 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, gfp_t gfp_mask, int node_id) { mempool_t *pool; - pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); + pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; pool->elements = kmalloc_node(min_nr * sizeof(void *), diff --git a/mm/migrate.c b/mm/migrate.c index 6f0c24438bba..b7ded7eafe3a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l) struct page *page2; list_for_each_entry_safe(page, page2, l, lru) { + if (unlikely(PageHuge(page))) { + putback_active_hugepage(page); + continue; + } list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); @@ -945,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *new_hpage = get_new_page(hpage, private, &result); struct anon_vma *anon_vma = NULL; + /* + * Movability of hugepages depends on architectures and hugepage size. 
+ * This check is necessary because some callers of hugepage migration + * like soft offline and memory hotremove don't walk through page + * tables or check whether the hugepage is pmd-based or not before + * kicking migration. + */ + if (!hugepage_migration_support(page_hstate(hpage))) + return -ENOSYS; + if (!new_hpage) return -ENOMEM; @@ -975,6 +989,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, unlock_page(hpage); out: + if (rc != -EAGAIN) + putback_active_hugepage(hpage); put_page(new_hpage); if (result) { if (rc) @@ -1025,7 +1041,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - rc = unmap_and_move(get_new_page, private, + if (PageHuge(page)) + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, mode); + else + rc = unmap_and_move(get_new_page, private, page, pass > 2, mode); switch(rc) { @@ -1058,32 +1078,6 @@ out: return rc; } -int migrate_huge_page(struct page *hpage, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode) -{ - int pass, rc; - - for (pass = 0; pass < 10; pass++) { - rc = unmap_and_move_huge_page(get_new_page, private, - hpage, pass > 2, mode); - switch (rc) { - case -ENOMEM: - goto out; - case -EAGAIN: - /* try again */ - cond_resched(); - break; - case MIGRATEPAGE_SUCCESS: - goto out; - default: - rc = -EIO; - goto out; - } - } -out: - return rc; -} - #ifdef CONFIG_NUMA /* * Move a list of individual pages @@ -1108,7 +1102,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_exact_node(pm->node, + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + pm->node); + else + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } @@ -1168,6 +1166,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, !migrate_all) goto put_and_set; + if (PageHuge(page)) { + 
isolate_huge_page(page, &pagelist); + goto put_and_set; + } + err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); @@ -1190,7 +1193,7 @@ set_status: err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } up_read(&mm->mmap_sem); @@ -1468,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages. */ diff --git a/mm/mlock.c b/mm/mlock.c index 79b7cf7d1bca..d63802663242 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -11,6 +11,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> @@ -18,6 +19,8 @@ #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> +#include <linux/memcontrol.h> +#include <linux/mm_inline.h> #include "internal.h" @@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page) } } +/* + * Finish munlock after successful page isolation + * + * Page must be locked. This is a wrapper for try_to_munlock() + * and putback_lru_page() with munlock accounting. + */ +static void __munlock_isolated_page(struct page *page) +{ + int ret = SWAP_AGAIN; + + /* + * Optimization: if the page was mapped just once, that's our mapping + * and we don't need to check all the other vmas. + */ + if (page_mapcount(page) > 1) + ret = try_to_munlock(page); + + /* Did try_to_unlock() succeed or punt? */ + if (ret != SWAP_MLOCK) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); +} + +/* + * Accounting for page isolation fail during munlock + * + * Performs accounting when page isolation fails in munlock. 
There is nothing + * else to do because it means some other task has already removed the page + * from the LRU. putback_lru_page() will take care of removing the page from + * the unevictable list, if necessary. vmscan [page_referenced()] will move + * the page back to the unevictable list if some other vma has it mlocked. + */ +static void __munlock_isolation_failed(struct page *page) +{ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); +} + /** * munlock_vma_page - munlock a vma page * @page - page to be unlocked @@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages = hpage_nr_pages(page); mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); page_mask = nr_pages - 1; - if (!isolate_lru_page(page)) { - int ret = SWAP_AGAIN; - - /* - * Optimization: if the page was mapped just once, - * that's our mapping and we don't need to check all the - * other vmas. - */ - if (page_mapcount(page) > 1) - ret = try_to_munlock(page); - /* - * did try_to_unlock() succeed or punt? - */ - if (ret != SWAP_MLOCK) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - - putback_lru_page(page); - } else { - /* - * Some other task has removed the page from the LRU. - * putback_lru_page() will take care of removing the - * page from the unevictable list, if necessary. - * vmscan [page_referenced()] will move the page back - * to the unevictable list if some other vma has it - * mlocked. - */ - if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); - else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - } + if (!isolate_lru_page(page)) + __munlock_isolated_page(page); + else + __munlock_isolation_failed(page); } return page_mask; @@ -210,6 +227,191 @@ static int __mlock_posix_error_return(long retval) } /* + * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() + * + * The fast path is available only for evictable pages with single mapping. 
+ * Then we can bypass the per-cpu pvec and get better performance. + * when mapcount > 1 we need try_to_munlock() which can fail. + * when !page_evictable(), we need the full redo logic of putback_lru_page to + * avoid leaving evictable page in unevictable list. + * + * In case of success, @page is added to @pvec and @pgrescued is incremented + * in case that the page was previously unevictable. @page is also unlocked. + */ +static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, + int *pgrescued) +{ + VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(!PageLocked(page)); + + if (page_mapcount(page) <= 1 && page_evictable(page)) { + pagevec_add(pvec, page); + if (TestClearPageUnevictable(page)) + (*pgrescued)++; + unlock_page(page); + return true; + } + + return false; +} + +/* + * Putback multiple evictable pages to the LRU + * + * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of + * the pages might have meanwhile become unevictable but that is OK. + */ +static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +{ + count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); + /* + *__pagevec_lru_add() calls release_pages() so we don't call + * put_page() explicitly + */ + __pagevec_lru_add(pvec); + count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +} + +/* + * Munlock a batch of pages from the same zone + * + * The work is split to two main phases. First phase clears the Mlocked flag + * and attempts to isolate the pages, all under a single zone lru lock. + * The second phase finishes the munlock only for pages where isolation + * succeeded. + * + * Note that the pagevec may be modified during the process. 
+ */ +static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +{ + int i; + int nr = pagevec_count(pvec); + int delta_munlocked = -nr; + struct pagevec pvec_putback; + int pgrescued = 0; + + /* Phase 1: page isolation */ + spin_lock_irq(&zone->lru_lock); + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (TestClearPageMlocked(page)) { + struct lruvec *lruvec; + int lru; + + if (PageLRU(page)) { + lruvec = mem_cgroup_page_lruvec(page, zone); + lru = page_lru(page); + /* + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. + */ + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, lru); + } else { + __munlock_isolation_failed(page); + goto skip_munlock; + } + + } else { +skip_munlock: + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. + */ + pvec->pages[i] = NULL; + put_page(page); + delta_munlocked++; + } + } + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + spin_unlock_irq(&zone->lru_lock); + + /* Phase 2: page munlock */ + pagevec_init(&pvec_putback, 0); + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) { + lock_page(page); + if (!__putback_lru_fast_prepare(page, &pvec_putback, + &pgrescued)) { + /* + * Slow path. 
We don't want to lose the last + * pin before unlock_page() + */ + get_page(page); /* for putback_lru_page() */ + __munlock_isolated_page(page); + unlock_page(page); + put_page(page); /* from follow_page_mask() */ + } + } + } + + /* + * Phase 3: page putback for pages that qualified for the fast path + * This will also call put_page() to return pin from follow_page_mask() + */ + if (pagevec_count(&pvec_putback)) + __putback_lru_fast(&pvec_putback, pgrescued); +} + +/* + * Fill up pagevec for __munlock_pagevec using pte walk + * + * The function expects that the struct page corresponding to @start address is + * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. + * + * The rest of @pvec is filled by subsequent pages within the same pmd and same + * zone, as long as the pte's are present and vm_normal_page() succeeds. These + * pages also get pinned. + * + * Returns the address of the next page that should be scanned. This equals + * @start + PAGE_SIZE when no page could be added by the pte walk. + */ +static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, + struct vm_area_struct *vma, int zoneid, unsigned long start, + unsigned long end) +{ + pte_t *pte; + spinlock_t *ptl; + + /* + * Initialize pte walk starting at the already pinned page where we + * are sure that there is a pte. 
+ */ + pte = get_locked_pte(vma->vm_mm, start, &ptl); + end = min(end, pmd_addr_end(start, end)); + + /* The page next to the pinned page is the first we will try to get */ + start += PAGE_SIZE; + while (start < end) { + struct page *page = NULL; + pte++; + if (pte_present(*pte)) + page = vm_normal_page(vma, start, *pte); + /* + * Break if page could not be obtained or the page's node+zone does not + * match + */ + if (!page || page_zone_id(page) != zoneid) + break; + + get_page(page); + /* + * Increase the address that will be returned *before* the + * eventual break due to pvec becoming full by adding the page + */ + start += PAGE_SIZE; + if (pagevec_add(pvec, page) == 0) + break; + } + pte_unmap_unlock(pte, ptl); + return start; +} + +/* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. * @start - start address in @vma of the range @@ -233,9 +435,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, vma->vm_flags &= ~VM_LOCKED; while (start < end) { - struct page *page; + struct page *page = NULL; unsigned int page_mask, page_increm; + struct pagevec pvec; + struct zone *zone; + int zoneid; + pagevec_init(&pvec, 0); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -244,21 +450,45 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * has sneaked into the range, we won't oops here: great). */ page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, - &page_mask); + &page_mask); + if (page && !IS_ERR(page)) { - lock_page(page); - lru_add_drain(); - /* - * Any THP page found by follow_page_mask() may have - * gotten split before reaching munlock_vma_page(), - * so we need to recompute the page_mask here. 
- */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); + if (PageTransHuge(page)) { + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches via + * pagevec. The pin from follow_page_mask() + * prevents them from collapsing by THP. + */ + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); + + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } } page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); start += page_increm * PAGE_SIZE; +next: cond_resched(); } } diff --git a/mm/mmap.c b/mm/mmap.c index f9c97d10b873..9d548512ff8a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long *populate) { struct mm_struct * mm = current->mm; - struct inode *inode; vm_flags_t vm_flags; *populate = 0; @@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EAGAIN; } - inode = file ? 
file_inode(file) : NULL; - if (file) { + struct inode *inode = file_inode(file); + switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) @@ -1302,6 +1301,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!file->f_op || !file->f_op->mmap) return -ENODEV; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; break; default: @@ -1310,6 +1311,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; /* * Ignore pgoff. */ @@ -1476,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - int correct_wcount = 0; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file_inode(file) : NULL; /* Check against address space limit. */ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { @@ -1544,16 +1545,11 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); - error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ - if (file) { - if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) - goto free_vma; if (vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) goto free_vma; - correct_wcount = 1; } vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); @@ -1570,11 +1566,8 @@ munmap_back: WARN_ON_ONCE(addr != vma->vm_start); addr = vma->vm_start; - pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { - if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) - goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; @@ -1596,11 +1589,10 @@ munmap_back: } vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if 
(vm_flags & VM_DENYWRITE) + allow_write_access(file); + file = vma->vm_file; out: perf_event_mmap(vma); @@ -1616,11 +1608,20 @@ out: if (file) uprobe_mmap(vma); + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + return addr; unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); vma->vm_file = NULL; fput(file); @@ -2380,7 +2381,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { - struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; @@ -2404,12 +2404,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); + err = vma_dup_policy(vma, new); + if (err) goto out_free_vma; - } - vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; @@ -2437,7 +2434,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, fput(new->vm_file); unlink_anon_vmas(new); out_free_mpol: - mpol_put(pol); + mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: @@ -2663,6 +2660,7 @@ out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; return addr; } @@ -2780,7 +2778,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; bool faulted_in_anon_vma = true; 
/* @@ -2825,10 +2822,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) + if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; @@ -2843,7 +2838,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_free_mempol: - mpol_put(pol); + mpol_put(vma_policy(new_vma)); out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); return NULL; @@ -2930,7 +2925,7 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3d..91b13d6a16d4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #include "internal.h" @@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) + if (!pmd) { + pud_free(mm, pud); return NULL; + } VM_BUG_ON(pmd_trans_huge(*pmd)); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f0c895c71fe..6c7b0187be8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,8 +36,11 @@ #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> +#include <linux/mm_inline.h> #include <trace/events/writeback.h> +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). 
*/ @@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void) if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); - /* Subtract min_free_kbytes */ - x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); - return x + 1; /* Ensure that we never return 0 */ } @@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) } /* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static inline long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + limit - setpoint + 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + +/* * Dirty position control. * * (o) global/bdi setpoints @@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * - * setpoint - dirty 3 - * f(dirty) := 1.0 + (----------------) - * limit - setpoint + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. 
For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * it's a 3rd order polynomial that subjects to + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). 
*/ - setpoint = (freerun + limit) / 2; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, for strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while given strictlimit BDI is below limit. + * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + return min(pos_ratio, bdi_pos_ratio); + } /* * We have computed basic pos_ratio above based on global situation. 
If @@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); @@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. 
Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + */ + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh); + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. 
It looks at the number of dirty pages in the machine and will force @@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; long period; long pause; long max_pause; @@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; + unsigned long uninitialized_var(bdi_thresh); + unsigned long thresh; + unsigned long uninitialized_var(bdi_dirty); + unsigned long dirty; + unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); + if (unlikely(strictlimit)) { + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); + + dirty = bdi_dirty; + thresh = bdi_thresh; + } else { + dirty = nr_dirty; + thresh = dirty_thresh; + bg_thresh = background_thresh; + } + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * when the bdi limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the bdi counters + * and limits. Small writeouts when the bdi limits are ramping + * up are the price we consciously pay for strictlimit-ing. 
*/ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); + dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. - */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. 
- */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } + if (!strictlimit) + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && - (nr_dirty > dirty_thresh); + ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b59dbda196..0ee638f76ebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -56,6 +56,7 @@ #include <linux/ftrace_event.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/mm_inline.h> #include <linux/migrate.h> #include <linux/page-debug-flags.h> #include <linux/hugetlb.h> @@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount -2. - * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount -2. Page's - * order is recorded in page_private(page) field. 
+ * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int to_free = count; spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (to_free) { @@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); @@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) return false; if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } @@ -750,19 +753,19 @@ static void __free_pages_ok(struct page *page, unsigned int order) void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; + struct page *p = page; unsigned int loop; - prefetchw(page); - for (loop = 0; loop < nr_pages; loop++) { - struct page *p = &page[loop]; - - if (loop + 1 < nr_pages) - prefetchw(p + 1); + prefetchw(p); + for (loop = 0; loop < (nr_pages - 1); loop++, p++) { + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } + __ClearPageReserved(p); + set_page_count(p, 0); - page_zone(page)->managed_pages += 1 << order; + page_zone(page)->managed_pages += nr_pages; set_page_refcounted(page); __free_pages(page, order); } @@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; 
- struct free_area * area; + struct free_area *area; struct page *page; /* Find a page of the appropriate size in the preferred list */ @@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page, } } +/* + * If breaking a large block of pages, move all free pages to the preferred + * allocation list. If falling back for a reclaimable kernel allocation, be + * more aggressive about taking ownership of free pages. + * + * On the other hand, never change migration type of MIGRATE_CMA pageblocks + * nor move CMA pages to different free lists. We don't want unmovable pages + * to be allocated from MIGRATE_CMA areas. + * + * Returns the new migratetype of the pageblock (or the same old migratetype + * if it was unchanged). + */ +static int try_to_steal_freepages(struct zone *zone, struct page *page, + int start_type, int fallback_type) +{ + int current_order = page_order(page); + + if (is_migrate_cma(fallback_type)) + return fallback_type; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { + change_pageblock_range(page, current_order, start_type); + return start_type; + } + + if (current_order >= pageblock_order / 2 || + start_type == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { + int pages; + + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + + set_pageblock_migratetype(page, start_type); + return start_type; + } + + } + + return fallback_type; +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { - struct free_area * area; + struct free_area *area; int current_order; struct page *page; - int migratetype, i; + int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for 
(current_order = MAX_ORDER-1; current_order >= order; @@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct page, lru); area->nr_free--; - /* - * If breaking a large block of pages, move all free - * pages to the preferred allocation list. If falling - * back for a reclaimable kernel allocation, be more - * aggressive about taking ownership of free pages - * - * On the other hand, never change migration - * type of MIGRATE_CMA pageblocks nor move CMA - * pages on different free lists. We don't - * want unmovable pages to be allocated from - * MIGRATE_CMA areas. - */ - if (!is_migrate_cma(migratetype) && - (current_order >= pageblock_order / 2 || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled)) { - int pages; - pages = move_freepages_block(zone, page, - start_migratetype); - - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, - start_migratetype); - - migratetype = start_migratetype; - } + new_type = try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order && - !is_migrate_cma(migratetype)) - change_pageblock_range(page, current_order, - start_migratetype); - + /* + * Borrow the excess buddy pages as well, irrespective + * of whether we stole freepages, or took ownership of + * the pageblock or not. + * + * Exception: When borrowing from MIGRATE_CMA, release + * the excess buddy pages to CMA itself. + */ expand(zone, page, order, current_order, area, is_migrate_cma(migratetype) ? 
migratetype : start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + trace_mm_page_alloc_extfrag(page, order, + current_order, start_migratetype, migratetype, + new_type == start_migratetype); return page; } @@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone) int order, t; struct list_head *curr; - if (!zone->spanned_pages) + if (zone_is_empty(zone)) return; spin_lock_irqsave(&zone->lock, flags); @@ -1526,6 +1553,7 @@ again: get_pageblock_migratetype(page)); } + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); @@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) { } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return true; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; @@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
*/ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { + unsigned long mark; + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) + goto try_this_zone; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. The zone a + * page was allocated in should have no effect on the + * time the page has in memory before being reclaimed. + * + * When zone_reclaim_mode is enabled, try to stay in + * local zones in the fastpath. If that fails, the + * slowpath is entered, which will do another pass + * starting with the local zones, but ultimately fall + * back to remote zones that do not partake in the + * fairness round-robin cycle of this zonelist. + */ + if (alloc_flags & ALLOC_WMARK_LOW) { + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; + if (zone_reclaim_mode && + !zone_local(preferred_zone, zone)) + continue; + } /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -1900,16 +1963,11 @@ zonelist_scan: (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) goto this_zone_full; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - unsigned long mark; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { int ret; - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) - goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* @@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static inline -void 
wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx, - enum zone_type classzone_idx) +static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order, classzone_idx); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + /* + * Only reset the batches of zones that were actually + * considered in the fast path, we don't want to + * thrash fairness information for zones that are not + * actually part of this zonelist's round-robin cycle. + */ + if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) + continue; + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + } } static inline int @@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + prepare_slowpath(gfp_mask, order, zonelist, + high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") + (!zone_reclaimable(zone) ? 
"yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned long nr[MAX_ORDER], flags, order, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) static int default_zonelist_order(void) { int nid, zone_type; - unsigned long low_kmem_size,total_size; + unsigned long low_kmem_size, total_size; struct zone *z; int average_size; /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. + * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. * This function detect ZONE_DMA/DMA32 size and configures zone order. @@ -3452,9 +3523,9 @@ static int default_zonelist_order(void) return ZONELIST_ORDER_NODE; /* * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. - */ + * If there is a node whose DMA/DMA32 memory is very big area on + * local memory, NODE_ORDER may be suitable. 
+ */ average_size = total_size / (nodes_weight(node_states[N_MEMORY]) + 1); for_each_online_node(nid) { @@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!zone->wait_table) return -ENOMEM; - for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) + for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); return 0; @@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int nid; /* * NOTE: The following SMP-unsafe globals are only used early in boot * when the kernel is running single-threaded. @@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (last_start_pfn <= pfn && pfn < last_end_pfn) return last_nid; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; - return nid; - } - /* This is a memory hole */ - return -1; + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != -1) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; + } + + return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ @@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { unsigned int order; @@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { } @@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, 
spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone_pcp_init(zone); + + /* For bootup, initialized properly in watermark setup */ + mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + lruvec_init(&zone->lruvec); if (!size) continue; @@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void) if (pages) node_set_state(nid, N_MEMORY); } - return totalpages; + return totalpages; } /* @@ -5047,7 +5120,7 @@ restart: /* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been - * satisified + * satisfied */ required_kernelcore -= min(required_kernelcore, size_pages); @@ -5061,7 +5134,7 @@ restart: * If there is still required_kernelcore, we do another pass with one * less node in the count. This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is - * satisified + * satisfied */ usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) @@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str) * 3) .rodata.* may be embedded into .text or .data sections. */ #define adj_init_size(start, end, size, pos, adj) \ - if (start <= pos && pos < end && size > adj) \ - size -= adj; + do { \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; \ + } while (0) adj_init_size(__init_begin, __init_end, init_data_size, _sinittext, init_code_size); @@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, * This is only okay since the processor is dead and cannot * race with what we are doing. 
*/ - refresh_cpu_vm_stats(cpu); + cpu_vm_stats_fold(cpu); } return NOTIFY_OK; } @@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) * we want it large (64MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * - * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: * min_free_kbytes = sqrt(lowmem_kbytes * 16) * * which yields @@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); @@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist - * can have before it gets flushed back to buddy allocator. + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. 
*/ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries += (1UL << (20 - PAGE_SHIFT)) - 1; - numentries >>= 20 - PAGE_SHIFT; - numentries <<= 20 - PAGE_SHIFT; + + /* It isn't necessary when PAGE_SIZE >= 1MB */ + if (PAGE_SHIFT < 20) + numentries = round_up(numentries, (1<<20)/PAGE_SIZE); /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) @@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages * - * PageLRU check wihtout isolation or lru_lock could race so that + * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. */ @@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, continue; page = pfn_to_page(check); + + /* + * Hugepages are not in LRU lists, but they're movable. + * We need not scan over tail pages bacause we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page)) { + iter = round_up(iter + 1, 1<<compound_order(page)) - 1; + continue; + } + /* * We can't use page_count without pin a page * because another CPU can free compound page. 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 0cee10ffb98d..d1473b2e9481 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -6,6 +6,7 @@ #include <linux/page-isolation.h> #include <linux/pageblock-flags.h> #include <linux/memory.h> +#include <linux/hugetlb.h> #include "internal.h" int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) @@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination. + */ + if (PageHuge(page)) { + nodemask_t src = nodemask_of_node(page_to_nid(page)); + nodemask_t dst; + nodes_complement(dst, src); + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node(page_to_nid(page), dst)); + } + if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e1a6e4fab016..3929a40bd6c0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,30 @@ #include <asm/tlb.h> #include <asm-generic/pgtable.h> +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. 
+ */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write diff --git a/mm/readahead.c b/mm/readahead.c index 829a77c62834..e4ed04149785 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping, size = count_history_pages(mapping, ra, offset, max); /* - * no history pages: + * not enough history pages: * it could be a random read */ - if (!size) + if (size <= req_size) return 0; /* @@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping, size *= 2; ra->start = offset; - ra->size = get_init_ra_size(size + req_size, max); - ra->async_size = ra->size; + ra->size = min(size + req_size, max); + ra->async_size = 1; return 1; } diff --git a/mm/shmem.c b/mm/shmem.c index 526149846d0a..8297623fcaed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1205,7 +1205,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (error) goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL); @@ -2819,6 +2819,10 @@ int __init shmem_init(void) { int error; + /* If rootfs called this, don't re-init */ + if (shmem_inode_cachep) + return 0; + error = bdi_init(&shmem_backing_dev_info); if (error) goto out4; diff --git a/mm/slub.c b/mm/slub.c index e3ba1f2cf60c..51df8272cfaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s, unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache 
*s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, unsigned long objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtoul(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) @@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; diff --git a/mm/sparse.c b/mm/sparse.c index 308d50331bc3..4ac1d7ef548f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, +static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long usemap_count, int nodeid) { void *usemap; unsigned long pnum; + unsigned long **usemap_map = (unsigned long **)data; int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), @@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, +static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count, int nodeid) { + struct page **map_map = (struct page **)data; sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); } @@ -460,6 +462,55 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } +/** + * alloc_usemap_and_memmap - memory alloction for pageblock 
flags and vmemmap + * @map: usemap_map for pageblock flags or mmap_map for vmemmap + */ +static void __init alloc_usemap_and_memmap(void (*alloc_func) + (void *, unsigned long, unsigned long, + unsigned long, int), void *data) +{ + unsigned long pnum; + unsigned long map_count; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + alloc_func(data, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +} + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. 
@@ -471,11 +522,7 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; - int nodeid_begin = 0; - unsigned long pnum_begin = 0; - unsigned long usemap_count; #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - unsigned long map_count; int size2; struct page **map_map; #endif @@ -501,82 +548,16 @@ void __init sparse_init(void) usemap_map = alloc_bootmem(size); if (!usemap_map) panic("can not allocate usemap_map\n"); - - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - usemap_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - int nodeid; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - usemap_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, - usemap_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - usemap_count = 1; - } - /* ok, last chunk */ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, - usemap_count, nodeid_begin); + alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, + (void *)usemap_map); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; map_map = alloc_bootmem(size2); if (!map_map) panic("can not allocate map_map\n"); - - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - int nodeid; - - if 
(!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - map_count = 1; - } - /* ok, last chunk */ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, - map_count, nodeid_begin); + alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, + (void *)map_map); #endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { diff --git a/mm/swap.c b/mm/swap.c index 62b78a6e224f..c899502d3e36 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> +#include <linux/hugetlb.h> #include "internal.h" @@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { + /* + * hugetlbfs pages cannot be split from under us. If this is a + * hugetlbfs page, check refcount on head page and release the page if + * the refcount becomes zero. + */ + if (PageHuge(page)) { + page = compound_head(page); + if (put_page_testzero(page)) + __put_compound_page(page); + + return; + } + if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ - unsigned long flags; bool got = false; - struct page *page_head = compound_trans_head(page); + struct page *page_head; - if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * If this is a hugetlbfs page it cannot be split under us. Simply + * increment refcount for the head page. 
+ */ + if (PageHuge(page)) { + page_head = compound_head(page); + atomic_inc(&page_head->_count); + got = true; + } else { + unsigned long flags; + + page_head = compound_trans_head(page); + if (likely(page != page_head && + get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ if (likely(PageTail(page))) { __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. 
- */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); } return got; } diff --git a/mm/swap_state.c b/mm/swap_state.c index f24ab0dff554..e6f15f8ca2af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cf2e60983b7..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) { - schedule(); - return 0; + info->flags = flag; } -#define SWAPFILE_CLUSTER 256 -#define LATENCY_LIMIT 256 +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; +} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void 
cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} + +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. 
+*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased. 
+ */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr decreases one usage. If the usage + * counter becomes 0, which means no page in the cluster is in using, we can + * optionally discard the cluster and add it to free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. 
+ */ + if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(p, idx); + return; + } + + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } else { + unsigned int tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } + } +} + +/* + * It's possible scan_swap_map() uses a free cluster in the middle of free + * cluster list. Avoiding such abuse to avoid list corruption. + */ +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + unsigned long offset) +{ + struct percpu_cluster *percpu_cluster; + bool conflict; + + offset /= SWAPFILE_CLUSTER; + conflict = !cluster_is_null(&si->free_cluster_head) && + offset != cluster_next(&si->free_cluster_head) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. 
+ */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; +} static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 
si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_PAGE_DISCARD) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide on - * (we scan without swap_lock to allow preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } + spin_unlock(&si->lock); /* @@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -317,62 +593,10 @@ checks: si->highest_bit = 0; } si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_PAGE_DISCARD, and there's a scan - * for a free cluster in progress or just completed. 
- */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). - */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&si->lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&si->lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. - */ - spin_unlock(&si->lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&si->lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. 
- */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: @@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; } @@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { + dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, else continue; } - count = si->swap_map[i]; + count = ACCESS_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } @@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap, { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned char *swap_map; + volatile unsigned char *swap_map; /* swap_map is accessed without + * locking. Mark it as volatile + * to prevent compiler doing + * something odd. + */ unsigned char swcount; struct page *page; swp_entry_t entry; @@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap, * reused since sys_swapoff() already disabled * allocation from here, or alloc_page() failed. 
*/ - if (!*swap_map) + swcount = *swap_map; + /* + * We don't hold lock here, so the swap entry could be + * SWAP_MAP_BAD (when the cluster is discarding). + * Instead of fail out, We can just skip the swap + * entry because swapoff will wait for discarding + * finish anyway. + */ + if (!swcount || swcount == SWAP_MAP_BAD) continue; retval = -ENOMEM; break; @@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) } static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i, prev; @@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map); + _enable_swap_info(p, prio, swap_map, cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + struct swap_cluster_info *cluster_info; unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; @@ -1651,6 +1892,8 @@ 
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } + flush_work(&p->discard_work); + destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + cluster_info = p->cluster_info; + p->cluster_info = NULL; p->flags = 0; frontswap_map = frontswap_map_get(p); frontswap_map_set(p, NULL); @@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); + vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, int i; unsigned long maxpages; unsigned long swapfilepages; + unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); + pr_err("Unable to find swap-space signature\n"); return 0; } @@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); return 0; } @@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p, */ maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; - if (maxpages > swap_header->info.last_page) { - maxpages = swap_header->info.last_page + 1; + last_page = swap_header->info.last_page; + if (last_page > maxpages) { + pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", + maxpages << 
(PAGE_SHIFT - 10), + last_page << (PAGE_SHIFT - 10)); + } + if (maxpages > last_page) { + maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; @@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); + pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) @@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p, static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { int i; unsigned int nr_good_pages; int nr_extents; + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; nr_good_pages = maxpages - 1; /* omit header page */ + cluster_set_null(&p->free_cluster_head); + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); + for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) @@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; + /* + * Haven't marked the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, page_nr); } } + /* Haven't marked the cluster free yet, no list operation involved */ + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(p, cluster_info, i); + if 
(nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; + /* + * Not mark the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); @@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = p->pages; } if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); + pr_warn("Empty swap-file\n"); return -EINVAL; } + if (!cluster_info) + return nr_extents; + + for (i = 0; i < nr_clusters; i++) { + if (!cluster_count(&cluster_info[idx])) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, + idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } + } + idx++; + if (idx == nr_clusters) + idx = 0; + } return nr_extents; } @@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(p)) return PTR_ERR(p); + INIT_WORK(&p->discard_work, swap_discard_work); + name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); @@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + /* + * select a random position to start with to help wear leveling + * SSD + */ + p->cluster_next = 1 + 
(prandom_u32() % p->highest_bit); + + cluster_info = vzalloc(DIV_ROUND_UP(maxpages, + SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap; + } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } + } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - maxpages, &span); + cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap; @@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (frontswap_enabled) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); - if (p->bdev) { - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); - } - - if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { - /* - * When discard is enabled for swap with no particular - * policy flagged, we set all swap discard flags here in - * order to sustain backward compatibility with older - * swapon(8) releases. - */ - p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | - SWP_PAGE_DISCARD); + if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); - /* - * By flagging sys_swapon, a sysadmin can tell us to - * either do single-time area discards only, or to just - * perform discards for released swap page-clusters. - * Now it's time to adjust the p->flags accordingly. 
- */ - if (swap_flags & SWAP_FLAG_DISCARD_ONCE) - p->flags &= ~SWP_PAGE_DISCARD; - else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) - p->flags &= ~SWP_AREA_DISCARD; - - /* issue a swapon-time discard if it's still required */ - if (p->flags & SWP_AREA_DISCARD) { - int err = discard_swap(p); - if (unlikely(err)) - printk(KERN_ERR - "swapon: discard_swap(%p): %d\n", - p, err); - } + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); } } @@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, frontswap_map); + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - printk(KERN_INFO "Adding %uk swap on %s. " + pr_info("Adding %uk swap on %s. 
" "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), @@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -2222,6 +2543,7 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); @@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto unlock_out; count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; @@ -2326,7 +2658,7 @@ out: return err; bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } diff --git a/mm/util.c b/mm/util.c index 7441c41d00f6..eaf63fc2c92f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page) struct address_space *mapping = page->mapping; VM_BUG_ON(PageSlab(page)); -#ifdef CONFIG_SWAP if (unlikely(PageSwapCache(page))) { swp_entry_t entry; entry.val = page_private(page); mapping = swap_address_space(entry); - } else -#endif - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 13a54953a273..107454312d5e 100644 --- a/mm/vmalloc.c +++ 
b/mm/vmalloc.c @@ -752,7 +752,6 @@ struct vmap_block_queue { struct vmap_block { spinlock_t lock; struct vmap_area *va; - struct vmap_block_queue *vbq; unsigned long free, dirty; DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; @@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) radix_tree_preload_end(); vbq = &get_cpu_var(vmap_block_queue); - vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); @@ -1018,15 +1016,16 @@ void vm_unmap_aliases(void) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + int i, j; spin_lock(&vb->lock); i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); - while (i < VMAP_BBMAP_BITS) { + if (i < VMAP_BBMAP_BITS) { unsigned long s, e; - int j; - j = find_next_zero_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); + + j = find_last_bit(vb->dirty_map, + VMAP_BBMAP_BITS); + j = j + 1; /* need exclusive index */ s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); @@ -1036,10 +1035,6 @@ void vm_unmap_aliases(void) start = s; if (e > end) end = e; - - i = j; - i = find_next_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); } spin_unlock(&vb->lock); } @@ -1263,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; + unsigned long end = addr + get_vm_area_size(area); int err; err = vmap_page_range(addr, end, prot, *pages); @@ -1558,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned int nr_pages, array_size, i; gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages; @@ -1990,7 
+1985,7 @@ long vread(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2000,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) @@ -2072,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2081,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..fe715daeb8bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc) } #endif +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -545,7 +564,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; + bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,14 +579,14 @@ redo: * unevictable page on 
[in]active list. * We know how to handle that. */ - lru = page_lru_base_type(page); + is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -587,7 +606,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -598,9 +617,9 @@ redo: */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ @@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. 
*/ - if (current_is_kswapd() && zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ if (IS_ENABLED(CONFIG_COMPACTION)) { /* @@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) return aborted_reclaim; } -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - /* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) @@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (!zone->all_unreclaimable) + if (zone_reclaimable(zone)) return false; } @@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! 
*/ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone, unsigned long lru_pages, unsigned long *nr_attempted) { - unsigned long nr_slab; int testorder = sc->order; unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone, shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - zone_clear_flag(zone, ZONE_WRITEBACK); /* @@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. 
*/ - if (!zone->all_unreclaimable && + if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { zone_clear_flag(zone, ZONE_CONGESTED); zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); @@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -3237,7 +3247,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); @@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void) return nr; } -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c2ef4458fa..9bb314577911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -19,6 +19,9 @@ 
#include <linux/math64.h> #include <linux/writeback.h> #include <linux/compaction.h> +#include <linux/mm_inline.h> + +#include "internal.h" #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -414,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif +static inline void fold_diff(int *diff) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (diff[i]) + atomic_long_add(diff[i], &vm_stat[i]); +} + /* - * Update the zone counters for one cpu. - * - * The cpu specified must be either the current cpu or a processor that - * is not online. If it is the current cpu then the execution thread must - * be pinned to the current cpu. + * Update the zone counters for the current cpu. * * Note that refresh_cpu_vm_stats strives to only access * node local memory. The per cpu pagesets on remote zones are placed @@ -432,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state); * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. 
*/ -void refresh_cpu_vm_stats(int cpu) +static void refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_pageset __percpu *p = zone->pageset; - p = per_cpu_ptr(zone->pageset, cpu); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + int v; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { - unsigned long flags; - int v; + v = this_cpu_xchg(p->vm_stat_diff[i], 0); + if (v) { - local_irq_save(flags); - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; - local_irq_restore(flags); atomic_long_add(v, &zone->vm_stat[i]); global_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - p->expire = 3; + __this_cpu_write(p->expire, 3); #endif } + } cond_resched(); #ifdef CONFIG_NUMA /* @@ -468,29 +472,57 @@ void refresh_cpu_vm_stats(int cpu) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!p->expire || !p->pcp.count) + if (!__this_cpu_read(p->expire) || + !__this_cpu_read(p->pcp.count)) continue; /* * We never drain zones local to this processor. */ if (zone_to_nid(zone) == numa_node_id()) { - p->expire = 0; + __this_cpu_write(p->expire, 0); continue; } - p->expire--; - if (p->expire) + + if (__this_cpu_dec_return(p->expire)) continue; - if (p->pcp.count) - drain_zone_pages(zone, &p->pcp); + if (__this_cpu_read(p->pcp.count)) + drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); #endif } + fold_diff(global_diff); +} - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) - atomic_long_add(global_diff[i], &vm_stat[i]); +/* + * Fold the data for an offline cpu into the global array. + * There cannot be any access by the offline cpu and therefore + * synchronization is simplified. 
+ */ +void cpu_vm_stats_fold(int cpu) +{ + struct zone *zone; + int i; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p; + + p = per_cpu_ptr(zone->pageset, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) { + int v; + + v = p->vm_stat_diff[i]; + p->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; + } + } + + fold_diff(global_diff); } /* @@ -703,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", + "nr_alloc_batch", "nr_inactive_anon", "nr_active_anon", "nr_inactive_file", @@ -817,6 +850,12 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_SMP + "nr_tlb_remote_flush", + "nr_tlb_remote_flush_received", +#endif + "nr_tlb_local_flush_all", + "nr_tlb_local_flush_one", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; @@ -1052,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n all_unreclaimable: %u" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone->all_unreclaimable, + !zone_reclaimable(zone), zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); @@ -1177,7 +1216,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(smp_processor_id()); + refresh_cpu_vm_stats(); schedule_delayed_work(&__get_cpu_var(vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } diff --git a/mm/zbud.c b/mm/zbud.c index ad1e781284fd..9451361e6aa7 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -16,7 +16,7 @@ * * zbud works by storing compressed pages, or "zpages", together in pairs in a * single memory page called a "zbud page". 
The first buddy is "left - * justifed" at the beginning of the zbud page, and the last buddy is "right + * justified" at the beginning of the zbud page, and the last buddy is "right * justified" at the end of the zbud page. The benefit is that if either * buddy is freed, the freed buddy space, coalesced with whatever slack space * that existed between the buddies, results in the largest possible free region @@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used * as zbud pool pages. * - * Return: 0 if success and handle is set, otherwise -EINVAL is the size or + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ diff --git a/mm/zswap.c b/mm/zswap.c index deda2b671e12..841e35f1db22 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; @@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) static void zswap_frontswap_invalidate_area(unsigned type) { struct zswap_tree *tree = zswap_trees[type]; - struct rb_node *node; - struct zswap_entry *entry; + struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); - /* - * TODO: Even though this code should not be executed because - * the try_to_unuse() in swapoff should have emptied the tree, - * it is very wasteful to rebalance the tree after every - * removal when we are freeing the whole tree. - * - * If post-order traversal code is ever added to the rbtree - * implementation, it should be used here. 
- */ - while ((node = rb_first(&tree->rbroot))) { - entry = rb_entry(node, struct zswap_entry, rbnode); - rb_erase(&entry->rbnode, &tree->rbroot); + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { zbud_free(tree->pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); diff --git a/net/socket.c b/net/socket.c index b2d7c629eeb9..0ceaa5cb9ead 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3072,12 +3072,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, uifmap32 = &uifr32->ifr_ifru.ifru_map; err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __get_user(ifr.ifr_map.port, &uifmap32->port); + err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= get_user(ifr.ifr_map.port, &uifmap32->port); if (err) return -EFAULT; @@ -3088,12 +3088,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __put_user(ifr.ifr_map.port, &uifmap32->port); + err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= put_user(ifr.ifr_map.mem_end, 
&uifmap32->mem_end); + err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= put_user(ifr.ifr_map.port, &uifmap32->port); if (err) err = -EFAULT; } @@ -3167,25 +3167,25 @@ static int routing_ioctl(struct net *net, struct socket *sock, struct in6_rtmsg32 __user *ur6 = argp; ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3 * sizeof(struct in6_addr)); - ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); + ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); + ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); + ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); + ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); + ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); + ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); + ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); r = (void *) &r6; } else { /* ipv4 */ struct rtentry32 __user *ur4 = argp; ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3 * sizeof(struct sockaddr)); - ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= __get_user(r4.rt_window, &(ur4->rt_window)); - ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= __get_user(rtdev, &(ur4->rt_dev)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, 
&(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); if (rtdev) { ret |= copy_from_user(devname, compat_ptr(rtdev), 15); r4.rt_dev = (char __user __force *)devname; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2ee9eb750560..47016c304c84 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -31,12 +31,16 @@ my $show_types = 0; my $fix = 0; my $root; my %debug; -my %ignore_type = (); my %camelcase = (); +my %use_type = (); +my @use = (); +my %ignore_type = (); my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; my $max_line_length = 80; +my $ignore_perl_version = 0; +my $minimum_perl_version = 5.10.0; sub help { my ($exitcode) = @_; @@ -54,6 +58,7 @@ Options: --terse one line per report -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests + --types TYPE(,TYPE2...) show only these comma separated message types --ignore TYPE(,TYPE2...) ignore various comma separated message types --max-line-length=n set the maximum line length, if exceeded, warn --show-types show the message "types" in the output @@ -71,6 +76,8 @@ Options: "<inputfile>.EXPERIMENTAL-checkpatch-fixes" with potential errors corrected to the preferred checkpatch style + --ignore-perl-version override checking of perl version. expect + runtime errors. -h, --help, --version display this help and exit When FILE is - read standard input. @@ -116,6 +123,7 @@ GetOptions( 'subjective!' => \$check, 'strict!' => \$check, 'ignore=s' => \@ignore, + 'types=s' => \@use, 'show-types!' => \$show_types, 'max-line-length=i' => \$max_line_length, 'root=s' => \$root, @@ -123,6 +131,7 @@ GetOptions( 'mailback!' => \$mailback, 'summary-file!' => \$summary_file, 'fix!' => \$fix, + 'ignore-perl-version!' 
=> \$ignore_perl_version, 'debug=s' => \%debug, 'test-only=s' => \$tst_only, 'h|help' => \$help, @@ -133,24 +142,50 @@ help(0) if ($help); my $exit = 0; +if ($^V && $^V lt $minimum_perl_version) { + printf "$P: requires at least perl version %vd\n", $minimum_perl_version; + if (!$ignore_perl_version) { + exit(1); + } +} + if ($#ARGV < 0) { print "$P: no input files\n"; exit(1); } -@ignore = split(/,/, join(',',@ignore)); -foreach my $word (@ignore) { - $word =~ s/\s*\n?$//g; - $word =~ s/^\s*//g; - $word =~ s/\s+/ /g; - $word =~ tr/[a-z]/[A-Z]/; +sub hash_save_array_words { + my ($hashRef, $arrayRef) = @_; + + my @array = split(/,/, join(',', @$arrayRef)); + foreach my $word (@array) { + $word =~ s/\s*\n?$//g; + $word =~ s/^\s*//g; + $word =~ s/\s+/ /g; + $word =~ tr/[a-z]/[A-Z]/; + + next if ($word =~ m/^\s*#/); + next if ($word =~ m/^\s*$/); - next if ($word =~ m/^\s*#/); - next if ($word =~ m/^\s*$/); + $hashRef->{$word}++; + } +} - $ignore_type{$word}++; +sub hash_show_words { + my ($hashRef, $prefix) = @_; + + if ($quiet == 0 && keys %$hashRef) { + print "NOTE: $prefix message types:"; + foreach my $word (sort keys %$hashRef) { + print " $word"; + } + print "\n\n"; + } } +hash_save_array_words(\%ignore_type, \@ignore); +hash_save_array_words(\%use_type, \@use); + my $dbg_values = 0; my $dbg_possible = 0; my $dbg_type = 0; @@ -207,6 +242,8 @@ our $Sparse = qr{ __rcu }x; +our $InitAttribute = qr{__(?:mem|cpu|dev|net_|)(?:initdata|initconst|init\b)}; + # Notes to $Attribute: # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ @@ -227,7 +264,7 @@ our $Attribute = qr{ __deprecated| __read_mostly| __kprobes| - __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| + $InitAttribute| ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| @@ -257,6 +294,7 @@ our $Operators = qr{ }x; our $NonptrType; +our $NonptrTypeWithAttr; our $Type; our $Declare; @@ -319,6 +357,12 @@ our @typeList 
= ( qr{${Ident}_handler}, qr{${Ident}_handler_fn}, ); +our @typeListWithAttr = ( + @typeList, + qr{struct\s+$InitAttribute\s+$Ident}, + qr{union\s+$InitAttribute\s+$Ident}, +); + our @modifierList = ( qr{fastcall}, ); @@ -332,6 +376,7 @@ our $allowed_asm_includes = qr{(?x: sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; + my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ (?:$Modifier\s+|const\s+)* @@ -342,6 +387,15 @@ sub build_types { ) (?:\s+$Modifier|\s+const)* }x; + $NonptrTypeWithAttr = qr{ + (?:$Modifier\s+|const\s+)* + (?: + (?:typeof|__typeof__)\s*\([^\)]*\)| + (?:$typeTypedefs\b)| + (?:${allWithAttr}\b) + ) + (?:\s+$Modifier|\s+const)* + }x; $Type = qr{ $NonptrType (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? @@ -1355,7 +1409,9 @@ sub possible { my $prefix = ''; sub show_type { - return !defined $ignore_type{$_[0]}; + return defined $use_type{$_[0]} if (scalar keys %use_type > 0); + + return !defined $ignore_type{$_[0]}; } sub report { @@ -1435,7 +1491,23 @@ sub check_absolute_file { sub trim { my ($string) = @_; - $string =~ s/(^\s+|\s+$)//g; + $string =~ s/^\s+|\s+$//g; + + return $string; +} + +sub ltrim { + my ($string) = @_; + + $string =~ s/^\s+//; + + return $string; +} + +sub rtrim { + my ($string) = @_; + + $string =~ s/\s+$//; return $string; } @@ -1532,6 +1604,7 @@ sub process { my %suppress_export; my $suppress_statement = 0; + my %signatures = (); # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. 
@@ -1624,6 +1697,8 @@ sub process { $linenr = 0; foreach my $line (@lines) { $linenr++; + my $sline = $line; #copy of $line + $sline =~ s/$;/ /g; #with comments as spaces my $rawline = $rawlines[$linenr - 1]; @@ -1781,6 +1856,17 @@ sub process { "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); } } + +# Check for duplicate signatures + my $sig_nospace = $line; + $sig_nospace =~ s/\s//g; + $sig_nospace = lc($sig_nospace); + if (defined $signatures{$sig_nospace}) { + WARN("BAD_SIGN_OFF", + "Duplicate signature\n" . $herecurr); + } else { + $signatures{$sig_nospace} = 1; + } } # Check for wrappage within a valid hunk of the file @@ -1845,15 +1931,17 @@ sub process { #trailing whitespace if ($line =~ /^\+.*\015/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; - ERROR("DOS_LINE_ENDINGS", - "DOS line endings\n" . $herevet); - + if (ERROR("DOS_LINE_ENDINGS", + "DOS line endings\n" . $herevet) && + $fix) { + $fixed[$linenr - 1] =~ s/[\s\015]+$//; + } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*?)\s+$/$1/; + $fixed[$linenr - 1] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2060,6 +2148,7 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /* $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */ + $rawline =~ /^\+/ && #line is new $rawline !~ /^\+[ \t]*\*/) { #no leading * WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments start with * on subsequent lines\n" . 
$hereprev); @@ -2126,7 +2215,7 @@ sub process { $realline_next); #print "LINE<$line>\n"; if ($linenr >= $suppress_statement && - $realcnt && $line =~ /.\s*\S/) { + $realcnt && $sline =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; @@ -2486,16 +2575,22 @@ sub process { } # check for global initialisers. - if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { - ERROR("GLOBAL_INITIALISERS", - "do not initialise globals to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) { + if (ERROR("GLOBAL_INITIALISERS", + "do not initialise globals to 0 or NULL\n" . + $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static initialisers. - if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { - ERROR("INITIALISED_STATIC", - "do not initialise statics to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) { + if (ERROR("INITIALISED_STATIC", + "do not initialise statics to 0 or NULL\n" . + $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static const char * arrays. @@ -2638,8 +2733,12 @@ sub process { } if ($line =~ /\bpr_warning\s*\(/) { - WARN("PREFER_PR_LEVEL", - "Prefer pr_warn(... to pr_warning(...\n" . $herecurr); + if (WARN("PREFER_PR_LEVEL", + "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/\bpr_warning\b/pr_warn/; + } } if ($line =~ /\bdev_printk\s*\(\s*KERN_([A-Z]+)/) { @@ -2759,6 +2858,7 @@ sub process { $off = 0; my $blank = copy_spacing($opline); + my $last_after = -1; for (my $n = 0; $n < $#elements; $n += 2) { @@ -2824,7 +2924,7 @@ sub process { $cc !~ /^\\/ && $cc !~ /^;/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . 
$hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2839,11 +2939,11 @@ sub process { if ($ctx =~ /Wx.|.xW/) { if (ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2852,8 +2952,9 @@ sub process { if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; + $last_after = $n; } } @@ -2870,8 +2971,10 @@ sub process { if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { if (ERROR("SPACING", "space required before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]); - $line_fixed = 1; + if ($n != $last_after + 2) { + $good = $fix_elements[$n] . " " . ltrim($fix_elements[$n + 1]); + $line_fixed = 1; + } } } if ($op eq '*' && $cc =~/\s*$Modifier\b/) { @@ -2880,12 +2983,11 @@ sub process { } elsif ($ctx =~ /.xW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . rtrim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2894,8 +2996,7 @@ sub process { if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { if (ERROR("SPACING", "space required one side of that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . 
trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2903,20 +3004,18 @@ sub process { ($ctx =~ /Wx./ && $cc =~ /^;/)) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } if ($ctx =~ /ExW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2930,8 +3029,10 @@ sub process { if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { if (ERROR("SPACING", "need consistent spacing around '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } @@ -2942,7 +3043,7 @@ sub process { if ($ctx =~ /Wx./) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } @@ -2969,8 +3070,10 @@ sub process { if ($ok == 0) { if (ERROR("SPACING", "spaces required around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; - $good = $fix_elements[$n] . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . 
" "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } @@ -3031,8 +3134,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ - s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3047,8 +3149,12 @@ sub process { # closing brace should have a space following it when it has anything # on the line if ($line =~ /}(?!(?:,|;|\)))\S/) { - ERROR("SPACING", - "space required after that close brace '}'\n" . $herecurr); + if (ERROR("SPACING", + "space required after that close brace '}'\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/}((?!(?:,|;|\)))\S)/} $1/; + } } # check spacing on square brackets @@ -3271,8 +3377,13 @@ sub process { #gcc binary extension if ($var =~ /^$Binary$/) { - WARN("GCC_BINARY_CONSTANT", - "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); + if (WARN("GCC_BINARY_CONSTANT", + "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && + $fix) { + my $hexval = sprintf("0x%x", oct($var)); + $fixed[$linenr - 1] =~ + s/\b$var\b/$hexval/; + } } #CamelCase @@ -3282,19 +3393,26 @@ sub process { $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && #Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { - seed_camelcase_includes() if ($check); - if (!defined $camelcase{$var}) { - $camelcase{$var} = 1; - CHK("CAMELCASE", - "Avoid CamelCase: <$var>\n" . $herecurr); + while ($var =~ m{($Ident)}g) { + my $word = $1; + next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); + seed_camelcase_includes() if ($check); + if (!defined $camelcase{$word}) { + $camelcase{$word} = 1; + CHK("CAMELCASE", + "Avoid CamelCase: <$word>\n" . 
$herecurr); + } } } } #no spaces allowed after \ in define - if ($line=~/\#\s*define.*\\\s$/) { - WARN("WHITESPACE_AFTER_LINE_CONTINUATION", - "Whitepspace after \\ makes next lines useless\n" . $herecurr); + if ($line =~ /\#\s*define.*\\\s+$/) { + if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", + "Whitespace after \\ makes next lines useless\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\s+$//; + } } #warn if <asm/foo.h> is #included and <linux/foo.h> is available (uses RAW line) @@ -3374,7 +3492,8 @@ sub process { $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... - $dstat !~ /^\({/) # ({... + $dstat !~ /^\({/ && # ({... + $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/) { $ctx =~ s/\n*$//; my $herectx = $here . "\n"; @@ -3606,6 +3725,32 @@ sub process { } } +sub string_find_replace { + my ($string, $find, $replace) = @_; + + $string =~ s/$find/$replace/g; + + return $string; +} + +# check for bad placement of section $InitAttribute (e.g.: __initdata) + if ($line =~ /(\b$InitAttribute\b)/) { + my $attr = $1; + if ($line =~ /^\+\s*static\s+(?:const\s+)?(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*[=;]/) { + my $ptr = $1; + my $var = $2; + if ((($ptr =~ /\b(union|struct)\s+$attr\b/ && + ERROR("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr)) || + ($ptr !~ /\b(union|struct)\s+$attr\b/ && + WARN("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr))) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? 
";" : " = ")/e; + } + } + } + # prefer usleep_range over udelay if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { # ignore udelay's < 10, however @@ -3691,8 +3836,12 @@ sub process { # Check for __inline__ and __inline, prefer inline if ($line =~ /\b(__inline__|__inline)\b/) { - WARN("INLINE", - "plain inline is preferred over $1\n" . $herecurr); + if (WARN("INLINE", + "plain inline is preferred over $1\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + + } } # Check for __attribute__ packed, prefer __packed @@ -3709,14 +3858,21 @@ sub process { # Check for __attribute__ format(printf, prefer __printf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { - WARN("PREFER_PRINTF", - "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_PRINTF", + "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; + + } } # Check for __attribute__ format(scanf, prefer __scanf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\b/) { - WARN("PREFER_SCANF", - "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_SCANF", + "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . 
")"/ex; + } } # check for sizeof(&) @@ -3727,8 +3883,11 @@ sub process { # check for sizeof without parenthesis if ($line =~ /\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/) { - WARN("SIZEOF_PARENTHESIS", - "sizeof $1 should be sizeof($1)\n" . $herecurr); + if (WARN("SIZEOF_PARENTHESIS", + "sizeof $1 should be sizeof($1)\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + } } # check for line continuations in quoted strings with odd counts of " @@ -3747,8 +3906,11 @@ sub process { if ($line =~ /\bseq_printf\s*\(/) { my $fmt = get_quoted_string($line, $rawline); if ($fmt !~ /[^\\]\%/) { - WARN("PREFER_SEQ_PUTS", - "Prefer seq_puts to seq_printf\n" . $herecurr); + if (WARN("PREFER_SEQ_PUTS", + "Prefer seq_puts to seq_printf\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + } } } @@ -3810,6 +3972,16 @@ sub process { } } +# check for new externs in .h files. + if ($realfile =~ /\.h$/ && + $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { + if (WARN("AVOID_EXTERNS", + "extern prototypes should be avoided in .h files\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + } + } + # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) @@ -3879,8 +4051,11 @@ sub process { # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { - WARN("ONE_SEMICOLON", - "Statements terminations use 1 semicolon\n" . $herecurr); + if (WARN("ONE_SEMICOLON", + "Statements terminations use 1 semicolon\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + } } # check for switch/default statements without a break; @@ -3898,9 +4073,12 @@ sub process { } # check for gcc specific __FUNCTION__ - if ($line =~ /__FUNCTION__/) { - WARN("USE_FUNC", - "__func__ should be used instead of gcc specific __FUNCTION__\n" . 
$herecurr); + if ($line =~ /\b__FUNCTION__\b/) { + if (WARN("USE_FUNC", + "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g; + } } # check for use of yield() @@ -4105,13 +4283,8 @@ sub process { } } - if ($quiet == 0 && keys %ignore_type) { - print "NOTE: Ignored message types:"; - foreach my $ignore (sort keys %ignore_type) { - print " $ignore"; - } - print "\n\n"; - } + hash_show_words(\%use_type, "Used"); + hash_show_words(\%ignore_type, "Ignored"); if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes"; |