diff options
Diffstat (limited to 'arch')
137 files changed, 3040 insertions, 1424 deletions
diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c index 4d8425de6922..e96a3dcdc1a7 100644 --- a/arch/arm/mach-davinci/time.c +++ b/arch/arm/mach-davinci/time.c @@ -285,6 +285,8 @@ static void davinci_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: t->opts = TIMER_OPTS_DISABLED; break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c index 010f6fa984a6..d86d124aea22 100644 --- a/arch/arm/mach-imx/time.c +++ b/arch/arm/mach-imx/time.c @@ -159,6 +159,7 @@ static void imx_set_mode(enum clock_event_mode mode, struct clock_event_device * break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_RESUME: /* Left event sources disabled, no more interrupts appears */ break; } diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 8112f726ffa0..23e7fba6d3e1 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -459,6 +459,8 @@ static void ixp4xx_set_mode(enum clock_event_mode mode, default: osrt = opts = 0; break; + case CLOCK_EVT_MODE_RESUME: + break; } *IXP4XX_OSRT1 = osrt | opts; diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c index 3705d20c4e5c..237651ebae5d 100644 --- a/arch/arm/mach-omap1/time.c +++ b/arch/arm/mach-omap1/time.c @@ -156,6 +156,7 @@ static void omap_mpu_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } diff --git a/arch/arm/plat-omap/timer32k.c b/arch/arm/plat-omap/timer32k.c index 2feceec8eccd..b0af014b0e2c 100644 --- a/arch/arm/plat-omap/timer32k.c +++ b/arch/arm/plat-omap/timer32k.c @@ -156,6 +156,8 @@ static void omap_32k_timer_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: omap_32k_timer_stop(); break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 7a11b905ef49..abb582bc218f 100644 --- 
a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y @@ -544,6 +548,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" depends on !M386 && !M486 + select X86_PAE help Select this if you have a 32-bit processor and more than 4 gigabytes of physical RAM. @@ -573,12 +578,12 @@ choice config VMSPLIT_3G bool "3G/1G user/kernel split" config VMSPLIT_3G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" config VMSPLIT_2G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" @@ -598,10 +603,15 @@ config HIGHMEM default y config X86_PAE - bool - depends on HIGHMEM64G - default y + bool "PAE (Physical Address Extension) Support" + default n + depends on !HIGHMEM4G select RESOURCES_64BIT + help + PAE is required for NX support, and furthermore enables + larger swapspace support for non-overcommit purposes. It + has the cost of more pagetable lookup overhead, and also + consumes more pagetable space per process. # Common NUMA Features config NUMA @@ -817,6 +827,7 @@ config CRASH_DUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + default "0x1000000" if X86_NUMAQ default "0x100000" help This gives the physical address where the kernel is loaded. 
diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 0ac62cdcd3b7..54ee1764fdae 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:44 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -37,19 +37,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -73,16 +72,13 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -90,14 +86,11 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_LSF is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -201,6 +194,7 @@ CONFIG_X86_CPUID=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -217,7 +211,9 @@ CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 
+CONFIG_BOUNCE=y CONFIG_NR_QUICK=1 +CONFIG_VIRT_TO_BUS=y # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -244,7 +240,6 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set # # ACPI (Advanced Configuration and Power Interface) Support @@ -284,7 +279,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -325,7 +320,7 @@ CONFIG_PCI_MMCONFIG=y CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set -CONFIG_HT_IRQ=y +# CONFIG_HT_IRQ is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -381,7 +376,7 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_INET_XFRM_MODE_BEET=y +# CONFIG_INET_XFRM_MODE_BEET is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -400,27 +395,15 @@ CONFIG_IPV6=y # CONFIG_INET6_TUNNEL is not set CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y -CONFIG_INET6_XFRM_MODE_BEET=y +# CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -457,6 +440,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ 
-471,21 +455,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -493,10 +465,7 @@ CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -514,17 +483,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -596,6 +562,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -606,8 +573,9 @@ CONFIG_SCSI_NETLINK=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -667,6 +635,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_QLA_ISCSI is not set @@ -675,14 +644,73 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set -# 
CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set -# CONFIG_ATA is not set - -# -# Multi-device support (RAID and LVM) -# -# CONFIG_MD is not set +CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_ACPI=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_DEBUG is not set +# CONFIG_DM_CRYPT 
is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set +# CONFIG_DM_DELAY is not set # # Fusion MPT device support @@ -723,42 +751,27 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set - -# -# Tulip family network device support -# +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -809,7 +822,6 @@ CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -823,10 +835,6 @@ CONFIG_NETDEV_10000=y # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -855,15 +863,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -871,6 +871,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -936,6 +937,7 
@@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -951,10 +953,6 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y @@ -988,11 +986,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y - -# -# TPM devices -# +# CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y @@ -1003,11 +997,8 @@ CONFIG_DEVPORT=y # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set # @@ -1041,7 +1032,7 @@ CONFIG_DAB=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 -# CONFIG_VIDEO_SELECT is not set +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # @@ -1058,15 +1049,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1077,10 +1064,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1094,6 +1078,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1103,7 +1088,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # 
CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -1111,6 +1095,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1201,15 +1186,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is not set # @@ -1229,11 +1206,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # File systems @@ -1271,6 +1250,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1298,7 +1278,7 @@ CONFIG_PROC_KCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y -# CONFIG_TMPFS_POSIX_ACL is not set +CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y @@ -1348,7 +1328,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1404,10 +1383,7 @@ CONFIG_NLS_UTF8=y # Distributed Lock Manager # # CONFIG_DLM is not set - -# -# Instrumentation Support -# +CONFIG_INSTRUMENTATION=y CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_KPROBES=y @@ -1417,7 +1393,7 @@ CONFIG_KPROBES=y # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set -CONFIG_ENABLE_MUST_CHECK=y +# CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set @@ -1425,15 +1401,17 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# 
CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set +CONFIG_TIMER_STATS=y +# CONFIG_SLUB_DEBUG_ON is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1443,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1462,10 +1439,6 @@ CONFIG_DOUBLEFAULT=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1476,6 +1449,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 06da59f6f837..dbe5e87e0d66 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode.o obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index a574cd2c8b61..cacdd883bf2b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -618,6 +618,8 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) #ifdef CONFIG_HPET_TIMER #include <asm/hpet.h> +static struct __initdata resource *hpet_res; + static 
int __init acpi_parse_hpet(struct acpi_table_header *table) { struct acpi_table_hpet *hpet_tbl; @@ -638,8 +640,42 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + if (!hpet_res) + return 0; + + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + return 0; } + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. + */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + #else #define acpi_parse_hpet NULL #endif @@ -950,14 +986,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "DELL GX240", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - }, - }, - { - .callback = force_acpi_ht, .ident = "HP VISUALIZE NT Workstation", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index d8cda14fff8b..c3750c2c4113 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -2,12 +2,17 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/kprobes.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/sections.h> 
+#include <asm/pgtable.h> +#include <asm/mce.h> +#include <asm/nmi.h> -static int noreplace_smp = 0; -static int smp_alt_once = 0; -static int debug_alternative = 0; +#ifdef CONFIG_HOTPLUG_CPU +static int smp_alt_once; static int __init bootonly(char *str) { @@ -15,6 +20,11 @@ static int __init bootonly(char *str) return 1; } __setup("smp-alt-boot", bootonly); +#else +#define smp_alt_once 1 +#endif + +static int debug_alternative; static int __init debug_alt(char *str) { @@ -23,6 +33,8 @@ static int __init debug_alt(char *str) } __setup("debug-alternative", debug_alt); +static int noreplace_smp; + static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; @@ -144,7 +156,7 @@ static void nop_out(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + text_poke(insns, noptable[noplen], noplen); insns += noplen; len -= noplen; } @@ -196,7 +208,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - **ptr = 0xf0; /* lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ }; } @@ -354,10 +366,6 @@ void apply_paravirt(struct paravirt_patch_site *start, /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - - /* Sync to be conservative, in case we patched following - * instructions */ - sync_core(); } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; @@ -367,6 +375,14 @@ void __init alternative_instructions(void) { unsigned long flags; + /* The patching is not fully atomic, so try to avoid local interruptions + that might execute the to be patched code. + Other CPUs are not running. 
*/ + stop_nmi(); +#ifdef CONFIG_MCE + stop_mce(); +#endif + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -376,8 +392,6 @@ void __init alternative_instructions(void) #ifdef CONFIG_HOTPLUG_CPU if (num_possible_cpus() < 2) smp_alt_once = 1; -#else - smp_alt_once = 1; #endif #ifdef CONFIG_SMP @@ -401,4 +415,37 @@ void __init alternative_instructions(void) #endif apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); + + restart_nmi(); +#ifdef CONFIG_MCE + restart_mce(); +#endif +} + +/* + * Warning: + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these instructions. + * And on the local CPU you need to be protected again NMI or MCE handlers + * seeing an inconsistent instruction while you patch. + */ +void __kprobes text_poke(void *oaddr, unsigned char *opcode, int len) +{ + u8 *addr = oaddr; + if (!pte_write(*lookup_address((unsigned long)addr))) { + struct page *p[2] = { virt_to_page(addr), virt_to_page(addr+PAGE_SIZE) }; + addr = vmap(p, 2, VM_MAP, PAGE_KERNEL); + if (!addr) + return; + addr += ((unsigned long)oaddr) % PAGE_SIZE; + } + memcpy(addr, opcode, len); + sync_core(); + /* Not strictly needed, but can speed CPU recovery up. Ignore cross cacheline + case. 
*/ + if (cpu_has_clflush) + asm("clflush (%0) " :: "r" (oaddr) : "memory"); + if (addr != oaddr) + vunmap(addr); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 67824f3bb974..bfc6cb7df7e7 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); @@ -315,7 +318,7 @@ static void __devinit setup_APIC_timer(void) #define LAPIC_CAL_LOOPS (HZ/10) -static __initdata volatile int lapic_cal_loops = -1; +static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; @@ -485,7 +488,7 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); local_irq_disable(); @@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 0b6a8551e9e2..778396c78d65 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -9,7 +9,6 @@ obj-y += cyrix.o obj-y += centaur.o obj-y += transmeta.o obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o -obj-y += rise.o obj-y += nexgen.o obj-y += umc.o diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 6f47eeeb93ea..c7ba455d5ac7 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -231,6 +231,9 @@ 
static void __cpuinit init_amd(struct cpuinfo_x86 *c) switch (c->x86) { case 15: + /* Use K8 tuning for Fam10h and Fam11h */ + case 0x10: + case 0x11: set_bit(X86_FEATURE_K8, c->x86_capability); break; case 6: @@ -272,8 +275,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif - if (cpuid_eax(0x80000000) >= 0x80000006) - num_cache_leaves = 3; + if (cpuid_eax(0x80000000) >= 0x80000006) { + if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e5419a9dec88..d506201d397c 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -606,7 +606,6 @@ extern int nsc_init_cpu(void); extern int amd_init_cpu(void); extern int centaur_init_cpu(void); extern int transmeta_init_cpu(void); -extern int rise_init_cpu(void); extern int nexgen_init_cpu(void); extern int umc_init_cpu(void); @@ -618,7 +617,6 @@ void __init early_cpu_init(void) amd_init_cpu(); centaur_init_cpu(); transmeta_init_cpu(); - rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 18c8b67ea3a7..6f846bee2103 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -665,8 +665,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ for (i=0; i<perf->state_count; i++) { - if (i>0 && perf->states[i].core_frequency == - perf->states[i-1].core_frequency) + if (i>0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = i; diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c 
b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 194144539a6f..461dabc4e495 100644 --- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -79,7 +79,7 @@ #include <linux/smp.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/errno.h> /* PCI config registers, all at F0 */ diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index e88d2fba156b..122d2d75aa9f 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -4,7 +4,7 @@ #include <linux/pci.h> #include <asm/dma.h> #include <asm/io.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index e5be819492ef..d5a456d27d82 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,7 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen : CPUID4 emulation on AMD. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -135,7 +135,7 @@ unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: - No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + L2 not shared, no SMT etc. that is currently true on AMD CPUs. In theory the TLBs could be reported as fake type (they are in "dummy"). 
Maybe later */ @@ -159,13 +159,26 @@ union l2_cache { unsigned val; }; +union l3_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned res : 2; + unsigned size_encoded : 14; + }; + unsigned val; +}; + static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, + [8] = 16, [0xa] = 32, [0xb] = 48, + [0xc] = 64, [0xf] = 0xffff // ?? - }; -static const unsigned char levels[] = { 1, 1, 2 }; -static const unsigned char types[] = { 1, 2, 3 }; +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -175,37 +188,58 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, unsigned line_size, lines_per_tag, assoc, size_in_kb; union l1_cache l1i, l1d; union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; ecx->full = 0; cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); - cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); - - if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) - return; - - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - - if (leaf <= 1) { - union l1_cache *l1 = leaf == 0 ? 
&l1d : &l1i; + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; assoc = l1->assoc; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; - } else { + break; + case 2: + if (!l2.val) + return; assoc = l2.assoc; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ size_in_kb = current_cpu_data.x86_cache_size; + break; + case 3: + if (!l3.val) + return; + assoc = l3.assoc; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + break; + default: + return; } + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + if (leaf == 3) + eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + else + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (assoc == 0xf) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; @@ -239,8 +273,7 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le return 0; } -/* will only be called once; __init is safe here */ -static int __init find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; union _cpuid4_leaf_eax cache_eax; @@ -710,7 +743,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return retval; } -static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 56cd485b127c..34c781eddee4 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static 
unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 6b5d3518a1c0..bf39409b3838 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { on_each_cpu(mce_checkregs, NULL, 1, 1); - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } static int __init init_nonfatal_mce_checker(void) @@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_checker(void) /* * Check for non-fatal errors every MCE_RATE s */ - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); return 0; } diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 1001f1e0fe6d..2287d4863a8a 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -3,6 +3,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/processor-cyrix.h> #include "mtrr.h" int arr3_protected; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f6e46943e6ef..56f64e34829f 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -79,7 +79,7 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) } /* Grab all of the MTRR state for this CPU into *state */ -void get_mtrr_state(void) +void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; diff --git 
a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 75dc6d5214bc..c48b6fea5ab4 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { * initialized (i.e. before smp_init()). * */ -__init void mtrr_bp_init(void) +void __init mtrr_bp_init(void) { init_ifs(); diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c index 7b39a2f954d9..c9014ca4a575 100644 --- a/arch/i386/kernel/cpu/mtrr/state.c +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -3,6 +3,7 @@ #include <asm/io.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm-i386/processor-cyrix.h> #include "mtrr.h" diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c index 4d26d514c56f..4be488e73bee 100644 --- a/arch/i386/kernel/cpu/perfctr-watchdog.c +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -325,7 +325,7 @@ static struct wd_ops k7_wd_ops = { .stop = single_msr_stop_watchdog, .perfctr = MSR_K7_PERFCTR0, .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<63, + .checkbit = 1ULL<<47, }; /* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ @@ -346,7 +346,9 @@ static int setup_p6_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P6_PERFCTR0; evntsel_msr = MSR_P6_EVNTSEL0; - wrmsrl(perfctr_msr, 0UL); + /* KVM doesn't implement this MSR */ + if (wrmsr_safe(perfctr_msr, 0, 0) < 0) + return 0; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -599,8 +601,8 @@ static struct wd_ops intel_arch_wd_ops = { .setup = setup_intel_arch_watchdog, .rearm = p6_rearm, .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c deleted file mode 100644 index 50076f22e90f..000000000000 --- a/arch/i386/kernel/cpu/rise.c +++ 
/dev/null @@ -1,52 +0,0 @@ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/processor.h> - -#include "cpu.h" - -static void __cpuinit init_rise(struct cpuinfo_x86 *c) -{ - printk("CPU: Rise iDragon"); - if (c->x86_model > 2) - printk(" II"); - printk("\n"); - - /* Unhide possibly hidden capability flags - The mp6 iDragon family don't have MSRs. - We switch on extra features with this cpuid weirdness: */ - __asm__ ( - "movl $0x6363452a, %%eax\n\t" - "movl $0x3231206c, %%ecx\n\t" - "movl $0x2a32313a, %%edx\n\t" - "cpuid\n\t" - "movl $0x63634523, %%eax\n\t" - "movl $0x32315f6c, %%ecx\n\t" - "movl $0x2333313a, %%edx\n\t" - "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx" - ); - set_bit(X86_FEATURE_CX8, c->x86_capability); -} - -static struct cpu_dev rise_cpu_dev __cpuinitdata = { - .c_vendor = "Rise", - .c_ident = { "RiseRiseRise" }, - .c_models = { - { .vendor = X86_VENDOR_RISE, .family = 5, .model_names = - { - [0] = "iDragon", - [2] = "iDragon", - [8] = "iDragon II", - [9] = "iDragon II" - } - }, - }, - .c_init = init_rise, -}; - -int __init rise_init_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev; - return 0; -} - diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index fc822a46897a..e60cddbc4cfb 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -320,6 +321,37 @@ static int __init request_standard_resources(void) subsys_initcall(request_standard_resources); +#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +/** + * e820_mark_nosave_regions - Find the ranges of physical addresses that do not + * correspond to e820 RAM areas and mark the corresponding pages as nosave for + * hibernation. 
+ * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= max_low_pfn) + break; + } +} +#endif + void __init add_memory_region(unsigned long long start, unsigned long long size, int type) { diff --git a/arch/i386/kernel/geode.c b/arch/i386/kernel/geode.c new file mode 100644 index 000000000000..41e8aec4c61d --- /dev/null +++ b/arch/i386/kernel/geode.c @@ -0,0 +1,155 @@ +/* + * AMD Geode southbridge support code + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <asm/msr.h> +#include <asm/geode.h> + +static struct { + char *name; + u32 msr; + int size; + u32 base; +} lbars[] = { + { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, + { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, + { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, + { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } +}; + +static void __init init_lbars(void) +{ + u32 lo, hi; + int i; + + for (i = 0; i < ARRAY_SIZE(lbars); i++) { + rdmsr(lbars[i].msr, lo, hi); + if (hi & 0x01) + lbars[i].base = lo & 0x0000ffff; + + if (lbars[i].base == 0) + printk(KERN_ERR "geode: Couldn't initialize '%s'\n", + lbars[i].name); + } +} + +int geode_get_dev_base(unsigned int dev) +{ + BUG_ON(dev >= ARRAY_SIZE(lbars)); + return lbars[dev].base; +} +EXPORT_SYMBOL_GPL(geode_get_dev_base); + +/* === GPIO API === */ + +void geode_gpio_set(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << gpio, base + reg); + else + outl(1 << (gpio - 16), base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_set); + +void geode_gpio_clear(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << (gpio + 16), base + reg); + else + outl(1 << gpio, base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_clear); + +int geode_gpio_isset(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return 0; + + if (gpio < 16) + return (inl(base + reg) & (1 << gpio)) ? 1 : 0; + else + return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 
1 : 0; +} +EXPORT_SYMBOL_GPL(geode_gpio_isset); + +void geode_gpio_set_irq(unsigned int group, unsigned int irq) +{ + u32 lo, hi; + + if (group > 7 || irq > 15) + return; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); +} +EXPORT_SYMBOL_GPL(geode_gpio_set_irq); + +void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 offset, shift, val; + + if (gpio >= 24) + offset = GPIO_MAP_W; + else if (gpio >= 16) + offset = GPIO_MAP_Z; + else if (gpio >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + shift = (gpio % 8) * 4; + + val = inl(base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* And set the new value */ + + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + + if (pme) + val |= (1 << (shift + 3)); + + outl(val, base + offset); +} +EXPORT_SYMBOL_GPL(geode_gpio_setup_event); + +static int __init geode_southbridge_init(void) +{ + if (!is_geode()) + return -ENODEV; + + init_lbars(); + return 0; +} + +postcore_initcall(geode_southbridge_init); diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17d73459fc5f..533d4932bc79 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> +#include <linux/delay.h> #include <asm/hpet.h> #include <asm/io.h> @@ -187,6 +188,10 @@ static void hpet_set_mode(enum clock_event_mode mode, cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +222,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -226,7 +232,8 @@ int __init hpet_enable(void) { unsigned long id; uint64_t hpet_freq; - u64 
tmp; + u64 tmp, start, now; + cycle_t t1; if (!is_hpet_capable()) return 0; @@ -273,6 +280,27 @@ int __init hpet_enable(void) /* Start the counter */ hpet_start_counter(); + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + goto out_nohpet; + } + /* Initialize and register HPET clocksource * * hpet period is in femto seconds per cycle @@ -291,7 +319,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -299,7 +326,7 @@ int __init hpet_enable(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask =cpumask_of_cpu(0); + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; return 1; @@ -524,68 +551,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - 
-static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index f8a3c4054c70..6d839f2f1b1a 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -3,18 +3,17 @@ * */ #include <linux/clockchips.h> -#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/interrupt.h> #include <linux/jiffies.h> -#include <linux/sysdev.h> #include <linux/module.h> -#include <linux/init.h> +#include <linux/spinlock.h> #include <asm/smp.h> #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> - -#include "io_ports.h" +#include <asm/timer.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -41,26 +40,27 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. 
- */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 21db8f56c9a1..893df8280756 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -353,14 +353,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <linux/slab.h> /* kmalloc() */ # include <linux/timer.h> /* time_after() */ -#ifdef CONFIG_BALANCED_IRQ_DEBUG -# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -# define Dprintk(x...) do { TDprintk(x); } while (0) -# else -# define TDprintk(x...) -# define Dprintk(x...) 
-# endif - #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -443,7 +435,7 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; - Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { for (j = 0; j < NR_IRQS; j++) { if (!irq_desc[j].action) @@ -560,19 +552,11 @@ tryanothercpu: max_loaded = tmp_loaded; /* processor */ imbalance = (max_cpu_irq - min_cpu_irq) / 2; - Dprintk("max_loaded cpu = %d\n", max_loaded); - Dprintk("min_loaded cpu = %d\n", min_loaded); - Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); - Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); - Dprintk("load imbalance = %lu\n", imbalance); - /* if imbalance is less than approx 10% of max load, then * observe diminishing returns action. - quit */ - if (imbalance < (max_cpu_irq >> 3)) { - Dprintk("Imbalance too trivial\n"); + if (imbalance < (max_cpu_irq >> 3)) goto not_worth_the_effort; - } tryanotherirq: /* if we select an IRQ to move that can't go where we want, then @@ -629,9 +613,6 @@ tryanotherirq: cpus_and(tmp, target_cpu_mask, allowed_mask); if (!cpus_empty(tmp)) { - - Dprintk("irq = %d moved to cpu = %d\n", - selected_irq, min_loaded); /* mark for change destination */ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); @@ -651,7 +632,6 @@ not_worth_the_effort: */ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - Dprintk("IRQ worth rotating not found\n"); return; } @@ -1902,7 +1882,7 @@ __setup("no_timer_check", notimercheck); * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -int __init timer_irq_works(void) +static int __init timer_irq_works(void) { unsigned long t1 = jiffies; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index ba44d40b066d..dd2b97fc00b2 100644 --- a/arch/i386/kernel/irq.c 
+++ b/arch/i386/kernel/irq.c @@ -149,15 +149,11 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_4KSTACKS -/* - * These should really be __section__(".bss.page_aligned") as well, but - * gcc's 3.0 and earlier don't handle that correctly. - */ static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); /* * allocate per-cpu stacks for hardirq and for softirq processing diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index dde828a333c3..448a50b1324c 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); @@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 03b7f5584d71..99beac7f96ce 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -353,7 +353,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * Take the local apic timer and PIT/HPET into account. 
We don't * know which one is active, when we have highres/dyntick on */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 53f07a8275e3..ea962c0667d5 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -124,20 +124,28 @@ unsigned paravirt_patch_ignore(unsigned len) return len; } +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, void *site, u16 site_clobbers, unsigned len) { unsigned char *call = site; unsigned long delta = (unsigned long)target - (unsigned long)(call+5); + struct branch b; if (tgt_clobbers & ~site_clobbers) return len; /* target would clobber too much for this site */ if (len < 5) return len; /* call too long for patch site */ - *call++ = 0xe8; /* call */ - *(unsigned long *)call = delta; + b.opcode = 0xe8; /* call */ + b.delta = delta; + BUILD_BUG_ON(sizeof(b) != 5); + text_poke(call, (unsigned char *)&b, 5); return 5; } @@ -146,12 +154,14 @@ unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { unsigned char *jmp = site; unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); + struct branch b; if (len < 5) return len; /* call too long for patch site */ - *jmp++ = 0xe9; /* jmp */ - *(unsigned long *)jmp = delta; + b.opcode = 0xe9; /* jmp */ + b.delta = delta; + text_poke(jmp, (unsigned char *)&b, 5); return 5; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 6c49acb96982..84664710b784 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -300,6 +300,7 @@ early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 
0L; + unsigned long d0, d1, d2, d3, d6, d7; printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); @@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR6: %08lx DR7: %08lx\n", d6, d7); + show_trace(NULL, regs, ®s->esp); } diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 5513f8d5b5be..0d796248866c 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -113,6 +113,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0WF810"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 74871d066c2b..d474cd639bcb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -273,18 +273,18 @@ unsigned long __init find_max_low_pfn(void) printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); else printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE +#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING "Warning 
only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); } -#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } else { if (highmem_pages == -1) @@ -466,7 +466,7 @@ void __init setup_bootmem_allocator(void) * * This should all compile down to nothing when NUMA is off. */ -void __init remapped_pgdat_init(void) +static void __init remapped_pgdat_init(void) { int nid; @@ -640,6 +640,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_register_memory(); + e820_mark_nosave_regions(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f77..f5dd85656c18 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? 
KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 5910d3fac561..e4f61d1c6248 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -308,7 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -void set_cpu_sibling_map(int cpu) +void __cpuinit set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index ff4ee6f3326b..6deb159d08e0 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - return 0; + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a665df61f08c..19a6c678d02e 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void) return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... 
- */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 3e7753c78b9b..cfffe3dd9e83 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.esp; } @@ -211,6 +211,7 @@ static void print_trace_address(void *data, unsigned long addr) { printk("%s [<%08lx>] ", (char *)data, addr); print_symbol("%s\n", addr); + touch_nmi_watchdog(); } static struct stacktrace_ops print_trace_ops = { @@ -617,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; @@ -767,6 +775,8 @@ static __kprobes void 
default_do_nmi(struct pt_regs * regs) reassert_nmi(); } +static int ignore_nmis; + fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -777,11 +787,24 @@ fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index f9b845f4e692..b1b5ab08b26e 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -32,6 +32,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/i8253.h> #include <irq_vectors.h> #include "io_ports.h" @@ -142,6 +143,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index 22d8ac5815f0..4d105fdfe817 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -4,7 +4,7 @@ lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ - bitops.o semaphore.o + bitops.o semaphore.o string.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o diff --git a/arch/i386/lib/string.c b/arch/i386/lib/string.c new file mode 100644 index 000000000000..2c773fefa3dd --- /dev/null +++ b/arch/i386/lib/string.c @@ -0,0 +1,257 @@ +/* + * Most of the string-functions are rather heavily hand-optimized, + * see especially strsep,strstr,str[c]spn. They should work, but are not + * very easy to understand. Everything is done entirely within the register + * set, making the functions fast and clean. 
String instructions have been + * used through-out, making for "slightly" unclear code :-) + * + * AK: On P4 and K7 using non string instruction implementations might be faster + * for large memory blocks. But most of them are unlikely to be used on large + * strings. + */ + +#include <linux/string.h> +#include <linux/module.h> + +#ifdef __HAVE_ARCH_STRCPY +char *strcpy(char * dest,const char *src) +{ + int d0, d1, d2; + asm volatile( "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2) + :"0" (src),"1" (dest) : "memory"); + return dest; +} +EXPORT_SYMBOL(strcpy); +#endif + +#ifdef __HAVE_ARCH_STRNCPY +char *strncpy(char * dest,const char *src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "1:\tdecl %2\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "rep\n\t" + "stosb\n" + "2:" + : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) + :"0" (src),"1" (dest),"2" (count) : "memory"); + return dest; +} +EXPORT_SYMBOL(strncpy); +#endif + +#ifdef __HAVE_ARCH_STRCAT +char *strcat(char * dest,const char * src) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n" + "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + return dest; +} +EXPORT_SYMBOL(strcat); +#endif + +#ifdef __HAVE_ARCH_STRNCAT +char *strncat(char * dest,const char * src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n\t" + "movl %8,%3\n" + "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %2,%2\n\t" + "stosb" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count) + : "memory"); + return dest; +} +EXPORT_SYMBOL(strncat); +#endif + +#ifdef __HAVE_ARCH_STRCMP +int strcmp(const char * cs,const char * 
ct) +{ + int d0, d1; + int res; + asm volatile( "1:\tlodsb\n\t" + "scasb\n\t" + "jne 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "xorl %%eax,%%eax\n\t" + "jmp 3f\n" + "2:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "3:" + :"=a" (res), "=&S" (d0), "=&D" (d1) + :"1" (cs),"2" (ct) + :"memory"); + return res; +} +EXPORT_SYMBOL(strcmp); +#endif + +#ifdef __HAVE_ARCH_STRNCMP +int strncmp(const char * cs,const char * ct,size_t count) +{ + int res; + int d0, d1, d2; + asm volatile( "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "scasb\n\t" + "jne 3f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %%eax,%%eax\n\t" + "jmp 4f\n" + "3:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "4:" + :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + :"1" (cs),"2" (ct),"3" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strncmp); +#endif + +#ifdef __HAVE_ARCH_STRCHR +char *strchr(const char * s, int c) +{ + int d0; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "je 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "movl $1,%1\n" + "2:\tmovl %1,%0\n\t" + "decl %0" + :"=a" (res), "=&S" (d0) + :"1" (s),"0" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strchr); +#endif + +#ifdef __HAVE_ARCH_STRRCHR +char *strrchr(const char * s, int c) +{ + int d0, d1; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "jne 2f\n\t" + "leal -1(%%esi),%0\n" + "2:\ttestb %%al,%%al\n\t" + "jne 1b" + :"=g" (res), "=&S" (d0), "=&a" (d1) + :"0" (0),"1" (s),"2" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strrchr); +#endif + +#ifdef __HAVE_ARCH_STRLEN +size_t strlen(const char * s) +{ + int d0; + int res; + asm volatile( "repne\n\t" + "scasb\n\t" + "notl %0\n\t" + "decl %0" + :"=c" (res), "=&D" (d0) + :"1" (s),"a" (0), "0" (0xffffffffu) + :"memory"); + return res; +} +EXPORT_SYMBOL(strlen); +#endif + +#ifdef __HAVE_ARCH_MEMCHR +void *memchr(const void *cs,int c,size_t count) +{ + int d0; + void *res; + if 
(!count) + return NULL; + asm volatile( "repne\n\t" + "scasb\n\t" + "je 1f\n\t" + "movl $1,%0\n" + "1:\tdecl %0" + :"=D" (res), "=&c" (d0) + :"a" (c),"0" (cs),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(memchr); +#endif + +#ifdef __HAVE_ARCH_MEMSCAN +void *memscan(void * addr, int c, size_t size) +{ + if (!size) + return addr; + asm volatile("repnz; scasb\n\t" + "jnz 1f\n\t" + "dec %%edi\n" + "1:" + : "=D" (addr), "=c" (size) + : "0" (addr), "1" (size), "a" (c) + : "memory"); + return addr; +} +EXPORT_SYMBOL(memscan); +#endif + +#ifdef __HAVE_ARCH_STRNLEN +size_t strnlen(const char *s, size_t count) +{ + int d0; + int res; + asm volatile( "movl %2,%0\n\t" + "jmp 2f\n" + "1:\tcmpb $0,(%0)\n\t" + "je 3f\n\t" + "incl %0\n" + "2:\tdecl %1\n\t" + "cmpl $-1,%1\n\t" + "jne 1b\n" + "3:\tsubl %2,%0" + :"=a" (res), "=&d" (d0) + :"c" (s),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strnlen); +#endif diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index e92a10124935..01ffdd4964f0 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -469,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? 
KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 6e72f22e6bbd..c3b9905af2d5 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -471,6 +471,10 @@ void zap_low_mappings (void) flush_tlb_all(); } +int nx_enabled = 0; + +#ifdef CONFIG_X86_PAE + static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -500,9 +504,6 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - static void __init set_nx(void) { unsigned int v[4], l, h; @@ -799,17 +800,9 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); - } -#endif + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); start += size; size = (unsigned long)__end_rodata - start; change_page_attr(virt_to_page(start), diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index fff08ae7b5ed..0b278315d737 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr) /* Reset the direct mapping. 
Can block */ if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + get_vm_area_size(p) >> PAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 37992ffb1633..8927222b3ab2 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg) struct page *p; /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { + if (cpu_has_clflush) { list_for_each_entry (p, lh, lru) cache_flush_page(p); } else if (boot_cpu_data.x86_model >= 4) @@ -136,6 +136,12 @@ static inline void revert_page(struct page *kpte_page, unsigned long address) ref_prot)); } +static inline void save_page(struct page *kpte_page) +{ + if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) + list_add(&kpte_page->lru, &df_list); +} + static int __change_page_attr(struct page *page, pgprot_t prot) { @@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!kpte) return -EINVAL; kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); @@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pgprot_t prot) * time (not via split_large_page) and in turn we must not * replace it with a largepage. 
*/ + + save_page(kpte_page); if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { - ClearPagePrivate(kpte_page); paravirt_release_pt(page_to_pfn(kpte_page)); - list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } } @@ -236,6 +245,11 @@ void global_flush_tlb(void) spin_unlock_irq(&cpa_lock); flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) + continue; + ClearPagePrivate(pg); __free_page(pg); } } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 8d7c0864cc04..01437c46baae 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -235,7 +235,7 @@ static inline void pgd_list_del(pgd_t *pgd) #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { unsigned long flags; @@ -257,7 +257,7 @@ void pgd_ctor(void *pgd) } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -276,7 +276,7 @@ void pgd_ctor(void *pgd) } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd) +static void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c index b33aea845f58..bc8a44bddaa7 100644 --- a/arch/i386/pci/acpi.c +++ b/arch/i386/pci/acpi.c @@ -8,20 +8,42 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) { struct pci_bus *bus; + struct pci_sysdata *sd; + int pxm; + + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. 
+ */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } if (domain != 0) { printk(KERN_WARNING "PCI: Multiple domains not supported\n"); + kfree(sd); return NULL; } - bus = pcibios_scan_root(busnum); + sd->node = -1; + + pxm = acpi_get_pxm(device->handle); +#ifdef CONFIG_ACPI_NUMA + if (pxm >= 0) + sd->node = pxm_to_node(pxm); +#endif + + bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + if (!bus) + kfree(sd); + #ifdef CONFIG_ACPI_NUMA if (bus != NULL) { - int pxm = acpi_get_pxm(device->handle); if (pxm >= 0) { - bus->sysdata = (void *)(unsigned long)pxm_to_node(pxm); - printk("bus %d -> pxm %d -> node %ld\n", - busnum, pxm, (long)(bus->sysdata)); + printk("bus %d -> pxm %d -> node %d\n", + busnum, pxm, sd->node); } } #endif diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 3f78d4d8ecf3..85503deeda46 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -293,6 +293,7 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; + struct pci_sysdata *sd; dmi_check_system(pciprobe_dmi_table); @@ -303,9 +304,19 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. 
+ */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL); + return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); } extern u8 pci_cache_line_size; diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index c7cabeed4d7b..4df637e34f81 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -24,6 +24,9 @@ DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); +/* Indicate if the mmcfg resources have been placed into the resource table. */ +static int __initdata pci_mmcfg_resources_inserted; + /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(void) +static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_resources(void) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_MEM | resource_flags; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } + + /* Mark that the resources have been inserted. 
*/ + pci_mmcfg_resources_inserted = 1; } static void __init pci_mmcfg_reject_broken(int type) @@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type) if (type == 1) unreachable_devices(); if (known_bridge) - pci_mmcfg_insert_resources(); + pci_mmcfg_insert_resources(IORESOURCE_BUSY); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } else { + /* + * Signal not to attempt to insert mmcfg resources because + * the architecture mmcfg setup could not initialize. + */ + pci_mmcfg_resources_inserted = 1; } } + +static int __init pci_mmcfg_late_insert_resources(void) +{ + /* + * If resources are already inserted or we are not using MMCONFIG, + * don't insert the resources. + */ + if ((pci_mmcfg_resources_inserted == 1) || + (pci_probe & PCI_PROBE_MMCONF) == 0 || + (pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return 1; + + /* + * Attempt to insert the mmcfg resources but not with the busy flag + * marked so it won't cause request errors when __request_region is + * called. + */ + pci_mmcfg_insert_resources(0); + + return 0; +} + +/* + * Perform MMCONFIG resource insertion after PCI initialization to allow for + * misprogrammed MCFG tables that state larger sizes but actually conflict + * with other system resources. 
+ */ +late_initcall(pci_mmcfg_late_insert_resources); diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index 51fdabf1fd4d..dfd6db69ead5 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -412,6 +412,7 @@ static void xen_timerop_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_UNUSED: @@ -474,6 +475,8 @@ static void xen_vcpuop_set_mode(enum clock_event_mode mode, HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) BUG(); break; + case CLOCK_EVT_MODE_RESUME: + break; } } diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S index 2998d55a0017..bc71f3bc4014 100644 --- a/arch/i386/xen/xen-head.S +++ b/arch/i386/xen/xen-head.S @@ -7,6 +7,7 @@ #include <asm/boot.h> #include <xen/interface/elfnote.h> + .section .init.text ENTRY(startup_xen) movl %esi,xen_start_info cld @@ -19,6 +20,7 @@ ENTRY(hypercall_page) .skip 0x1000 .popsection + .section .text ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index e1189ba1ca5e..1cfab326fb7e 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -226,7 +226,7 @@ elf32_set_personality (void) } static unsigned long -elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused) +elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) { unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; diff --git a/arch/powerpc/boot/ps3-head.S b/arch/powerpc/boot/ps3-head.S index 1a6d64a68df5..a55c2735f759 100644 --- a/arch/powerpc/boot/ps3-head.S +++ b/arch/powerpc/boot/ps3-head.S @@ -20,6 +20,8 @@ #include "ppc_asm.h" + .machine "ppc64" + .text /* diff --git a/arch/powerpc/boot/ps3-hvcall.S b/arch/powerpc/boot/ps3-hvcall.S 
index c8b7df3210d1..585965f7e6a8 100644 --- a/arch/powerpc/boot/ps3-hvcall.S +++ b/arch/powerpc/boot/ps3-hvcall.S @@ -20,6 +20,8 @@ #include "ppc_asm.h" + .machine "ppc64" + /* * The PS3 hypervisor uses a 64 bit "C" language calling convention. * The routines here marshal arguments between the 32 bit wrapper diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c index 06c7e77e097a..eb4b512d65fa 100644 --- a/arch/powerpc/mm/tlb_32.c +++ b/arch/powerpc/mm/tlb_32.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/pagemap.h> + #include <asm/tlbflush.h> #include <asm/tlb.h> diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 7de4e919687b..4100ddc52f02 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -2177,8 +2177,8 @@ struct tree_descr spufs_dir_contents[] = { { "mbox_stat", &spufs_mbox_stat_fops, 0444, }, { "ibox_stat", &spufs_ibox_stat_fops, 0444, }, { "wbox_stat", &spufs_wbox_stat_fops, 0444, }, - { "signal1", &spufs_signal1_fops, 0666, }, - { "signal2", &spufs_signal2_fops, 0666, }, + { "signal1", &spufs_signal1_nosched_fops, 0222, }, + { "signal2", &spufs_signal2_nosched_fops, 0222, }, { "signal1_type", &spufs_signal1_type, 0666, }, { "signal2_type", &spufs_signal2_type, 0666, }, { "cntl", &spufs_cntl_fops, 0666, }, diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index a05079b07696..d4fc74f7bb15 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -102,4 +102,40 @@ config PS3_STORAGE depends on PPC_PS3 tristate +config PS3_DISK + tristate "PS3 Disk Storage Driver" + depends on PPC_PS3 && BLOCK + select PS3_STORAGE + help + Include support for the PS3 Disk Storage. + + This support is required to access the PS3 hard disk. + In general, all users will say Y or M. 
+ +config PS3_ROM + tristate "PS3 BD/DVD/CD-ROM Storage Driver" + depends on PPC_PS3 && SCSI + select PS3_STORAGE + help + Include support for the PS3 ROM Storage. + + This support is required to access the PS3 BD/DVD/CD-ROM drive. + In general, all users will say Y or M. + Also make sure to say Y or M to "SCSI CDROM support" later. + +config PS3_FLASH + tristate "PS3 FLASH ROM Storage Driver" + depends on PPC_PS3 + select PS3_STORAGE + help + Include support for the PS3 FLASH ROM Storage. + + This support is required to access the PS3 FLASH ROM, which + contains the boot loader and some boot options. + In general, all users will say Y or M. + + As this driver needs a fixed buffer of 256 KiB of memory, it can + be disabled on the kernel command line using "ps3flash=off", to + not allocate this fixed buffer. + endmenu diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c index 097ebd49f1bf..7aca37d79766 100644 --- a/arch/sh/kernel/timers/timer-tmu.c +++ b/arch/sh/kernel/timers/timer-tmu.c @@ -80,6 +80,7 @@ static void tmu_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_RESUME: break; } } diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S index 831f540251f8..eac38388f5fd 100644 --- a/arch/sparc/kernel/entry.S +++ b/arch/sparc/kernel/entry.S @@ -1749,8 +1749,8 @@ fpload: __ndelay: save %sp, -STACKFRAME_SZ, %sp mov %i0, %o0 - call .umul - mov 0x1ad, %o1 ! 2**32 / (1 000 000 000 / HZ) + call .umul ! round multiplier up so large ns ok + mov 0x1ae, %o1 ! 2**32 / (1 000 000 000 / HZ) call .umul mov %i1, %o1 ! udelay_val ba delay_continue @@ -1760,11 +1760,17 @@ __ndelay: __udelay: save %sp, -STACKFRAME_SZ, %sp mov %i0, %o0 - sethi %hi(0x10c6), %o1 + sethi %hi(0x10c7), %o1 ! round multiplier up so large us ok call .umul - or %o1, %lo(0x10c6), %o1 ! 2**32 / 1 000 000 + or %o1, %lo(0x10c7), %o1 ! 2**32 / 1 000 000 call .umul mov %i1, %o1 ! 
udelay_val + sethi %hi(0x028f4b62), %l0 ! Add in rounding constant * 2**32, + or %g0, %lo(0x028f4b62), %l0 + addcc %o0, %l0, %o0 ! 2**32 * 0.009 999 + bcs,a 3f + add %o1, 0x01, %o1 +3: call .umul mov HZ, %o0 ! >>32 earlier for wider range diff --git a/arch/sparc/kernel/irq.c b/arch/sparc/kernel/irq.c index f257a67bcf93..75b2240ad0f9 100644 --- a/arch/sparc/kernel/irq.c +++ b/arch/sparc/kernel/irq.c @@ -47,6 +47,8 @@ #include <asm/cacheflush.h> #include <asm/irq_regs.h> +#include "irq.h" + #ifdef CONFIG_SMP #define SMP_NOP2 "nop; nop;\n\t" #define SMP_NOP3 "nop; nop; nop;\n\t" @@ -268,7 +270,7 @@ void free_irq(unsigned int irq, void *dev_id) kfree(action); if (!sparc_irq[cpu_irq].action) - disable_irq(irq); + __disable_irq(irq); out_unlock: spin_unlock_irqrestore(&irq_action_lock, flags); @@ -464,7 +466,7 @@ int request_fast_irq(unsigned int irq, sparc_irq[cpu_irq].action = action; - enable_irq(irq); + __enable_irq(irq); ret = 0; out_unlock: @@ -544,7 +546,7 @@ int request_irq(unsigned int irq, *actionp = action; - enable_irq(irq); + __enable_irq(irq); ret = 0; out_unlock: @@ -555,6 +557,25 @@ out: EXPORT_SYMBOL(request_irq); +void disable_irq_nosync(unsigned int irq) +{ + return __disable_irq(irq); +} +EXPORT_SYMBOL(disable_irq_nosync); + +void disable_irq(unsigned int irq) +{ + return __disable_irq(irq); +} +EXPORT_SYMBOL(disable_irq); + +void enable_irq(unsigned int irq) +{ + return __enable_irq(irq); +} + +EXPORT_SYMBOL(enable_irq); + /* We really don't need these at all on the Sparc. We only have * stubs here because they are exported to modules. */ diff --git a/arch/sparc/kernel/irq.h b/arch/sparc/kernel/irq.h new file mode 100644 index 000000000000..32ef3ebd0a88 --- /dev/null +++ b/arch/sparc/kernel/irq.h @@ -0,0 +1,68 @@ +#include <asm/btfixup.h> + +/* Dave Redman (djhr@tadpole.co.uk) + * changed these to function pointers.. 
it saves cycles and will allow + * the irq dependencies to be split into different files at a later date + * sun4c_irq.c, sun4m_irq.c etc so we could reduce the kernel size. + * Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Changed these to btfixup entities... It saves cycles :) + */ + +BTFIXUPDEF_CALL(void, disable_irq, unsigned int) +BTFIXUPDEF_CALL(void, enable_irq, unsigned int) +BTFIXUPDEF_CALL(void, disable_pil_irq, unsigned int) +BTFIXUPDEF_CALL(void, enable_pil_irq, unsigned int) +BTFIXUPDEF_CALL(void, clear_clock_irq, void) +BTFIXUPDEF_CALL(void, clear_profile_irq, int) +BTFIXUPDEF_CALL(void, load_profile_irq, int, unsigned int) + +static inline void __disable_irq(unsigned int irq) +{ + BTFIXUP_CALL(disable_irq)(irq); +} + +static inline void __enable_irq(unsigned int irq) +{ + BTFIXUP_CALL(enable_irq)(irq); +} + +static inline void disable_pil_irq(unsigned int irq) +{ + BTFIXUP_CALL(disable_pil_irq)(irq); +} + +static inline void enable_pil_irq(unsigned int irq) +{ + BTFIXUP_CALL(enable_pil_irq)(irq); +} + +static inline void clear_clock_irq(void) +{ + BTFIXUP_CALL(clear_clock_irq)(); +} + +static inline void clear_profile_irq(int irq) +{ + BTFIXUP_CALL(clear_profile_irq)(irq); +} + +static inline void load_profile_irq(int cpu, int limit) +{ + BTFIXUP_CALL(load_profile_irq)(cpu, limit); +} + +extern void (*sparc_init_timers)(irq_handler_t lvl10_irq); + +extern void claim_ticker14(irq_handler_t irq_handler, + int irq, + unsigned int timeout); + +#ifdef CONFIG_SMP +BTFIXUPDEF_CALL(void, set_cpu_int, int, int) +BTFIXUPDEF_CALL(void, clear_cpu_int, int, int) +BTFIXUPDEF_CALL(void, set_irq_udt, int) + +#define set_cpu_int(cpu,level) BTFIXUP_CALL(set_cpu_int)(cpu,level) +#define clear_cpu_int(cpu,level) BTFIXUP_CALL(clear_cpu_int)(cpu,level) +#define set_irq_udt(cpu) BTFIXUP_CALL(set_irq_udt)(cpu) +#endif diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 791771196905..f2eae457fc9a 100644 --- a/arch/sparc/kernel/pcic.c +++ 
b/arch/sparc/kernel/pcic.c @@ -36,6 +36,7 @@ #include <asm/uaccess.h> #include <asm/irq_regs.h> +#include "irq.h" /* * I studied different documents and many live PROMs both from 2.30 diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c index 4fea3ac7bff0..6724ab90f82b 100644 --- a/arch/sparc/kernel/smp.c +++ b/arch/sparc/kernel/smp.c @@ -33,6 +33,8 @@ #include <asm/tlbflush.h> #include <asm/cpudata.h> +#include "irq.h" + int smp_num_cpus = 1; volatile unsigned long cpu_callin_map[NR_CPUS] __initdata = {0,}; unsigned char boot_cpu_id = 0; diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index d8e008a04e2b..55bac516dfe2 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -154,8 +154,6 @@ EXPORT_SYMBOL(BTFIXUP_CALL(___xchg32)); #else EXPORT_SYMBOL(BTFIXUP_CALL(__hard_smp_processor_id)); #endif -EXPORT_SYMBOL(BTFIXUP_CALL(enable_irq)); -EXPORT_SYMBOL(BTFIXUP_CALL(disable_irq)); EXPORT_SYMBOL(BTFIXUP_CALL(mmu_unlockarea)); EXPORT_SYMBOL(BTFIXUP_CALL(mmu_lockarea)); EXPORT_SYMBOL(BTFIXUP_CALL(mmu_get_scsi_sgl)); diff --git a/arch/sparc/kernel/sun4c_irq.c b/arch/sparc/kernel/sun4c_irq.c index 009e891a4329..c6ac9fc52563 100644 --- a/arch/sparc/kernel/sun4c_irq.c +++ b/arch/sparc/kernel/sun4c_irq.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/init.h> +#include "irq.h" #include <asm/ptrace.h> #include <asm/processor.h> @@ -40,6 +41,20 @@ static struct resource sun4c_timer_eb = { "sun4c_timer" }; static struct resource sun4c_intr_eb = { "sun4c_intr" }; #endif +/* + * Bit field defines for the interrupt registers on various + * Sparc machines. + */ + +/* The sun4c interrupt register. */ +#define SUN4C_INT_ENABLE 0x01 /* Allow interrupts. */ +#define SUN4C_INT_E14 0x80 /* Enable level 14 IRQ. */ +#define SUN4C_INT_E10 0x20 /* Enable level 10 IRQ. */ +#define SUN4C_INT_E8 0x10 /* Enable level 8 IRQ. */ +#define SUN4C_INT_E6 0x08 /* Enable level 6 IRQ. 
*/ +#define SUN4C_INT_E4 0x04 /* Enable level 4 IRQ. */ +#define SUN4C_INT_E1 0x02 /* Enable level 1 IRQ. */ + /* Pointer to the interrupt enable byte * * Dave Redman (djhr@tadpole.co.uk) diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c index 396797e20c39..e0efab2a6bef 100644 --- a/arch/sparc/kernel/sun4d_irq.c +++ b/arch/sparc/kernel/sun4d_irq.c @@ -39,6 +39,8 @@ #include <asm/cacheflush.h> #include <asm/irq_regs.h> +#include "irq.h" + /* If you trust current SCSI layer to handle different SCSI IRQs, enable this. I don't trust it... -jj */ /* #define DISTRIBUTE_IRQS */ @@ -188,7 +190,7 @@ void sun4d_free_irq(unsigned int irq, void *dev_id) kfree(action); if (!(*actionp)) - disable_irq(irq); + __disable_irq(irq); out_unlock: spin_unlock_irqrestore(&irq_action_lock, flags); @@ -346,7 +348,7 @@ int sun4d_request_irq(unsigned int irq, else *actionp = action; - enable_irq(irq); + __enable_irq(irq); ret = 0; out_unlock: diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 098c94f1a322..89a6de95070c 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -36,6 +36,7 @@ #include <asm/cacheflush.h> #include <asm/cpudata.h> +#include "irq.h" #define IRQ_CROSS_CALL 15 extern ctxd_t *srmmu_ctx_table_phys; diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c index 91a803ea88be..b92d6d2d5b04 100644 --- a/arch/sparc/kernel/sun4m_irq.c +++ b/arch/sparc/kernel/sun4m_irq.c @@ -38,11 +38,85 @@ #include <asm/sbus.h> #include <asm/cacheflush.h> +#include "irq.h" + +/* On the sun4m, just like the timers, we have both per-cpu and master + * interrupt registers. + */ + +/* These registers are used for sending/receiving irqs from/to + * different cpu's. + */ +struct sun4m_intreg_percpu { + unsigned int tbt; /* Interrupts still pending for this cpu. */ + + /* These next two registers are WRITE-ONLY and are only + * "on bit" sensitive, "off bits" written have NO affect. 
+ */ + unsigned int clear; /* Clear this cpus irqs here. */ + unsigned int set; /* Set this cpus irqs here. */ + unsigned char space[PAGE_SIZE - 12]; +}; + +/* + * djhr + * Actually the clear and set fields in this struct are misleading.. + * according to the SLAVIO manual (and the same applies for the SEC) + * the clear field clears bits in the mask which will ENABLE that IRQ + * the set field sets bits in the mask to DISABLE the IRQ. + * + * Also the undirected_xx address in the SLAVIO is defined as + * RESERVED and write only.. + * + * DAVEM_NOTE: The SLAVIO only specifies behavior on uniprocessor + * sun4m machines, for MP the layout makes more sense. + */ +struct sun4m_intregs { + struct sun4m_intreg_percpu cpu_intregs[SUN4M_NCPUS]; + unsigned int tbt; /* IRQ's that are still pending. */ + unsigned int irqs; /* Master IRQ bits. */ + + /* Again, like the above, two these registers are WRITE-ONLY. */ + unsigned int clear; /* Clear master IRQ's by setting bits here. */ + unsigned int set; /* Set master IRQ's by setting bits here. */ + + /* This register is both READ and WRITE. */ + unsigned int undirected_target; /* Which cpu gets undirected irqs. */ +}; + static unsigned long dummy; struct sun4m_intregs *sun4m_interrupts; unsigned long *irq_rcvreg = &dummy; +/* Dave Redman (djhr@tadpole.co.uk) + * The sun4m interrupt registers. 
+ */ +#define SUN4M_INT_ENABLE 0x80000000 +#define SUN4M_INT_E14 0x00000080 +#define SUN4M_INT_E10 0x00080000 + +#define SUN4M_HARD_INT(x) (0x000000001 << (x)) +#define SUN4M_SOFT_INT(x) (0x000010000 << (x)) + +#define SUN4M_INT_MASKALL 0x80000000 /* mask all interrupts */ +#define SUN4M_INT_MODULE_ERR 0x40000000 /* module error */ +#define SUN4M_INT_M2S_WRITE 0x20000000 /* write buffer error */ +#define SUN4M_INT_ECC 0x10000000 /* ecc memory error */ +#define SUN4M_INT_FLOPPY 0x00400000 /* floppy disk */ +#define SUN4M_INT_MODULE 0x00200000 /* module interrupt */ +#define SUN4M_INT_VIDEO 0x00100000 /* onboard video */ +#define SUN4M_INT_REALTIME 0x00080000 /* system timer */ +#define SUN4M_INT_SCSI 0x00040000 /* onboard scsi */ +#define SUN4M_INT_AUDIO 0x00020000 /* audio/isdn */ +#define SUN4M_INT_ETHERNET 0x00010000 /* onboard ethernet */ +#define SUN4M_INT_SERIAL 0x00008000 /* serial ports */ +#define SUN4M_INT_KBDMS 0x00004000 /* keyboard/mouse */ +#define SUN4M_INT_SBUSBITS 0x00003F80 /* sbus int bits */ + +#define SUN4M_INT_SBUS(x) (1 << (x+7)) +#define SUN4M_INT_VME(x) (1 << (x)) + /* These tables only apply for interrupts greater than 15.. 
* * any intr value below 0x10 is considered to be a soft-int diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index 63ed19bfd028..730eb5796f8e 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -31,6 +31,8 @@ #include <asm/oplib.h> #include <asm/cpudata.h> +#include "irq.h" + #define IRQ_RESCHEDULE 13 #define IRQ_STOP_CPU 14 #define IRQ_CROSS_CALL 15 diff --git a/arch/sparc/kernel/tick14.c b/arch/sparc/kernel/tick14.c index f1a7bd19e04f..707bfda86570 100644 --- a/arch/sparc/kernel/tick14.c +++ b/arch/sparc/kernel/tick14.c @@ -25,6 +25,8 @@ #include <asm/irq.h> #include <asm/io.h> +#include "irq.h" + extern unsigned long lvl14_save[5]; static unsigned long *linux_lvl14 = NULL; static unsigned long obp_lvl14[4]; @@ -62,7 +64,7 @@ void claim_ticker14(irq_handler_t handler, /* first we copy the obp handler instructions */ - disable_irq(irq_nr); + __disable_irq(irq_nr); if (!handler) return; @@ -79,6 +81,6 @@ void claim_ticker14(irq_handler_t handler, NULL)) { install_linux_ticker(); load_profile_irq(cpu, timeout); - enable_irq(irq_nr); + __enable_irq(irq_nr); } } diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index f2fdbb3664d3..6a2513321620 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -44,6 +44,8 @@ #include <asm/of_device.h> #include <asm/irq_regs.h> +#include "irq.h" + DEFINE_SPINLOCK(rtc_lock); enum sparc_clock_type sp_clock_typ; DEFINE_SPINLOCK(mostek_lock); diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index a532922e2e35..a1bef07755a9 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -308,6 +308,9 @@ extern void sun4c_paging_init(void); extern void srmmu_paging_init(void); extern void device_scan(void); +pgprot_t PAGE_SHARED __read_mostly; +EXPORT_SYMBOL(PAGE_SHARED); + void __init paging_init(void) { switch(sparc_cpu_model) { diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index ca26232da7ab..17b485f2825c 100644 --- 
a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -2154,7 +2154,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_SIMM13(ptrs_per_pgd, SRMMU_PTRS_PER_PGD); BTFIXUPSET_INT(page_none, pgprot_val(SRMMU_PAGE_NONE)); - BTFIXUPSET_INT(page_shared, pgprot_val(SRMMU_PAGE_SHARED)); + PAGE_SHARED = pgprot_val(SRMMU_PAGE_SHARED); BTFIXUPSET_INT(page_copy, pgprot_val(SRMMU_PAGE_COPY)); BTFIXUPSET_INT(page_readonly, pgprot_val(SRMMU_PAGE_RDONLY)); BTFIXUPSET_INT(page_kernel, pgprot_val(SRMMU_PAGE_KERNEL)); diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index bdd835fba02e..a57a366e339a 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -2155,7 +2155,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_SIMM13(user_ptrs_per_pgd, KERNBASE / SUN4C_PGDIR_SIZE); BTFIXUPSET_INT(page_none, pgprot_val(SUN4C_PAGE_NONE)); - BTFIXUPSET_INT(page_shared, pgprot_val(SUN4C_PAGE_SHARED)); + PAGE_SHARED = pgprot_val(SUN4C_PAGE_SHARED); BTFIXUPSET_INT(page_copy, pgprot_val(SUN4C_PAGE_COPY)); BTFIXUPSET_INT(page_readonly, pgprot_val(SUN4C_PAGE_READONLY)); BTFIXUPSET_INT(page_kernel, pgprot_val(SUN4C_PAGE_KERNEL)); diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index f1cc55677ff2..33dabf588bdd 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -23,6 +23,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config GENERIC_CLOCKEVENTS bool default y diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 7d36531aa5b9..d270c2f0be0f 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -280,6 +280,7 @@ EXPORT_SYMBOL(sys_getgid); EXPORT_SYMBOL(svr4_getcontext); EXPORT_SYMBOL(svr4_setcontext); EXPORT_SYMBOL(compat_sys_ioctl); +EXPORT_SYMBOL(sys_ioctl); EXPORT_SYMBOL(sparc32_open); #endif diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index e340eb401fb9..49063ca2efcd 100644 --- a/arch/sparc64/kernel/time.c +++ 
b/arch/sparc64/kernel/time.c @@ -403,58 +403,9 @@ static struct sparc64_tick_ops hbtick_operations __read_mostly = { static unsigned long timer_ticks_per_nsec_quotient __read_mostly; -#define TICK_SIZE (tick_nsec / 1000) - -#define USEC_AFTER 500000 -#define USEC_BEFORE 500000 - -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). - */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). 
*/ @@ -931,6 +882,7 @@ static void sparc64_timer_setup(enum clock_event_mode mode, { switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_SHUTDOWN: diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 14bf8ce3ea23..45f82ae6d389 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -32,6 +32,10 @@ config GENERIC_TIME_VSYSCALL bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config ZONE_DMA32 bool default y @@ -56,6 +60,14 @@ config ZONE_DMA bool default y +config QUICKLIST + bool + default y + +config NR_QUICK + int + default 2 + config ISA bool diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 29617ae3926d..128561d3e876 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -76,7 +76,8 @@ head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kern libs-y += arch/x86_64/lib/ core-y += arch/x86_64/kernel/ \ arch/x86_64/mm/ \ - arch/x86_64/crypto/ + arch/x86_64/crypto/ \ + arch/x86_64/vdso/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index c9f2da7496c1..877c0bdbbc67 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -3,8 +3,6 @@ # # create a compressed vmlinux image from the original vmlinux # -# Note all the files here are compiled/linked as 32bit executables. 
-# targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 40178e5c3104..b7c4cd04bfc3 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,19 +1,22 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:40 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_64=y CONFIG_64BIT=y CONFIG_X86=y CONFIG_GENERIC_TIME=y CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CMOS_UPDATE=y CONFIG_ZONE_DMA32=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_ZONE_DMA=y +CONFIG_QUICKLIST=y +CONFIG_NR_QUICK=2 CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -44,19 +47,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -86,10 +88,6 @@ CONFIG_SLAB=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -97,12 +95,9 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -165,9 +160,12 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y 
+CONFIG_VIRT_TO_BUS=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 +CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_HPET_TIMER=y @@ -180,7 +178,7 @@ CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y # CONFIG_CC_STACKPROTECTOR is not set @@ -201,7 +199,6 @@ CONFIG_GENERIC_PENDING_IRQ=y CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set CONFIG_SOFTWARE_SUSPEND=y CONFIG_PM_STD_PARTITION="" CONFIG_SUSPEND_SMP=y @@ -248,7 +245,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -351,20 +348,8 @@ CONFIG_IPV6_SIT=y # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -401,6 +386,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -415,21 +401,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -437,10 +411,7 @@ 
CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -458,17 +429,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -539,6 +507,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -590,11 +559,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set # CONFIG_SCSI_AIC94XX is not set # CONFIG_SCSI_ARCMSR is not set -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=y -CONFIG_MEGARAID_MAILBOX=y +# CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set -CONFIG_MEGARAID_SAS=y +# CONFIG_MEGARAID_SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set @@ -614,7 +581,6 @@ CONFIG_MEGARAID_SAS=y # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set @@ -671,10 +637,6 @@ CONFIG_SATA_VIA=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set - -# -# Multi-device support (RAID and LVM) -# CONFIG_MD=y # CONFIG_BLK_DEV_MD is not set CONFIG_BLK_DEV_DM=y @@ -692,7 +654,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set -CONFIG_FUSION_SAS=y +# CONFIG_FUSION_SAS is not set CONFIG_FUSION_MAX_SGE=128 # 
CONFIG_FUSION_CTL is not set @@ -710,7 +672,10 @@ CONFIG_IEEE1394=y # # Controllers # -# CONFIG_IEEE1394_PCILYNX is not set + +# +# Texas Instruments PCILynx requires I2C +# CONFIG_IEEE1394_OHCI1394=y # @@ -722,32 +687,19 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set CONFIG_TUN=y # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -756,10 +708,6 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y CONFIG_VORTEX=y # CONFIG_TYPHOON is not set - -# -# Tulip family network device support -# CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -773,7 +721,8 @@ CONFIG_TULIP=y # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -# CONFIG_AMD8111_ETH is not set +CONFIG_AMD8111_ETH=y +# CONFIG_AMD8111E_NAPI is not set # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y @@ -808,7 +757,6 @@ CONFIG_E1000=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set # CONFIG_SKY2 is not set -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -823,10 +771,6 @@ CONFIG_S2IO=m # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -855,15 +799,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # 
CONFIG_PHONE is not set # @@ -871,6 +807,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -936,6 +873,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -951,16 +889,11 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y -# CONFIG_HW_RANDOM_GEODE is not set # CONFIG_NVRAM is not set CONFIG_RTC=y # CONFIG_R3964 is not set @@ -979,127 +912,19 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y # CONFIG_HANGCHECK_TIMER is not set - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y -CONFIG_I2C=m -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_CHARDEV=m - -# -# I2C Algorithms -# -# CONFIG_I2C_ALGOBIT is not set -# CONFIG_I2C_ALGOPCF is not set -# CONFIG_I2C_ALGOPCA is not set - -# -# I2C Hardware Bus support -# -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_I801 is not set -# CONFIG_I2C_I810 is not set -# CONFIG_I2C_PIIX4 is not set -# CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set -# CONFIG_I2C_VIA is not set -# CONFIG_I2C_VIAPRO is not set -# CONFIG_I2C_VOODOO3 is not set - -# -# Miscellaneous I2C Chip support -# -# CONFIG_SENSORS_DS1337 is not set -# 
CONFIG_SENSORS_DS1374 is not set -# CONFIG_SENSORS_EEPROM is not set -# CONFIG_SENSORS_PCF8574 is not set -# CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_MAX6875 is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_I2C is not set # # SPI support # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set -CONFIG_HWMON=y -# CONFIG_HWMON_VID is not set -# CONFIG_SENSORS_ABITUGURU is not set -# CONFIG_SENSORS_AD7418 is not set -# CONFIG_SENSORS_ADM1021 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1029 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_K8TEMP is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_FSCHER is not set -# CONFIG_SENSORS_FSCPOS is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -CONFIG_SENSORS_CORETEMP=y -# CONFIG_SENSORS_IT87 is not set -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_MAX6650 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_PC87427 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47M192 is not set -CONFIG_SENSORS_SMSC47B397=m -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT1211 is not set -# CONFIG_SENSORS_VT8231 is not set -# 
CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83791D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83793 is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_HDAPS is not set -# CONFIG_SENSORS_APPLESMC is not set -# CONFIG_HWMON_DEBUG_CHIP is not set +# CONFIG_POWER_SUPPLY is not set +# CONFIG_HWMON is not set # # Multifunction device drivers @@ -1149,15 +974,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1168,10 +989,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1185,6 +1003,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1194,7 +1013,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -1202,6 +1020,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1292,15 +1111,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is 
not set # @@ -1320,11 +1131,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # Firmware Drivers @@ -1332,6 +1145,7 @@ CONFIG_USB_MON=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # # File systems @@ -1447,7 +1261,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1524,8 +1337,9 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set @@ -1533,6 +1347,7 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1541,8 +1356,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -CONFIG_UNWIND_INFO=y -CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1557,10 +1370,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1571,6 +1380,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index ed56a8806eab..b70f3e7cf06c 100644 --- 
a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -38,6 +38,7 @@ int sysctl_vsyscall32 = 1; +#undef ARCH_DLINFO #define ARCH_DLINFO do { \ if (sysctl_vsyscall32) { \ NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 3f66e970d86f..938278697e20 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,0 + SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d @@ -294,7 +294,7 @@ ia32_badarg: */ ENTRY(ia32_syscall) - CFI_STARTPROC simple + CFI_STARTPROC32 simple CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP /*CFI_REL_OFFSET ss,SS-RIP*/ @@ -330,6 +330,7 @@ ia32_sysret: ia32_tracesys: SAVE_REST + CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index a3d450d6c15b..8f681cae7bf7 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -20,7 +20,7 @@ #include <linux/ioport.h> #include <asm/e820.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> @@ -214,7 +214,7 @@ void __init iommu_hole_init(void) if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; - printk("Checking aperture...\n"); + printk(KERN_INFO "Checking aperture...\n"); fix = 0; for (num = 24; num < 32; num++) { diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 1b0e07bb8728..900ff38d68de 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(void) void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; - - v = APIC_DM_NMI; /* unmask and set to NMI 
*/ + + /* unmask and set to NMI */ + v = APIC_DM_NMI; apic_write(APIC_LVT0, v); } @@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq) * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. - * But don't ack when the APIC is disabled. -AK + * But don't ack when the APIC is disabled. -AK */ if (!disable_apic) ack_APIC_irq(); @@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity); * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) + * not correctly set up (usually the APIC timer won't work etc.) */ static int __init detect_init_APIC (void) @@ -789,13 +790,13 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { + if (hpet_address && hpet_use_timer) { + int trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_COUNTER) >= trigger) + /* do nothing */ ; + while (hpet_readl(HPET_COUNTER) < trigger) + /* do nothing */ ; + } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ -881,10 +882,10 @@ static unsigned int calibration_result; void __init setup_boot_APIC_clock (void) { - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; @@ -990,8 +991,8 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) +void 
setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; unsigned int v = (mask << 16) | (msg_type << 8) | vector; @@ -1128,20 +1129,6 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif irq_exit(); } @@ -1173,11 +1160,11 @@ asmlinkage void smp_error_interrupt(void) 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } -int disable_apic; +int disable_apic; /* * This initializes the IO-APIC and APIC hardware if this is @@ -1185,11 +1172,11 @@ int disable_apic; */ int __init APIC_init_uniprocessor (void) { - if (disable_apic) { + if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); - return -1; + return -1; } - if (!cpu_has_apic) { + if (!cpu_has_apic) { disable_apic = 1; printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; @@ -1211,8 +1198,8 @@ int __init APIC_init_uniprocessor (void) return 0; } -static __init int setup_disableapic(char *str) -{ +static __init int setup_disableapic(char *str) +{ disable_apic = 1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; @@ -1220,10 +1207,10 @@ static __init int setup_disableapic(char *str) early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ +static __init int setup_nolapic(char *str) +{ return setup_disableapic(str); -} +} early_param("nolapic", setup_nolapic); static int __init 
parse_lapic_timer_c2_ok(char *arg) @@ -1233,13 +1220,13 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) -{ +static __init int setup_noapictimer(char *str) +{ if (str[0] != ' ' && str[0] != 0) return 0; disable_apic_timer = 1; return 1; -} +} static __init int setup_apicmaintimer(char *str) { @@ -1264,5 +1251,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -__setup("noapictimer", setup_noapictimer); +__setup("noapictimer", setup_noapictimer); diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 13c6c37610e0..0f4d5e209e9b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -194,37 +194,6 @@ unsigned long __init e820_end_of_ram(void) } /* - * Find the hole size in the range. - */ -unsigned long __init e820_hole_size(unsigned long start, unsigned long end) -{ - unsigned long ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram); -} - -/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) @@ -289,47 +258,61 @@ void __init e820_mark_nosave_regions(void) } } -/* Walk the e820 map and register active regions within a node */ -void __init -e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) +/* + * Finds an active region in the address range from start_pfn to end_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
+ */ +static int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) { - int i; - unsigned long ei_startpfn, ei_endpfn; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; - ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) - >> PAGE_SHIFT; + *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; - /* Skip map entries smaller than a page */ - if (ei_startpfn >= ei_endpfn) - continue; + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) - end_pfn_map = ei_endpfn; + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) + end_pfn_map = *ei_endpfn; - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || - ei_endpfn <= start_pfn || - ei_startpfn >= end_pfn) - continue; + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= end_pfn) + return 0; - /* Check for overlaps */ - if (ei_startpfn < start_pfn) - ei_startpfn = start_pfn; - if (ei_endpfn > end_pfn) - ei_endpfn = end_pfn; + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > end_pfn) + *ei_endpfn = end_pfn; - /* Obey end_user_pfn to save on memmap */ - if (ei_startpfn >= end_user_pfn) - continue; - if (ei_endpfn > end_user_pfn) - ei_endpfn = end_user_pfn; + /* Obey end_user_pfn to save on memmap */ + if (*ei_startpfn >= end_user_pfn) + return 0; + if (*ei_endpfn > end_user_pfn) + *ei_endpfn = end_user_pfn; - add_active_range(nid, ei_startpfn, ei_endpfn); - } + return 1; +} + +/* Walk the e820 map and register active regions 
within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); } /* @@ -350,12 +333,35 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type) e820.nr_map++; } +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn; + unsigned long ei_endpfn; + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - (ram << PAGE_SHIFT); +} + void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 990d9c218a5d..13aa4fd728f3 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -14,6 +14,7 @@ #include <linux/pci_ids.h> #include <asm/pci-direct.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/dma.h> static void __init via_bugs(void) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a67f87bf4015..830cfc6ee8cb 100644 --- a/arch/x86_64/kernel/entry.S +++ 
b/arch/x86_64/kernel/entry.S @@ -282,7 +282,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f /* Really a signal */ @@ -375,7 +375,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -599,7 +599,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz retint_swapgs TRACE_IRQS_ON sti diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 941c84baecc8..e89abcdbdde8 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -25,7 +25,7 @@ */ .text - .section .bootstrap.text + .section .text.head .code64 .globl startup_64 startup_64: @@ -243,10 +243,16 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ +#ifndef CONFIG_HOTPLUG_CPU + .pushsection .init.data +#endif .align 8 .globl initial_code initial_code: .quad x86_64_start_kernel +#ifndef CONFIG_HOTPLUG_CPU + .popsection +#endif .globl init_rsp init_rsp: .quad init_thread_union+THREAD_SIZE-8 diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c index b8286968662d..e2d1b912e154 100644 --- a/arch/x86_64/kernel/hpet.c +++ b/arch/x86_64/kernel/hpet.c @@ -133,7 +133,7 @@ struct clocksource clocksource_hpet = { .vread = vread_hpet, }; -int hpet_arch_init(void) +int __init hpet_arch_init(void) { unsigned int id; u64 tmp; @@ -190,7 +190,7 @@ int hpet_reenable(void) */ #define TICK_COUNT 100000000 -#define TICK_MIN 5000 +#define SMI_THRESHOLD 50000 #define MAX_TRIES 5 /* @@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hpet, 
int *tsc) tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - if (tsc2 - tsc1 > TICK_MIN) + if ((tsc2 - tsc1) < SMI_THRESHOLD) break; } *hpet = hpet1; @@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 4b326655b208..948cae646099 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -444,24 +444,6 @@ void __init init_ISA_irqs (void) } } -void apic_timer_interrupt(void); -void spurious_interrupt(void); -void error_interrupt(void); -void reschedule_interrupt(void); -void call_function_interrupt(void); -void irq_move_cleanup_interrupt(void); -void invalidate_interrupt0(void); -void invalidate_interrupt1(void); -void invalidate_interrupt2(void); -void invalidate_interrupt3(void); -void invalidate_interrupt4(void); -void invalidate_interrupt5(void); -void invalidate_interrupt6(void); -void invalidate_interrupt7(void); -void thermal_interrupt(void); -void threshold_interrupt(void); -void i8254_timer_resume(void); - static void setup_timer_hardware(void) { outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1c6c6f724573..050141c0602b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value) writel(value, &io_apic->data); } +static int io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + int pending = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + entry = irq_2_pin + irq; + for (;;) { + unsigned int reg; + int pin; + + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 
+ pin*2); + /* Is the remote IRR bit set? */ + pending |= (reg >> 14) & 1; + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + return pending; +} + /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC @@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int irq) ack_APIC_irq(); /* Now we can move and renable the irq */ - move_masked_irq(irq); - if (unlikely(do_unmask_irq)) + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. 
+ */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); unmask_IO_APIC_irq(irq); + } } static struct irq_chip ioapic_chip __read_mostly = { diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index d4a0d0ac9935..a30e004682e2 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -39,9 +39,9 @@ #include <linux/module.h> #include <linux/kdebug.h> -#include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); static void __kprobes arch_copy_kprobe(struct kprobe *p); @@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index f3fb8174559e..a66d607f5b92 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/cpu.h> #include <linux/percpu.h> +#include <linux/poll.h> +#include <linux/thread_info.h> #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/kdebug.h> @@ -26,6 +28,7 @@ #include <asm/mce.h> #include <asm/uaccess.h> #include <asm/smp.h> +#include <asm/idle.h> #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 @@ -34,13 +37,17 @@ atomic_t mce_entry; static int mce_dont_init; -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ +/* + * Tolerant levels: + * 0: always panic on 
uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ static int tolerant = 1; static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long console_logged; -static int notify_user; +static unsigned long notify_user; static int rip_msr; static int mce_bootlog = 1; static atomic_t mce_events; @@ -48,6 +55,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -94,8 +103,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - if (!test_and_set_bit(0, &console_logged)) - notify_user = 1; + set_bit(0, &notify_user); } static void print_mce(struct mce *m) @@ -128,6 +136,7 @@ static void print_mce(struct mce *m) static void mce_panic(char *msg, struct mce *backup, unsigned long start) { int i; + oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { unsigned long tsc = mcelog.entry[i].tsc; @@ -139,10 +148,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) } if (backup) print_mce(backup); - if (tolerant >= 3) - printk("Fake panic: %s\n", msg); - else - panic(msg); + panic(msg); } static int mce_available(struct cpuinfo_x86 *c) @@ -167,17 +173,6 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } -static void do_mce_trigger(void) -{ - static atomic_t mce_logged; - int events = atomic_read(&mce_events); - if (events != atomic_read(&mce_logged) && trigger[0]) { - /* Small race window, but should be harmless. 
*/ - atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); - } -} - /* * The actual machine check handler */ @@ -185,11 +180,19 @@ static void do_mce_trigger(void) void do_machine_check(struct pt_regs * regs, long error_code) { struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; u64 mcestart = 0; int i; int panicm_found = 0; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; atomic_inc(&mce_entry); @@ -201,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code) memset(&m, 0, sizeof(struct mce)); m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; + no_way_out = 1; rdtscll(mcestart); barrier(); @@ -221,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) continue; if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. 
+ */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } } if (m.status & MCI_STATUS_MISCV) @@ -235,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); if (error_code >= 0) rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); if (error_code != -2) mce_log(&m); @@ -251,45 +262,59 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) { - /* Normal interrupt context here. Call trigger for any new - events. */ - do_mce_trigger(); + if (!regs) goto out; - } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) panicm = m; - if (nowayout) + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. + */ + if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { int user_space = 0; - if (m.mcgstatus & MCG_STATUS_RIPV) + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. 
*/ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * do_exit() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { do_exit(SIGBUS); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } } + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + out: - /* Last thing done in the machine check exception to clear state. */ + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -344,37 +369,69 @@ static void mcheck_timer(struct work_struct *work) on_each_cpu(mcheck_check_cpu, NULL, 1, 1); /* - * It's ok to read stale data here for notify_user and - * console_logged as we'll simply get the updated versions - * on the next mcheck_timer execution and atomic operations - * on console_logged act as synchronization for notify_user - * writes. + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. */ - if (notify_user && console_logged) { + if (mce_notify_user()) { + next_interval = max(next_interval/2, HZ/100); + } else { + next_interval = min(next_interval*2, + (int)round_jiffies_relative(check_interval*HZ)); + } + + schedule_delayed_work(&mcheck_work, next_interval); +} + +/* + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. 
+ */ +int mce_notify_user(void) +{ + clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, &notify_user)) { static unsigned long last_print; unsigned long now = jiffies; - /* if we logged an MCE, reduce the polling interval */ - next_interval = max(next_interval/2, HZ/100); - notify_user = 0; - clear_bit(0, &console_logged); + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); + if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; printk(KERN_INFO "Machine check events logged\n"); } - } else { - next_interval = min(next_interval*2, check_interval*HZ); + + return 1; } + return 0; +} - schedule_delayed_work(&mcheck_work, next_interval); +/* see if the idle task needs to notify userspace */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; } +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; static __init int periodic_mcheck_init(void) { next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); return 0; } __initcall(periodic_mcheck_init); @@ -465,6 +522,40 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) * Character device to read and clear the MCE log. */ +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? 
*/ + +static int mce_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; @@ -532,6 +623,14 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff return err ? -EFAULT : buf - ubuf; } +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -555,7 +654,10 @@ static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned } static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, .read = mce_read, + .poll = mce_poll, .ioctl = mce_ioctl, }; @@ -565,6 +667,20 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + /* * Old style boot options parsing. Only for compatibility. 
*/ @@ -620,7 +736,8 @@ static void mce_restart(void) on_each_cpu(mce_init, NULL, 1, 1); next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); } static struct sysdev_class mce_sysclass = { diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 03356e64f9c8..2f8a7f18b0fe 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; wrmsr(address, low, high); - setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); + setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61ae57eb9e4c..8bf0ca03ac8e 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -32,7 +32,6 @@ /* Have we found an MP table */ int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; /* * Various Linux-internal data structures created from the @@ -649,6 +648,20 @@ static int mp_find_ioapic(int gsi) return -1; } +static u8 uniq_ioapic_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} + void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { int idx = 0; @@ -656,14 +669,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) if (bad_ioapic(address)) return; - idx = nr_ioapics++; + idx = nr_ioapics; mp_ioapics[idx].mpc_type = MP_IOAPIC; 
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = id; + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); mp_ioapics[idx].mpc_apicver = 0; /* @@ -680,6 +693,8 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; } void __init diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index edbbc59b7523..cb8ee9d02f86 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -384,11 +384,14 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) return rc; } +static unsigned ignore_nmis; + asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); add_pda(__nmi_count,1); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } @@ -401,6 +404,18 @@ int do_nmi_callback(struct pt_regs * regs, int cpu) return 0; } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 5bd20b542c1e..ba16c968ca3f 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -1,7 +1,7 @@ /* * Derived from arch/powerpc/kernel/iommu.c * - * Copyright (C) IBM Corporation, 2006 + * Copyright IBM Corporation, 2006-2007 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> * * Author: Jon Mason <jdmason@kudzu.us> @@ -35,7 +35,7 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/delay.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> @@ -50,13 +50,7 @@ int use_calgary __read_mostly = 0; #endif 
/* CONFIG_CALGARY_DEFAULT_ENABLED */ #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_VENDOR_DEVICE_ID_CALGARY \ - (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) - -/* we need these for register space address calculation */ -#define START_ADDRESS 0xfe000000 -#define CHASSIS_BASE 0 -#define ONE_BASED_CHASSIS_NUM 1 +#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 /* register offsets inside the host bridge space */ #define CALGARY_CONFIG_REG 0x0108 @@ -80,6 +74,12 @@ int use_calgary __read_mostly = 0; #define PHB_MEM_2_SIZE_LOW 0x02E0 #define PHB_DOSHOLE_OFFSET 0x08E0 +/* CalIOC2 specific */ +#define PHB_SAVIOR_L2 0x0DB0 +#define PHB_PAGE_MIG_CTRL 0x0DA8 +#define PHB_PAGE_MIG_DEBUG 0x0DA0 +#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 + /* PHB_CONFIG_RW */ #define PHB_TCE_ENABLE 0x20000000 #define PHB_SLOT_DISABLE 0x1C000000 @@ -92,7 +92,11 @@ int use_calgary __read_mostly = 0; /* CSR (Channel/DMA Status Register) */ #define CSR_AGENT_MASK 0xffe0ffff /* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +/* PMCR/PMDR (Page Migration Control/Debug Registers */ +#define PMR_SOFTSTOP 0x80000000 +#define PMR_SOFTSTOPFAULT 0x40000000 +#define PMR_HARDSTOP 0x20000000 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? 
*/ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ @@ -155,9 +159,26 @@ struct calgary_bus_info { void __iomem *bbar; }; -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calgary_tce_cache_blast(struct iommu_table *tbl); +static void calgary_dump_error_regs(struct iommu_table *tbl); +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calioc2_tce_cache_blast(struct iommu_table *tbl); +static void calioc2_dump_error_regs(struct iommu_table *tbl); + +static struct cal_chipset_ops calgary_chip_ops = { + .handle_quirks = calgary_handle_quirks, + .tce_cache_blast = calgary_tce_cache_blast, + .dump_error_regs = calgary_dump_error_regs +}; -static void tce_cache_blast(struct iommu_table *tbl); +static struct cal_chipset_ops calioc2_chip_ops = { + .handle_quirks = calioc2_handle_quirks, + .tce_cache_blast = calioc2_tce_cache_blast, + .dump_error_regs = calioc2_dump_error_regs +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG @@ -187,6 +208,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, { return ~0UL; } + #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -206,11 +228,12 @@ static inline int translate_phb(struct pci_dev* dev) } static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) + unsigned long start_addr, unsigned int npages) { unsigned long index; unsigned long end; unsigned long badbit; + unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -222,6 +245,8 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = 
verify_bit_range(tbl->it_map, 0, index, end); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -231,23 +256,29 @@ static void iommu_range_reserve(struct iommu_table *tbl, } set_bit_string(tbl->it_map, index, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned int npages) { + unsigned long flags; unsigned long offset; BUG_ON(npages == 0); + spin_lock_irqsave(&tbl->it_lock, flags); + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, tbl->it_size, npages); if (offset == ~0UL) { - tce_cache_blast(tbl); + tbl->chip_ops->tce_cache_blast(tbl); offset = find_next_zero_string(tbl->it_map, 0, tbl->it_size, npages); if (offset == ~0UL) { printk(KERN_WARNING "Calgary: IOMMU full.\n"); + spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else @@ -259,17 +290,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, tbl->it_hint = offset + npages; BUG_ON(tbl->it_hint > tbl->it_size); + spin_unlock_irqrestore(&tbl->it_lock, flags); + return offset; } static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { - unsigned long entry, flags; + unsigned long entry; dma_addr_t ret = bad_dma_address; - spin_lock_irqsave(&tbl->it_lock, flags); - entry = iommu_range_alloc(tbl, npages); if (unlikely(entry == bad_dma_address)) @@ -282,23 +313,21 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - spin_unlock_irqrestore(&tbl->it_lock, flags); - return ret; error: - spin_unlock_irqrestore(&tbl->it_lock, flags); printk(KERN_WARNING "Calgary: failed to allocate %u pages in " "iommu %p\n", npages, tbl); return bad_dma_address; } -static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { 
unsigned long entry; unsigned long badbit; unsigned long badend; + unsigned long flags; /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); @@ -315,6 +344,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -324,23 +355,40 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static inline struct iommu_table *find_iommu_table(struct device *dev) { - unsigned long flags; + struct pci_dev *pdev; + struct pci_bus *pbus; + struct iommu_table *tbl; - spin_lock_irqsave(&tbl->it_lock, flags); + pdev = to_pci_dev(dev); - __iommu_free(tbl, dma_addr, npages); + /* is the device behind a bridge? 
*/ + if (unlikely(pdev->bus->parent)) + pbus = pdev->bus->parent; + else + pbus = pdev->bus; - spin_unlock_irqrestore(&tbl->it_lock, flags); + tbl = pci_iommu(pbus); + + BUG_ON(pdev->bus->parent && + (tbl->it_busno != pdev->bus->parent->number)); + + return tbl; } -static void __calgary_unmap_sg(struct iommu_table *tbl, +static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int direction) { + struct iommu_table *tbl = find_iommu_table(dev); + + if (!translate_phb(to_pci_dev(dev))) + return; + while (nelems--) { unsigned int npages; dma_addr_t dma = sglist->dma_address; @@ -350,33 +398,17 @@ static void __calgary_unmap_sg(struct iommu_table *tbl, break; npages = num_dma_pages(dma, dmalen); - __iommu_free(tbl, dma, npages); + iommu_free(tbl, dma, npages); sglist++; } } -void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int direction) -{ - unsigned long flags; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - - if (!translate_phb(to_pci_dev(dev))) - return; - - spin_lock_irqsave(&tbl->it_lock, flags); - - __calgary_unmap_sg(tbl, sglist, nelems, direction); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - static int calgary_nontranslate_map_sg(struct device* dev, struct scatterlist *sg, int nelems, int direction) { int i; - for (i = 0; i < nelems; i++ ) { + for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); @@ -385,11 +417,10 @@ static int calgary_nontranslate_map_sg(struct device* dev, return nelems; } -int calgary_map_sg(struct device *dev, struct scatterlist *sg, +static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - unsigned long flags; + struct iommu_table *tbl = find_iommu_table(dev); unsigned long vaddr; unsigned int npages; unsigned long entry; @@ -398,8 +429,6 @@ int 
calgary_map_sg(struct device *dev, struct scatterlist *sg, if (!translate_phb(to_pci_dev(dev))) return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - spin_lock_irqsave(&tbl->it_lock, flags); - for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); @@ -423,26 +452,23 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } - spin_unlock_irqrestore(&tbl->it_lock, flags); - return nelems; error: - __calgary_unmap_sg(tbl, sg, nelems, direction); + calgary_unmap_sg(dev, sg, nelems, direction); for (i = 0; i < nelems; i++) { sg[i].dma_address = bad_dma_address; sg[i].dma_length = 0; } - spin_unlock_irqrestore(&tbl->it_lock, flags); return 0; } -dma_addr_t calgary_map_single(struct device *dev, void *vaddr, +static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, size_t size, int direction) { dma_addr_t dma_handle = bad_dma_address; unsigned long uaddr; unsigned int npages; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); @@ -455,10 +481,10 @@ dma_addr_t calgary_map_single(struct device *dev, void *vaddr, return dma_handle; } -void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, +static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; if (!translate_phb(to_pci_dev(dev))) @@ -468,15 +494,13 @@ void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, iommu_free(tbl, dma_handle, npages); } -void* calgary_alloc_coherent(struct device *dev, size_t size, +static void* calgary_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret = NULL; dma_addr_t mapping; unsigned int npages, order; - struct iommu_table 
*tbl; - - tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); size = PAGE_ALIGN(size); /* size rounded up to full pages */ npages = size >> PAGE_SHIFT; @@ -552,7 +576,22 @@ static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) return (void __iomem*)target; } -static void tce_cache_blast(struct iommu_table *tbl) +static inline int is_calioc2(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALIOC2); +} + +static inline int is_calgary(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALGARY); +} + +static inline int is_cal_pci_dev(unsigned short device) +{ + return (is_calgary(device) || is_calioc2(device)); +} + +static void calgary_tce_cache_blast(struct iommu_table *tbl) { u64 val; u32 aer; @@ -589,6 +628,85 @@ static void tce_cache_blast(struct iommu_table *tbl) (void)readl(target); /* flush */ } +static void calioc2_tce_cache_blast(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u64 val64; + u32 val; + int i = 0; + int count = 1; + unsigned char bus = tbl->it_busno; + +begin: + printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " + "sequence - count %d\n", bus, count); + + /* 1. using the Page Migration Control reg set SoftStop */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); + val |= PMR_SOFTSTOP; + printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + + /* 2. poll split queues until all DMA activity is done */ + printk(KERN_DEBUG "2a. starting to poll split queues\n"); + target = calgary_reg(bbar, split_queue_offset(bus)); + do { + val64 = readq(target); + i++; + } while ((val64 & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " + "continuing anyway\n"); + + /* 3. 
poll Page Migration DEBUG for SoftStopFault */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); + + /* 4. if SoftStopFault - goto (1) */ + if (val & PMR_SOFTSTOPFAULT) { + if (++count < 100) + goto begin; + else { + printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " + "aborting TCE cache flush sequence!\n"); + return; /* pray for the best */ + } + } + + /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); + + /* 6. invalidate TCE cache */ + printk(KERN_DEBUG "6. invalidating TCE cache\n"); + target = calgary_reg(bbar, tar_offset(bus)); + writeq(tbl->tar_val, target); + + /* 7. Re-read PMCR */ + printk(KERN_DEBUG "7a. Re-reading PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); + + /* 8. Remove HardStop */ + printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = 0; + printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "8c. 
read 0x%x [LE] from %p\n", val, target); +} + static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, u64 limit) { @@ -598,7 +716,7 @@ static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, limit++; numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(dev->sysdata, start, numpages); + iommu_range_reserve(pci_iommu(dev->bus), start, numpages); } static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) @@ -606,7 +724,7 @@ static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) void __iomem *target; u64 low, high, sizelow; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -630,7 +748,7 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) u32 val32; u64 low, high, sizelow, sizehigh; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -666,14 +784,20 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) { unsigned int npages; u64 start; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + /* for CalIOC2 - avoid the entire first MB */ + if (is_calgary(dev->device)) { + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + } else { /* calioc2 */ + start = 0; + npages = (1 * 1024 * 1024) >> PAGE_SHIFT; + } iommu_range_reserve(tbl, start, npages); /* reserve the two PCI peripheral memory regions in IO space */ @@ -694,10 +818,17 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void 
__iomem *bbar) if (ret) return ret; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; tce_free(tbl, 0, tbl->it_size); + if (is_calgary(dev->device)) + tbl->chip_ops = &calgary_chip_ops; + else if (is_calioc2(dev->device)) + tbl->chip_ops = &calioc2_chip_ops; + else + BUG(); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -706,15 +837,15 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) /* zero out all TAR bits under sw control */ val64 &= ~TAR_SW_BITS; - - tbl = dev->sysdata; table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); val64 |= (u64) specified_table_size; tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); readq(target); /* flush */ @@ -724,7 +855,7 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *target; unsigned int bitmapsz; @@ -739,16 +870,81 @@ static void __init calgary_free_bus(struct pci_dev *dev) tbl->it_map = NULL; kfree(tbl); - dev->sysdata = NULL; + + set_pci_iommu(dev->bus, NULL); /* Can't free bootmem allocated memory after system is up :-( */ bus_info[dev->bus->number].tce_space = NULL; } +static void calgary_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 csr, plssr; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " + "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); +} + +static void 
calioc2_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + u32 csr, csmr, plssr, mck, rcstat; + void __iomem *target; + unsigned long phboff = phb_offset(tbl->it_busno); + unsigned long erroff; + u32 errregs[7]; + int i; + + /* dump CSR */ + target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + /* dump PLSSR */ + target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + /* dump CSMR */ + target = calgary_reg(bbar, phboff | 0x290); + csmr = be32_to_cpu(readl(target)); + /* dump mck */ + target = calgary_reg(bbar, phboff | 0x800); + mck = be32_to_cpu(readl(target)); + + printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", + tbl->it_busno); + + printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); + + /* dump rest of error regs */ + printk(KERN_EMERG "Calgary: "); + for (i = 0; i < ARRAY_SIZE(errregs); i++) { + /* err regs are at 0x810 - 0x870 */ + erroff = (0x810 + (i * 0x10)); + target = calgary_reg(bbar, phboff | erroff); + errregs[i] = be32_to_cpu(readl(target)); + printk("0x%08x@0x%lx ", errregs[i], erroff); + } + printk("\n"); + + /* root complex status */ + target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); + rcstat = be32_to_cpu(readl(target)); + printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, + PHB_ROOT_COMPLEX_STATUS); +} + static void calgary_watchdog(unsigned long data) { struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -758,13 +954,14 @@ static void calgary_watchdog(unsigned long data) /* If no error, the agent ID in the CSR is not valid */ if (val32 & CSR_AGENT_MASK) { - printk(KERN_EMERG "calgary_watchdog: DMA error on PHB %#x, " - "CSR = %#x\n", dev->bus->number, val32); + tbl->chip_ops->dump_error_regs(tbl); + + /* reset 
error */ writel(0, target); /* Disable bus that caused the error */ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); + PHB_CONFIG_RW_OFFSET); val32 = be32_to_cpu(readl(target)); val32 |= PHB_SLOT_DISABLE; writel(cpu_to_be32(val32), target); @@ -775,8 +972,8 @@ static void calgary_watchdog(unsigned long data) } } -static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, - unsigned char busnum) +static void __init calgary_set_split_completion_timeout(void __iomem *bbar, + unsigned char busnum, unsigned long timeout) { u64 val64; void __iomem *target; @@ -802,11 +999,40 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, /* zero out this PHB's timer bits */ mask = ~(0xFUL << phb_shift); val64 &= mask; - val64 |= (CCR_2SEC_TIMEOUT << phb_shift); + val64 |= (timeout << phb_shift); writeq(cpu_to_be64(val64), target); readq(target); /* flush */ } +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 val; + + /* + * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 + */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); + val = cpu_to_be32(readl(target)); + val |= 0x00800000; + writel(cpu_to_be32(val), target); +} + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + + /* + * Give split completion a longer timeout on bus 1 for aic94xx + * http://bugzilla.kernel.org/show_bug.cgi?id=7180 + */ + if (is_calgary(dev->device) && (busnum == 1)) + calgary_set_split_completion_timeout(tbl->bbar, busnum, + CCR_2SEC_TIMEOUT); +} + static void __init calgary_enable_translation(struct pci_dev *dev) { u32 val32; @@ -816,7 +1042,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = 
dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* enable TCE in PHB Config Register */ @@ -824,20 +1050,15 @@ static void __init calgary_enable_translation(struct pci_dev *dev) val32 = be32_to_cpu(readl(target)); val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - printk(KERN_INFO "Calgary: enabling translation on PHB %#x\n", busnum); + printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", + (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? + "Calgary" : "CalIOC2", busnum); printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " "bus.\n"); writel(cpu_to_be32(val32), target); readl(target); /* flush */ - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (busnum == 1) - calgary_increase_split_completion_timeout(bbar, busnum); - init_timer(&tbl->watchdog_timer); tbl->watchdog_timer.function = &calgary_watchdog; tbl->watchdog_timer.data = (unsigned long)dev; @@ -853,7 +1074,7 @@ static void __init calgary_disable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* disable TCE in PHB Config Register */ @@ -871,13 +1092,19 @@ static void __init calgary_disable_translation(struct pci_dev *dev) static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); - dev->sysdata = NULL; - dev->bus->self = dev; + set_pci_iommu(dev->bus, NULL); + + /* is the device behind a bridge? 
*/ + if (dev->bus->parent) + dev->bus->parent->self = dev; + else + dev->bus->self = dev; } static int __init calgary_init_one(struct pci_dev *dev) { void __iomem *bbar; + struct iommu_table *tbl; int ret; BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); @@ -888,7 +1115,18 @@ static int __init calgary_init_one(struct pci_dev *dev) goto done; pci_dev_get(dev); - dev->bus->self = dev; + + if (dev->bus->parent) { + if (dev->bus->parent->self) + printk(KERN_WARNING "Calgary: IEEEE, dev %p has " + "bus->parent->self!\n", dev); + dev->bus->parent->self = dev; + } else + dev->bus->self = dev; + + tbl = pci_iommu(dev->bus); + tbl->chip_ops->handle_quirks(tbl, dev); + calgary_enable_translation(dev); return 0; @@ -924,11 +1162,18 @@ static int __init calgary_locate_bbars(void) target = calgary_reg(bbar, offset); val = be32_to_cpu(readl(target)); + start_bus = (u8)((val & 0x00FF0000) >> 16); end_bus = (u8)((val & 0x0000FF00) >> 8); - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; + + if (end_bus) { + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } else { + bus_info[start_bus].bbar = bbar; + bus_info[start_bus].phbid = phb; } } } @@ -948,22 +1193,24 @@ static int __init calgary_init(void) { int ret; struct pci_dev *dev = NULL; + void *tce_space; ret = calgary_locate_bbars(); if (ret) return ret; do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space && !translate_empty_slots) continue; ret = calgary_init_one(dev); @@ -976,10 +1223,11 @@ static int __init calgary_init(void) error: do { dev = 
pci_get_device_reverse(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { pci_dev_put(dev); continue; @@ -1057,9 +1305,29 @@ static int __init build_detail_arrays(void) return 0; } -void __init detect_calgary(void) +static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) { + int dev; u32 val; + + if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { + /* + * FIXME: properly scan for devices accross the + * PCI-to-PCI bridge on every CalIOC2 port. + */ + return 1; + } + + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff) + break; + } + return (val != 0xffffffff); +} + +void __init detect_calgary(void) +{ int bus; void *tbl; int calgary_found = 0; @@ -1116,29 +1384,26 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - int dev; struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; - if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + if (!is_cal_pci_dev(pci_device)) continue; if (info->translation_disabled) continue; - /* - * Scan the slots of the PCI bus to see if there is a device present. - * The parent bus will be the zero-ith device, so start at 1. 
- */ - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - calgary_found = 1; - break; - } + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + calgary_found = 1; } } @@ -1249,3 +1514,66 @@ static int __init calgary_parse_options(char *p) return 1; } __setup("calgary=", calgary_parse_options); + +static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) +{ + struct iommu_table *tbl; + unsigned int npages; + int i; + + tbl = pci_iommu(dev->bus); + + for (i = 0; i < 4; i++) { + struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; + + /* Don't give out TCEs that map MEM resources */ + if (!(r->flags & IORESOURCE_MEM)) + continue; + + /* 0-based? we reserve the whole 1st MB anyway */ + if (!r->start) + continue; + + /* cover the whole region */ + npages = (r->end - r->start) >> PAGE_SHIFT; + npages++; + + iommu_range_reserve(tbl, r->start, npages); + } +} + +static int __init calgary_fixup_tce_spaces(void) +{ + struct pci_dev *dev = NULL; + void *tce_space; + + if (no_iommu || swiotlb || !calgary_detected) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) + continue; + + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space) + continue; + + calgary_fixup_one_tce_space(dev); + + } while (1); + + return 0; +} + +/* + * We need to be call after pcibios_assign_resources (fs_initcall level) + * and before device_initcall. 
+ */ +rootfs_initcall(calgary_fixup_tce_spaces); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 90f6315d02d4..05d745ede561 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -8,7 +8,7 @@ #include <linux/pci.h> #include <linux/module.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> int iommu_merge __read_mostly = 0; @@ -321,6 +321,11 @@ static int __init pci_iommu_init(void) return 0; } +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); +} + #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index ae091cdc1a4d..4918c575d582 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -28,6 +28,7 @@ #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -235,7 +236,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, } /* Map a single area into the IOMMU */ -dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; @@ -253,7 +254,7 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) /* * Free a DMA mapping. */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, +static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { unsigned long iommu_page; @@ -275,7 +276,7 @@ void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. 
*/ -void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { int i; @@ -571,6 +572,26 @@ static const struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; +void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + if (no_agp && (dma_ops != &gart_dma_ops)) + return; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); + + ctl &= ~1; + + pci_write_config_dword(dev, 0x90, ctl); + } +} + void __init gart_iommu_init(void) { struct agp_kern_info info; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 6dade0c867cc..2a34c6c025a9 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -6,7 +6,7 @@ #include <linux/string.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> @@ -34,7 +34,7 @@ nommu_map_single(struct device *hwdev, void *ptr, size_t size, return bus; } -void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, +static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, int direction) { } @@ -54,7 +54,7 @@ void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, * Device ownership issues as mentioned above for pci_map_single are * the same here. */ -int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, +static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) { int i; @@ -74,7 +74,7 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. 
*/ -void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, +static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { } diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 4b4569abc60c..b2f405ea7c85 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 5909039f37aa..e7ac629d4c46 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -207,6 +207,7 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + check_pgt_cache(); rmb(); idle = pm_idle; if (!idle) @@ -278,7 +279,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ if (!pm_idle) { if (!printed) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); printed = 1; } pm_idle = mwait_idle; @@ -305,6 +306,7 @@ early_param("idle", idle_setup); void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex,gsindex; unsigned int ds,cs,es; @@ -340,15 +342,24 @@ void __show_regs(struct pt_regs * regs) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - asm("movq %%cr0, %0": "=r" (cr0)); - asm("movq %%cr2, %0": "=r" (cr2)); - asm("movq %%cr3, %0": "=r" (cr3)); - asm("movq %%cr4, %0": "=r" (cr4)); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs,fsindex,gs,gsindex,shadowgs); printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + + get_debugreg(d0, 0); + 
get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 7503068e788d..368db2b9c5ac 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -16,6 +16,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/apic.h> +#include <asm/iommu.h> /* * Power off function, if any @@ -81,6 +82,7 @@ static inline void kb_wait(void) void machine_shutdown(void) { unsigned long flags; + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; @@ -111,6 +113,8 @@ void machine_shutdown(void) disable_IO_APIC(); local_irq_restore(flags); + + pci_iommu_shutdown(); } void machine_emergency_restart(void) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 33ef718f8cb5..af838f6b0b7f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -575,6 +575,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 0x10) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) @@ -600,8 +602,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - /* Fix cpuid4 emulation for more */ - num_cache_leaves = 3; + if (c->extended_cpuid_level >= 0x80000006 && + (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + + if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + set_bit(X86_FEATURE_K8, &c->x86_capability); /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 290f5d8037cd..739175b01e06 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -26,6 +26,7 @@ #include <asm/i387.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/mce.h> /* #define DEBUG_SIG 1 */ @@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) clear_thread_flag(TIF_SINGLESTEP); } +#ifdef CONFIG_X86_MCE + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); @@ -480,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (exception_trace) + if (show_unhandled_signals && printk_ratelimit()) printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 0694940b2e73..673a300b5944 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -241,7 +241,7 @@ void flush_tlb_mm (struct mm_struct * mm) } if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - + check_pgt_cache(); preempt_enable(); } EXPORT_SYMBOL(flush_tlb_mm); @@ -386,9 +386,9 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, return 0; } - spin_lock_bh(&call_lock); + spin_lock(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); + spin_unlock(&call_lock); put_cpu(); return 0; } diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 6a5a98f2a75c..ea83a9f91965 100644 --- a/arch/x86_64/kernel/suspend.c +++ 
b/arch/x86_64/kernel/suspend.c @@ -55,11 +55,11 @@ void __save_processor_state(struct saved_context *ctxt) * control registers */ rdmsrl(MSR_EFER, ctxt->efer); - asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); - asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); } void save_processor_state(void) @@ -81,11 +81,11 @@ void __restore_processor_state(struct saved_context *ctxt) * control registers */ wrmsrl(MSR_EFER, ctxt->efer); - asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); - asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); /* * now restore the descriptor tables to their proper values diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index f61fb8e4f129..3aeae2fa2e24 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -136,9 +136,9 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) struct iommu_table *tbl; int ret; - if (dev->sysdata) { - printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", - dev, dev->sysdata); + if (pci_iommu(dev->bus)) { + printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", + dev, pci_iommu(dev->bus)); BUG(); } @@ -155,11 +155,7 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) tbl->bbar = bbar; - /* - * NUMA is already using the bus's sysdata pointer, so we use - * the bus's pci_dev's sysdata instead. 
- */ - dev->sysdata = tbl; + set_pci_iommu(dev->bus, tbl); return 0; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 4a0895bacf51..6d48a4e826d9 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -33,6 +33,7 @@ #include <acpi/acpi_bus.h> #endif #include <asm/8253pit.h> +#include <asm/i8253.h> #include <asm/pgtable.h> #include <asm/vsyscall.h> #include <asm/timex.h> @@ -44,12 +45,14 @@ #include <asm/hpet.h> #include <asm/mpspec.h> #include <asm/nmi.h> +#include <asm/vgtod.h> static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -79,8 +82,9 @@ EXPORT_SYMBOL(profile_pc); * sheet for details. */ -static void set_rtc_mmss(unsigned long nowtime) +static int set_rtc_mmss(unsigned long nowtime) { + int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char control, freq_select; @@ -120,6 +124,7 @@ static void set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); + retval = -1; } else { BIN_TO_BCD(real_seconds); BIN_TO_BCD(real_minutes); @@ -139,12 +144,17 @@ static void set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(freq_select, RTC_FREQ_SELECT); spin_unlock(&rtc_lock); + + return retval; } +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} void main_timer_handler(void) { - static unsigned long rtc_update = 0; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -172,20 +182,6 @@ void main_timer_handler(void) if (!using_apic_timer) smp_local_timer_interrupt(); -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. 
set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); } @@ -199,7 +195,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -226,7 +222,7 @@ static unsigned long get_cmos_time(void) /* * We know that x86-64 always uses BCD format, no need to check the * config register. - */ + */ BCD_TO_BIN(sec); BCD_TO_BIN(min); @@ -239,11 +235,11 @@ static unsigned long get_cmos_time(void) BCD_TO_BIN(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { + } else { /* * x86-64 systems only exists since 2002. 
* This will work up to Dec 31, 2100 - */ + */ year += 2000; } @@ -255,45 +251,45 @@ static unsigned long get_cmos_time(void) #define TICK_COUNT 100000000 static unsigned int __init tsc_calibrate_cpu_khz(void) { - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start meauring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now 
= get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); } /* @@ -321,7 +317,7 @@ static unsigned int __init pit_calibrate_tsc(void) end = get_cycles_sync(); spin_unlock_irqrestore(&i8253_lock, flags); - + return (end - start) / 50; } @@ -366,25 +362,20 @@ static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL, .mask = CPU_MASK_NONE, - .name = "timer" + .name = "timer" }; void __init time_init(void) { if (nohpet) hpet_address = 0; - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); if (hpet_arch_init()) hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; + tick_nsec = TICK_NSEC_HPET; tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { @@ -415,54 +406,21 @@ void __init time_init(void) setup_irq(0, &irq0); } - -static long clock_cmos_diff; -static unsigned long sleep_start; - /* * sysfs support for the timer. 
*/ static int timer_suspend(struct sys_device *dev, pm_message_t state) { - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; return 0; } static int timer_resume(struct sys_device *dev) { - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } if (hpet_address) hpet_reenable(); else i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); - touch_softlockup_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 8713ad4a4db1..03888420775d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -584,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; - if (exception_trace && unhandled_signal(tsk, signr)) + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, str, @@ -688,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, diff 
--git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index e850aa01e1b3..9b76b03d0600 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -61,25 +61,9 @@ inline int check_tsc_unstable(void) * first tick after the change will be slightly wrong. */ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long tsc_khz_ref = 0; +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -125,10 +109,8 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); return 0; } @@ -153,17 +135,18 @@ __cpuinit int unsynchronized_tsc(void) #endif /* Most intel systems have synchronized TSCs except for multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + if (acpi_gbl_FADT.header.length > 0 && + acpi_gbl_FADT.C3latency < 1000) return 1; #endif - return 0; + return 0; } - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; + /* Assume multi socket systems are 
not synchronized */ + return num_present_cpus() > 1; } int __init notsc_setup(char *s) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5c57ea4591c1..ba8ea97abd21 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -28,7 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ - *(.bootstrap.text) + *(.text.head) _stext = .; /* Then the rest */ TEXT_TEXT @@ -54,6 +54,13 @@ SECTIONS RODATA + . = ALIGN(4); + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; + *(.tracedata) + __tracedata_end = .; + } + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -93,6 +100,9 @@ SECTIONS .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) + { *(.vsyscall_clock) } + vsyscall_clock = VVIRT(.vsyscall_clock); .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) @@ -133,20 +143,11 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); __smp_alt_begin = .; - __smp_alt_instructions = .; - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - *(.smp_altinstructions) - } - __smp_alt_instructions_end = .; - . = ALIGN(8); __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { *(.smp_locks) } __smp_locks_end = .; - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - } . = ALIGN(4096); __smp_alt_end = .; @@ -189,6 +190,12 @@ SECTIONS .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } +/* vdso blob that is mapped into user space */ + vdso_start = . ; + .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } + . 
= ALIGN(4096); + vdso_end = .; + #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); __initramfs_start = .; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 57660d58d500..06c34949bfdc 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -42,6 +42,7 @@ #include <asm/segment.h> #include <asm/desc.h> #include <asm/topology.h> +#include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" @@ -57,26 +58,9 @@ * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) * Try to keep this structure as small as possible to avoid cache line ping pongs */ -struct vsyscall_gtod_data_t { - seqlock_t lock; - - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - int sysctl_enabled; - struct timezone sys_tz; - struct { /* extract of a clocksource struct */ - cycle_t (*vread)(void); - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; -}; int __vgetcpu_mode __section_vgetcpu_mode; -struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = +struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, @@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 84f11728fc76..327c9f2fa626 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -159,7 +159,7 @@ void dump_pagetable(unsigned long address) pmd_t *pmd; pte_t *pte; - asm("movq %%cr3,%0" : "=r" (pgd)); + 
pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); @@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) return 0; } -int unhandled_signal(struct task_struct *tsk, int sig) -{ - if (is_init(tsk)) - return 1; - if (tsk->ptrace & PT_PTRACED) - return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); -} - static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { @@ -301,8 +291,8 @@ static int vmalloc_fault(unsigned long address) return 0; } -int page_fault_trace = 0; -int exception_trace = 1; +static int page_fault_trace; +int show_unhandled_signals = 1; /* * This routine handles page faults. It determines the address, @@ -326,7 +316,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, prefetchw(&mm->mmap_sem); /* get the address */ - __asm__("movq %%cr2,%0":"=r" (address)); + address = read_cr2(); info.si_code = SEGV_MAPERR; @@ -494,7 +484,8 @@ bad_area_nosemaphore: (address >> 32)) return; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk( "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", tsk->pid > 1 ? 
KERN_INFO : KERN_EMERG, @@ -568,7 +559,7 @@ out_of_memory: } printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) - do_exit(SIGKILL); + do_group_exit(SIGKILL); goto no_context; do_sigbus: diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 9a0e98accf04..38f5d6368006 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -383,7 +383,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) } if (!after_bootmem) - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + mmu_cr4_features = read_cr4(); __flush_tlb_all(); } @@ -600,16 +600,6 @@ void mark_rodata_ro(void) { unsigned long start = (unsigned long)_stext, end; -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - end = (unsigned long)__end_rodata; start = (start + PAGE_SIZE - 1) & PAGE_MASK; end &= PAGE_MASK; @@ -697,41 +687,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -#ifdef CONFIG_SYSCTL -#include <linux/sysctl.h> - -extern int exception_trace, page_fault_trace; - -static ctl_table debug_table2[] = { - { - .ctl_name = 99, - .procname = "exception-trace", - .data = &exception_trace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table debug_root_table2[] = { - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .mode = 0555, - .child = debug_table2 - }, - {} -}; - -static __init int x8664_sysctl_init(void) -{ - register_sysctl_table(debug_root_table2); - return 0; -} -__initcall(x8664_sysctl_init); -#endif - /* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. 
*/ @@ -769,8 +724,17 @@ int in_gate_area_no_task(unsigned long addr) return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } -void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); } + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + if (vma == &gate_vma) + return "[vsyscall]"; + return NULL; +} diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index f983c75825d0..a96006f7ae0c 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -44,12 +44,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, nb; + int nodeid, i, j, nb; unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; - unsigned dualcore = 0; + unsigned num_cores; if (!early_pci_allowed()) return -1; @@ -60,6 +60,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) @@ -73,8 +76,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) unsigned long base,limit; u32 nodeid; - /* Undefined before E stepping, but hopefully 0 */ - dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -170,8 +171,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - apicid_to_node[nodeid << dualcore] = i; - 
apicid_to_node[(nodeid << dualcore) + dualcore] = i; + for (j = 0; j < num_cores; j++) + apicid_to_node[(nodeid * num_cores) + j] = i; setup_node_bootmem(i, nodes[i].start, nodes[i].end); } } diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 51548947ad3b..6da235522269 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -273,9 +273,6 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -#define E820_ADDR_HOLE_SIZE(start, end) \ - (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ - PAGE_SHIFT) char *cmdline __initdata; /* @@ -319,7 +316,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, return -1; if (num_nodes > MAX_NUMNODES) num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / num_nodes; /* * Calculate the number of big nodes that can be allocated as a result @@ -347,7 +344,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, if (i == num_nodes + node_start - 1) end = max_addr; else - while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < + while (end - *addr - e820_hole_size(*addr, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { @@ -476,18 +473,22 @@ out: /* * We need to vacate all active ranges that may have been registered by - * SRAT. + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns + * true. NUMA emulation has succeeded so we will not scan ACPI nodes. 
*/ remove_all_active_ranges(); +#ifdef CONFIG_ACPI_NUMA + acpi_numa = -1; +#endif for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } + acpi_fake_nodes(nodes, num_nodes); numa_init_array(); return 0; } -#undef E820_ADDR_HOLE_SIZE #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 9148f4a4cec6..7e161c698af4 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -13,7 +13,7 @@ #include <asm/tlbflush.h> #include <asm/io.h> -static inline pte_t *lookup_address(unsigned long address) +pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; @@ -74,14 +74,12 @@ static void flush_kernel_map(void *arg) struct page *pg; /* When clflush is available always use it because it is - much cheaper than WBINVD. Disable clflush for now because - the high level code is not ready yet */ - if (1 || !cpu_has_clflush) + much cheaper than WBINVD. 
*/ + if (!cpu_has_clflush) asm volatile("wbinvd" ::: "memory"); else list_for_each_entry(pg, l, lru) { void *adr = page_address(pg); - if (cpu_has_clflush) - cache_flush_page(adr); + cache_flush_page(adr); } __flush_tlb_all(); } @@ -95,7 +93,8 @@ static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ static inline void save_page(struct page *fpage) { - list_add(&fpage->lru, &deferred_pages); + if (!test_and_set_bit(PG_arch_1, &fpage->flags)) + list_add(&fpage->lru, &deferred_pages); } /* @@ -129,9 +128,12 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, pte_t *kpte; struct page *kpte_page; pgprot_t ref_prot2; + kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); if (pgprot_val(prot) != pgprot_val(ref_prot)) { if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); @@ -159,10 +161,9 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - if (page_private(kpte_page) == 0) { - save_page(kpte_page); + save_page(kpte_page); + if (page_private(kpte_page) == 0) revert_page(address, ref_prot); - } return 0; } @@ -234,6 +235,10 @@ void global_flush_tlb(void) flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (page_private(pg) != 0) + continue; ClearPagePrivate(pg); __free_page(pg); } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 1e76bb0a7277..acdf03e19146 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi_table_slit *slit) for (j = 0; j < d; j++) { u8 val = slit->entry[d*i + j]; if (i == j) { - if (val != 10) + if (val != LOCAL_DISTANCE) return 0; - } else if (val <= 10) + } else if (val <= LOCAL_DISTANCE) return 0; 
} } @@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) /* Sanity check to catch more bad SRATs (they are amazingly common). Make sure the PXMs cover all memory. */ -static int nodes_cover_memory(void) +static int __init nodes_cover_memory(const struct bootnode *nodes) { int i; unsigned long pxmram, e820ram; @@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { int i; + if (acpi_numa <= 0) + return -1; + /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); @@ -403,10 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } } - if (acpi_numa <= 0) - return -1; - - if (!nodes_cover_memory()) { + if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; } @@ -440,6 +440,86 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return 0; } +#ifdef CONFIG_NUMA_EMU +static int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for_each_node_mask(i, nodes_parsed) { + /* + * Find the real node that this emulated node appears on. For + * the sake of simplicity, we only use a real node's starting + * address to determine which emulated node it appears on. + */ + if (addr >= nodes[i].start && addr < nodes[i].end) { + ret = i; + break; + } + } + return i; +} + +/* + * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID + * mappings that respect the real ACPI topology but reflect our emulated + * environment. For each emulated node, we find which real node it appears on + * and create PXM to NID mappings for those fake nodes which mirror that + * locality. SLIT will now represent the correct distances between emulated + * nodes as a result of the real topology. + */ +void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) +{ + int i, j; + int fake_node_to_pxm_map[MAX_NUMNODES] = { + [0 ... 
MAX_NUMNODES-1] = PXM_INVAL + }; + unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE + }; + + printk(KERN_INFO "Faking PXM affinity for fake nodes on real " + "topology.\n"); + for (i = 0; i < num_nodes; i++) { + int nid, pxm; + + nid = find_node_by_addr(fake_nodes[i].start); + if (nid == NUMA_NO_NODE) + continue; + pxm = node_to_pxm(nid); + if (pxm == PXM_INVAL) + continue; + fake_node_to_pxm_map[i] = pxm; + /* + * For each apicid_to_node mapping that exists for this real + * node, it must now point to the fake node ID. + */ + for (j = 0; j < MAX_LOCAL_APIC; j++) + if (apicid_to_node[j] == nid) + fake_apicid_to_node[j] = i; + } + for (i = 0; i < num_nodes; i++) + __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); + memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); + + nodes_clear(nodes_parsed); + for (i = 0; i < num_nodes; i++) + if (fake_nodes[i].start != fake_nodes[i].end) + node_set(i, nodes_parsed); + WARN_ON(!nodes_cover_memory(fake_nodes)); +} + +static int null_slit_node_compare(int a, int b) +{ + return node_to_pxm(a) == node_to_pxm(b); +} +#else +static int null_slit_node_compare(int a, int b) +{ + return a == b; +} +#endif /* CONFIG_NUMA_EMU */ + void __init srat_reserve_add_area(int nodeid) { if (found_add_area && nodes_add[nodeid].end) { @@ -464,7 +544,8 @@ int __node_distance(int a, int b) int index; if (!acpi_slit) - return a == b ? 10 : 20; + return null_slit_node_compare(a, b) ? 
LOCAL_DISTANCE : + REMOTE_DISTANCE; index = acpi_slit->locality_count * node_to_pxm(a); return acpi_slit->entry[index + node_to_pxm(b)]; } diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 3acf60ded2a0..9cc813e29706 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -59,6 +59,8 @@ fill_mp_bus_to_cpumask(void) j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) { struct pci_bus *bus; + struct pci_sysdata *sd; + long node = NODE_ID(nid); /* Algorithm a bit dumb, but it shouldn't matter here */ @@ -67,7 +69,9 @@ fill_mp_bus_to_cpumask(void) continue; if (!node_online(node)) node = 0; - bus->sysdata = (void *)node; + + sd = bus->sysdata; + sd->node = node; } } } diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile new file mode 100644 index 000000000000..faaa72fb250c --- /dev/null +++ b/arch/x86_64/vdso/Makefile @@ -0,0 +1,49 @@ +# +# x86-64 vDSO. +# + +# files to link into the vdso +# vdso-start.o has to be first +vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o + +# files to link into kernel +obj-y := vma.o vdso.o vdso-syms.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o + +# The DSO images are built using a special linker script. 
+quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 +SYSCFLAGS_vdso.so = $(vdso-flags) + +$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so + +$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) + +CF := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 + +$(obj)/vclock_gettime.o: CFLAGS = $(CF) +$(obj)/vgetcpu.o: CFLAGS = $(CF) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. +extra-y += vdso-syms.o +$(obj)/built-in.o: $(obj)/vdso-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o + +SYSCFLAGS_vdso-syms.o = -r -d +$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c new file mode 100644 index 000000000000..17f6a00de712 --- /dev/null +++ b/arch/x86_64/vdso/vclock_gettime.c @@ -0,0 +1,120 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime and gettimeofday. + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + * Also alternative() doesn't work. 
+ */ + +#include <linux/kernel.h> +#include <linux/posix-timers.h> +#include <linux/time.h> +#include <linux/string.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include <asm/timex.h> +#include <asm/hpet.h> +#include <asm/unistd.h> +#include <asm/io.h> +#include <asm/vgtod.h> +#include "vextern.h" + +#define gtod vdso_vsyscall_gtod_data + +static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); + return ret; +} + +static inline long vgetns(void) +{ + cycles_t (*vread)(void); + vread = gtod->clock.vread; + return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >> + gtod->clock.shift; +} + +static noinline int do_realtime(struct timespec *ts) +{ + unsigned long seq, ns; + do { + seq = read_seqbegin(&gtod->lock); + ts->tv_sec = gtod->wall_time_sec; + ts->tv_nsec = gtod->wall_time_nsec; + ns = vgetns(); + } while (unlikely(read_seqretry(&gtod->lock, seq))); + timespec_add_ns(ts, ns); + return 0; +} + +/* Copy of the version in kernel/time.c which we cannot directly access */ +static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static noinline int do_monotonic(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(&gtod->lock); + secs = gtod->wall_time_sec; + ns = gtod->wall_time_nsec + vgetns(); + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(&gtod->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + switch (clock) { + case CLOCK_REALTIME: + return do_realtime(ts); + case CLOCK_MONOTONIC: + return
do_monotonic(ts); + } + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + long ret; + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { + BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != + offsetof(struct timespec, tv_nsec) || + sizeof(*tv) != sizeof(struct timespec)); + do_realtime((struct timespec *)tv); + tv->tv_usec /= 1000; + if (unlikely(tz != NULL)) { + /* This relies on gcc inlining the memcpy. We'll notice + if it ever fails to do so. */ + memcpy(tz, &gtod->sys_tz, sizeof(struct timezone)); + } + return 0; + } + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S new file mode 100644 index 000000000000..79a071e4357e --- /dev/null +++ b/arch/x86_64/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland.
+ */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S new file mode 100644 index 000000000000..2dc2cdb84d67 --- /dev/null +++ b/arch/x86_64/vdso/vdso-start.S @@ -0,0 +1,2 @@ + .globl vdso_kernel_start +vdso_kernel_start: diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S new file mode 100644 index 000000000000..92e80c1972a7 --- /dev/null +++ b/arch/x86_64/vdso/vdso.S @@ -0,0 +1,2 @@ + .section ".vdso","a" + .incbin "arch/x86_64/vdso/vdso.so" diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S new file mode 100644 index 000000000000..b9a60e665d08 --- /dev/null +++ b/arch/x86_64/vdso/vdso.lds.S @@ -0,0 +1,77 @@ +/* + * Linker script for vsyscall DSO. The vsyscall page is an ELF shared + * object prelinked to its virtual address, and with only one read-only + * segment (that fits in one page). This script controls its layout. + */ +#include <asm/asm-offsets.h> +#include "voffset.h" + +#define VDSO_PRELINK 0xffffffffff700000 + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VDSO_PRELINK + VDSO_TEXT_OFFSET; + + .text : { *(.text) } :text + .text.ptr : { *(.text.ptr) } :text + . 
= VDSO_PRELINK + 0x900; + .data : { *(.data) } :text + .bss : { *(.bss) } :text + + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + .note : { *(.note.*) } :text :note + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.dynbss) + *(.gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; +} diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h new file mode 100644 index 000000000000..1683ba2ae3e8 --- /dev/null +++ b/arch/x86_64/vdso/vextern.h @@ -0,0 +1,16 @@ +#ifndef VEXTERN +#include <asm/vsyscall.h> +#define VEXTERN(x) \ + extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); +#endif + +#define VMAGIC 0xfeedbabeabcdefabUL + +/* Any kernel variables used in the vDSO must be exported in the main + kernel's vmlinux.lds.S/vsyscall.h/proper __section and + put into vextern.h and be referenced as a pointer with vdso prefix. + The main kernel later fills in the values. 
*/ + +VEXTERN(jiffies) +VEXTERN(vgetcpu_mode) +VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c new file mode 100644 index 000000000000..91f6e85d0fc2 --- /dev/null +++ b/arch/x86_64/vdso/vgetcpu.c @@ -0,0 +1,50 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include <linux/kernel.h> +#include <linux/getcpu.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include "vextern.h" +/* Obtain a packed value p (from the cache, RDTSCP, or an lsl on __PER_CPU_SEG) and split it: the low 12 bits go to *cpu, the remaining upper bits to *node. cpu, node and tcache may each be NULL. Always returns 0. */ +long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) { + p = tcache->blob[1]; + } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { /* remember the jiffies value read above (j) together with p for the next call */ + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; /* low 12 bits of p */ + if (node) + *node = p >> 12; /* bits of p above the low 12 */ + return 0; +} +/* Weak alias: the unprefixed name getcpu resolves to __vdso_getcpu. */ +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c new file mode 100644 index 000000000000..d4cb83a6c066 --- /dev/null +++ b/arch/x86_64/vdso/vma.c @@ -0,0 +1,139 @@ +/* + * Set up the VMAs to tell the VM about the vDSO. + * Copyright 2007 Andi Kleen, SUSE Labs. 
+ * Subject to the GPL, v.2 + */ +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/random.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include <asm/proto.h> +#include "voffset.h" + +int vdso_enabled = 1; + +#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; +#include "vextern.h" +#undef VEXTERN + +extern char vdso_kernel_start[], vdso_start[], vdso_end[]; +extern unsigned short vdso_sync_cpuid; + +struct page **vdso_pages; +/* Return the address, inside the copy mapped at vbase, of the slot for var: var's offset from vdso_kernel_start plus VDSO_TEXT_OFFSET. If the slot does not hold VMAGIC, print a message naming the variable and clear vdso_enabled; the pointer is returned either way. */ +static inline void *var_ref(void *vbase, char *var, char *name) +{ + unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; + void *p = vbase + offset; + if (*(void **)p != (void *)VMAGIC) { + printk("VDSO: variable %s broken\n", name); + vdso_enabled = 0; + } + return p; +} +/* Initcall: copy the image between vdso_start and vdso_end into newly allocated pages recorded in vdso_pages, vmap them, check the ELF magic, then fill in every VEXTERN slot. Returns 0 on that path (even when a check cleared vdso_enabled), or -ENOMEM with vdso_enabled cleared when kmalloc, alloc_page or vmap fails. */ +static int __init init_vdso_vars(void) +{ + int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + char *vbase; + + vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); + if (!vdso_pages) + goto oom; + for (i = 0; i < npages; i++) { + struct page *p; + p = alloc_page(GFP_KERNEL); + if (!p) + goto oom; + vdso_pages[i] = p; + copy_page(page_address(p), vdso_start + i*PAGE_SIZE); + } + /* map the page copies at one kernel virtual range so the slots can be located by offset from vbase */ + vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); + if (!vbase) + goto oom; + + if (memcmp(vbase, "\177ELF", 4)) { + printk("VDSO: I'm broken; not ELF\n"); + vdso_enabled = 0; + } + /* for each VEXTERN(x) in vextern.h: locate the vdso_x slot in the mapped copy via var_ref() and store &__x into it */ +#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) +#define VEXTERN(x) \ + V(vdso_ ## x) = &__ ## x; +#include "vextern.h" +#undef VEXTERN + return 0; + /* NOTE(review): nothing allocated earlier (vdso_pages array, pages) is freed on this path - confirm this is intended */ + oom: + printk("Cannot allocate vdso\n"); + vdso_enabled = 0; + return -ENOMEM; +} +__initcall(init_vdso_vars); + +struct linux_binprm; + +/* Put the vdso above the (randomized) stack with another randomized offset. + This way there is no hole in the middle of address space. + To save memory make sure it is still in the same PTE as the stack top. 
+ This doesn't give that many random bits */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ + unsigned long addr, end; + unsigned offset; + end = (start + PMD_SIZE - 1) & PMD_MASK; /* round start up to a PMD_SIZE boundary (assuming PMD_MASK == ~(PMD_SIZE - 1) - TODO confirm) */ + if (end >= TASK_SIZE64) + end = TASK_SIZE64; + end -= len; /* end is now the highest start address that leaves room for len bytes below the limit */ + /* This loses some more bits than a modulo, but is cheaper */ + offset = get_random_int() & (PTRS_PER_PTE - 1); + addr = start + (offset << PAGE_SHIFT); + if (addr >= end) + addr = end; + return addr; +} + +/* Setup a VMA at program startup for the vsyscall page. + Not called for compat tasks */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); + /* Returns 0 when the vDSO is disabled or was mapped; otherwise the error value from get_unmapped_area() or install_special_mapping(). bprm and exstack are not used in this body. */ + if (!vdso_enabled) + return 0; + /* mm->mmap_sem is held for writing from here until the up_write() at up_fail */ + down_write(&mm->mmap_sem); + addr = vdso_addr(mm->start_stack, len); + addr = get_unmapped_area(NULL, addr, len, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + ret = install_special_mapping(mm, addr, len, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdso_pages); + if (ret) + goto up_fail; + /* success: record the address the vDSO was mapped at; ret is 0 here */ + current->mm->context.vdso = (void *)addr; +up_fail: + up_write(&mm->mmap_sem); + return ret; +} +/* Handler registered for the "vdso=" parameter: stores simple_strtoul(s, NULL, 0) into vdso_enabled and always returns 0. */ +static __init int vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h new file mode 100644 index 000000000000..5304204911f2 --- /dev/null +++ b/arch/x86_64/vdso/voffset.h @@ -0,0 +1 @@ +#define VDSO_TEXT_OFFSET 0x500 /* offset from VDSO_PRELINK at which vdso.lds.S places .text; var_ref() in vma.c adds the same value */ diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c new file mode 100644 index 000000000000..6fc22219a472 --- /dev/null +++ b/arch/x86_64/vdso/vvar.c @@ -0,0 +1,12 @@ +/* Define pointer to external vDSO variables. + These are part of the vDSO. The kernel fills in the real addresses + at boot time. 
This is done because when the vdso is linked the + kernel isn't yet and we don't know the final addresses. */ +#include <linux/kernel.h> +#include <linux/time.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/vgtod.h> +/* With this definition, each VEXTERN(x) entry in vextern.h expands to a definition of the pointer vdso_x, initialised to the VMAGIC placeholder. */ +#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; +#include "vextern.h" |