From 16a6677fdf1d1194f688f8291b06fbaff248c353 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:01:48 -0800
Subject: [XFRM]: Netfilter IPsec output hooks

Call netfilter hooks before IPsec transforms. Packets visit the
FORWARD/LOCAL_OUT and POST_ROUTING hook before the first encapsulation
and the LOCAL_OUT and POST_ROUTING hook before each following tunnel mode
transform.

Patch from Herbert Xu <herbert@gondor.apana.org.au>:

Move the loop from dst_output into xfrm4_output/xfrm6_output since they're
the only ones who need to it. xfrm{4,6}_output_one() processes the first SA
all subsequent transport mode SAs and is called in a loop that calls the
netfilter hooks between each two calls.

In order to avoid the tail call issue, I've added the inline function
nf_hook which is nf_hook_slow plus the empty list check.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 61 ++++++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index be365e70ee99..79bb977afeac 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -168,6 +168,37 @@ void nf_log_packet(int pf,
 		   const struct net_device *out,
 		   struct nf_loginfo *li,
 		   const char *fmt, ...);
+
+int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+		 struct net_device *indev, struct net_device *outdev,
+		 int (*okfn)(struct sk_buff *), int thresh);
+
+/**
+ *	nf_hook_thresh - call a netfilter hook
+ *	
+ *	Returns 1 if the hook has allowed the packet to pass.  The function
+ *	okfn must be invoked by the caller in this case.  Any other return
+ *	value indicates the packet has been consumed by the hook.
+ */
+static inline int nf_hook_thresh(int pf, unsigned int hook,
+				 struct sk_buff **pskb,
+				 struct net_device *indev,
+				 struct net_device *outdev,
+				 int (*okfn)(struct sk_buff *), int thresh)
+{
+#ifndef CONFIG_NETFILTER_DEBUG
+	if (list_empty(&nf_hooks[pf][hook]))
+		return 1;
+#endif
+	return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh);
+}
+
+static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct sk_buff *))
+{
+	return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN);
+}
                    
 /* Activate hook; either okfn or kfree_skb called, unless a hook
    returns NF_STOLEN (in which case, it's up to the hook to deal with
@@ -188,35 +219,17 @@ void nf_log_packet(int pf,
 
 /* This is gross, but inline doesn't cut it for avoiding the function
    call in fast path: gcc doesn't inline (needs value tracking?). --RR */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
-({int __ret;								       \
-if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
-	__ret = (okfn)(skb);						       \
-__ret;})
-#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
-({int __ret;								       \
-if ((__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)  \
-	__ret = (okfn)(skb);						       \
-__ret;})
-#else
-#define NF_HOOK(pf, hook, skb, indev, outdev, okfn)			       \
-({int __ret;								       \
-if (list_empty(&nf_hooks[pf][hook]) ||					       \
-    (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, INT_MIN)) == 1) \
-	__ret = (okfn)(skb);						       \
-__ret;})
+
+/* HX: It's slightly less gross now. */
+
 #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh)	       \
 ({int __ret;								       \
-if (list_empty(&nf_hooks[pf][hook]) ||					       \
-    (__ret=nf_hook_slow(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)  \
+if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)\
 	__ret = (okfn)(skb);						       \
 __ret;})
-#endif
 
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
-		 struct net_device *indev, struct net_device *outdev,
-		 int (*okfn)(struct sk_buff *), int thresh);
+#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \
+	NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN)
 
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, int pf, int optval, char __user *opt, 
-- 
cgit v1.2.3


From 951dbc8ac714b04c36296b8b5c36c8e036ce433f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:02:34 -0800
Subject: [IPV6]: Move nextheader offset to the IP6CB

Move nextheader offset to the IP6CB to make it possible to pass a
packet to ip6_input_finish multiple times and have it skip already
parsed headers. As a nice side effect this gets rid of the manual
hopopts skipping in ip6_input_finish.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h    |  1 +
 include/net/protocol.h  |  2 +-
 include/net/xfrm.h      |  6 +++---
 net/dccp/ipv6.c         |  2 +-
 net/ipv6/exthdrs.c      | 19 ++++++++++++-------
 net/ipv6/icmp.c         |  4 ++--
 net/ipv6/ip6_input.c    | 21 ++++++---------------
 net/ipv6/ip6_tunnel.c   |  2 +-
 net/ipv6/reassembly.c   | 11 +++++------
 net/ipv6/tcp_ipv6.c     |  2 +-
 net/ipv6/udp.c          |  2 +-
 net/ipv6/xfrm6_input.c  |  8 ++++----
 net/ipv6/xfrm6_tunnel.c |  6 +++---
 net/sctp/ipv6.c         |  2 +-
 14 files changed, 42 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 93bbed5c6cf4..5cfc71529595 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -191,6 +191,7 @@ struct inet6_skb_parm {
 	__u16			srcrt;
 	__u16			dst1;
 	__u16			lastopt;
+	__u32			nhoff;
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 63f7db99c2a6..6dc5970612d7 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -43,7 +43,7 @@ struct net_protocol {
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 struct inet6_protocol 
 {
-	int	(*handler)(struct sk_buff **skb, unsigned int *nhoffp);
+	int	(*handler)(struct sk_buff **skb);
 
 	void	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 07d7b50cdd76..297d09d28fe4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -831,7 +831,7 @@ struct xfrm_tunnel {
 };
 
 struct xfrm6_tunnel {
-	int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp);
+	int (*handler)(struct sk_buff **pskb);
 	void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			    int type, int code, int offset, __u32 info);
 };
@@ -868,8 +868,8 @@ extern int xfrm4_rcv(struct sk_buff *skb);
 extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
-extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi);
-extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi);
+extern int xfrm6_rcv(struct sk_buff **pskb);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
 extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 683250a05f58..df074259f9c3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1029,7 +1029,7 @@ discard:
 	return 0;
 }
 
-static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int dccp_v6_rcv(struct sk_buff **pskb)
 {
 	const struct dccp_hdr *dh;
 	struct sk_buff *skb = *pskb;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 113374dc342c..2a1e7e45b890 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = {
 	{-1,			NULL}
 };
 
-static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 
 	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
-		*nhoffp = opt->dst1;
+		opt->nhoff = opt->dst1;
 		return 1;
 	}
 
@@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void)
   NONE header. No data in packet.
  ********************************/
 
-static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_nodata_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 
@@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void)
   Routing header.
  ********************************/
 
-static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -249,7 +249,7 @@ looped_back:
 		skb->h.raw += (hdr->hdrlen + 1) << 3;
 		opt->dst0 = opt->dst1;
 		opt->dst1 = 0;
-		*nhoffp = (&hdr->nexthdr) - skb->nh.raw;
+		opt->nhoff = (&hdr->nexthdr) - skb->nh.raw;
 		return 1;
 	}
 
@@ -487,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt_lst[] = {
 
 int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff)
 {
-	IP6CB(skb)->hop = sizeof(struct ipv6hdr);
-	if (ip6_parse_tlv(tlvprochopopt_lst, skb))
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	opt->hop = sizeof(struct ipv6hdr);
+	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+		skb->h.raw += (skb->h.raw[1]+1)<<3;
+		opt->nhoff = sizeof(struct ipv6hdr);
 		return sizeof(struct ipv6hdr);
+	}
 	return -1;
 }
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6ec6a2b549bb..53c81fcd20ba 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
 static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL;
 #define icmpv6_socket	__get_cpu_var(__icmpv6_socket)
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+static int icmpv6_rcv(struct sk_buff **pskb);
 
 static struct inet6_protocol icmpv6_protocol = {
 	.handler	=	icmpv6_rcv,
@@ -581,7 +581,7 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
  *	Handle icmp messages
  */
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int icmpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index a6026d2787d2..13d724150f33 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	if (hdr->version != 6)
 		goto err;
 
+	skb->h.raw = (u8 *)(hdr + 1);
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
 	pkt_len = ntohs(hdr->payload_len);
 
 	/* pkt_len may be zero if Jumbo payload option is present */
@@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	}
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
-		skb->h.raw = (u8*)(hdr+1);
-		if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) {
+		if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) {
 			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 			return 0;
 		}
@@ -143,26 +145,15 @@ static inline int ip6_input_finish(struct sk_buff *skb)
 	int nexthdr;
 	u8 hash;
 
-	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
-
 	/*
 	 *	Parse extension headers
 	 */
 
-	nexthdr = skb->nh.ipv6h->nexthdr;
-	nhoff = offsetof(struct ipv6hdr, nexthdr);
-
-	/* Skip hop-by-hop options, they are already parsed. */
-	if (nexthdr == NEXTHDR_HOP) {
-		nhoff = sizeof(struct ipv6hdr);
-		nexthdr = skb->h.raw[0];
-		skb->h.raw += (skb->h.raw[1]+1)<<3;
-	}
-
 	rcu_read_lock();
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
 		goto discard;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
@@ -194,7 +185,7 @@ resubmit:
 		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 
 			goto discard;
 		
-		ret = ipprot->handler(&skb, &nhoff);
+		ret = ipprot->handler(&skb);
 		if (ret > 0)
 			goto resubmit;
 		else if (ret == 0)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e315d0f80af1..f079621c8b67 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -510,7 +510,7 @@ static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph,
  **/
 
 static int 
-ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ip6ip6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5d316cb72ec9..15e1456b3f18 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -581,7 +581,6 @@ err:
  *	the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
-			  unsigned int *nhoffp,
 			  struct net_device *dev)
 {
 	struct sk_buff *fp, *head = fq->fragments;
@@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 	head->dev = dev;
 	skb_set_timestamp(head, &fq->stamp);
 	head->nh.ipv6h->payload_len = htons(payload_len);
+	IP6CB(head)->nhoff = nhoff;
 
 	*skb_in = head;
 
@@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 	fq->fragments = NULL;
-	*nhoffp = nhoff;
 	return 1;
 
 out_oversize:
@@ -678,7 +677,7 @@ out_fail:
 	return -1;
 }
 
-static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_frag_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp; 
 	struct net_device *dev = skb->dev;
@@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 		skb->h.raw += sizeof(struct frag_hdr);
 		IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 
-		*nhoffp = (u8*)fhdr - skb->nh.raw;
+		IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw;
 		return 1;
 	}
 
@@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
 
 		spin_lock(&fq->lock);
 
-		ip6_frag_queue(fq, skb, fhdr, *nhoffp);
+		ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		if (fq->last_in == (FIRST_IN|LAST_IN) &&
 		    fq->meat == fq->len)
-			ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
+			ret = ip6_frag_reasm(fq, skbp, dev);
 
 		spin_unlock(&fq->lock);
 		fq_put(fq, NULL);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2947bc56d8a0..a25f4e8a8ada 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1153,7 +1153,7 @@ ipv6_pktoptions:
 	return 0;
 }
 
-static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int tcp_v6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct tcphdr *th;	
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d8538dcea813..c47648892c04 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -435,7 +435,7 @@ out:
 	read_unlock(&udp_hash_lock);
 }
 
-static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int udpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct sock *sk;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 28c29d78338e..1079e47f3933 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -26,7 +26,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
 		IP6_ECN_set_ce(inner_iph);
 }
 
-int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
+int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi)
 {
 	struct sk_buff *skb = *pskb;
 	int err;
@@ -38,7 +38,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
 	int nexthdr;
 	unsigned int nhoff;
 
-	nhoff = *nhoffp;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	seq = 0;
@@ -144,7 +144,7 @@ drop:
 
 EXPORT_SYMBOL(xfrm6_rcv_spi);
 
-int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+int xfrm6_rcv(struct sk_buff **pskb)
 {
-	return xfrm6_rcv_spi(pskb, nhoffp, 0);
+	return xfrm6_rcv_spi(pskb, 0);
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index fbef7826a74f..da09ff258648 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -397,7 +397,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler)
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
-static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int xfrm6_tunnel_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
@@ -405,11 +405,11 @@ static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	u32 spi;
 
 	/* device-like_ip6ip6_handler() */
-	if (handler && handler->handler(pskb, nhoffp) == 0)
+	if (handler && handler->handler(pskb) == 0)
 		return 0;
 
 	spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr);
-	return xfrm6_rcv_spi(pskb, nhoffp, spi);
+	return xfrm6_rcv_spi(pskb, spi);
 }
 
 static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 15c05165c905..04c7fab4edc4 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -905,7 +905,7 @@ static struct inet_protosw sctpv6_stream_protosw = {
 	.flags         = SCTP_PROTOSW_FLAG,
 };
 
-static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int sctp6_rcv(struct sk_buff **pskb)
 {
 	return sctp_rcv(*pskb) ? -1 : 0;
 }
-- 
cgit v1.2.3


From 3e3850e989c5d2eb1aab6f0fd9257759f0f4cbc6 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:04:54 -0800
Subject: [NETFILTER]: Fix xfrm lookup in
 ip_route_me_harder/ip6_route_me_harder

ip_route_me_harder doesn't use the port numbers of the xfrm lookup and
uses ip_route_input for non-local addresses which doesn't do a xfrm
lookup, ip6_route_me_harder doesn't do a xfrm lookup at all.

Use xfrm_decode_session and do the lookup manually, make sure both
only do the lookup if the packet hasn't been transformed already.

Makeing sure the lookup only happens once needs a new field in the
IP6CB, which exceeds the size of skb->cb. The size of skb->cb is
increased to 48b. Apparently the IPv6 mobile extensions need some
more room anyway.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h    |  3 +++
 include/linux/skbuff.h  |  2 +-
 include/net/ip.h        |  3 ++-
 include/net/xfrm.h      |  2 +-
 net/ipv4/ip_gre.c       |  2 +-
 net/ipv4/ipip.c         |  2 +-
 net/ipv4/netfilter.c    | 12 ++++++++++--
 net/ipv4/xfrm4_output.c |  1 +
 net/ipv6/netfilter.c    |  9 ++++++++-
 net/ipv6/xfrm6_output.c |  1 +
 net/xfrm/xfrm_policy.c  |  9 +++++----
 11 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5cfc71529595..9c8f4c9ed429 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -192,6 +192,9 @@ struct inet6_skb_parm {
 	__u16			dst1;
 	__u16			lastopt;
 	__u32			nhoff;
+	__u16			flags;
+
+#define IP6SKB_XFRM_TRANSFORMED	1
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 483cfc47ec34..e5fd66c5650b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -251,7 +251,7 @@ struct sk_buff {
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
 	 */
-	char			cb[40];
+	char			cb[48];
 
 	unsigned int		len,
 				data_len,
diff --git a/include/net/ip.h b/include/net/ip.h
index 52f4d9c69704..a494d04e5dea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -39,7 +39,8 @@ struct inet_skb_parm
 
 #define IPSKB_FORWARDED		1
 #define IPSKB_XFRM_TUNNEL_SIZE	2
-#define IPSKB_FRAG_COMPLETE	4
+#define IPSKB_XFRM_TRANSFORMED	4
+#define IPSKB_FRAG_COMPLETE	8
 };
 
 struct ipcm_cookie
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 297d09d28fe4..d6111a2f0a23 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -668,7 +668,7 @@ static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *s
 	return xfrm_policy_check(sk, dir, skb, AF_INET6);
 }
 
-
+extern int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family);
 extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
 
 static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 65c3a91ed85e..de16e944777f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -832,7 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, gre_hlen);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 078b59be91f4..bbd85f5ec985 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -621,7 +621,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ae0779d82c5d..4c637a1cbd23 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -7,11 +7,13 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
+#include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmp.h>
 #include <net/route.h>
-#include <linux/ip.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
 
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct sk_buff **pskb)
@@ -33,7 +35,6 @@ int ip_route_me_harder(struct sk_buff **pskb)
 #ifdef CONFIG_IP_ROUTE_FWMARK
 		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
 #endif
-		fl.proto = iph->protocol;
 		if (ip_route_output_key(&rt, &fl) != 0)
 			return -1;
 
@@ -60,6 +61,13 @@ int ip_route_me_harder(struct sk_buff **pskb)
 	if ((*pskb)->dst->error)
 		return -1;
 
+#ifdef CONFIG_XFRM
+	if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(*pskb, &fl, AF_INET) == 0)
+		if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0))
+			return -1;
+#endif
+
 	/* Change in oif may mean change in hh_len. */
 	hh_len = (*pskb)->dst->dev->hard_header_len;
 	if (skb_headroom(*pskb) < hh_len) {
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 51fabb8f7c54..160c48800ab8 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -140,6 +140,7 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index f8626ebf90fd..b63678328a3b 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -10,6 +10,7 @@
 #include <net/dst.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/xfrm.h>
 
 int ip6_route_me_harder(struct sk_buff *skb)
 {
@@ -21,11 +22,17 @@ int ip6_route_me_harder(struct sk_buff *skb)
 		{ .ip6_u =
 		  { .daddr = iph->daddr,
 		    .saddr = iph->saddr, } },
-		.proto = iph->nexthdr,
 	};
 
 	dst = ip6_route_output(skb->sk, &fl);
 
+#ifdef CONFIG_XFRM
+	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, &fl, AF_INET6) == 0)
+		if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0))
+			return -1;
+#endif
+
 	if (dst->error) {
 		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index fc0ea38953c4..80242172a5df 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -139,6 +139,7 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 64a447375fdb..f2edc9225b6a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -951,8 +951,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
 	return start;
 }
 
-static int
-_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
+int
+xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
@@ -963,6 +963,7 @@ _decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 	xfrm_policy_put_afinfo(afinfo);
 	return 0;
 }
+EXPORT_SYMBOL(xfrm_decode_session);
 
 static inline int secpath_has_tunnel(struct sec_path *sp, int k)
 {
@@ -982,7 +983,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	u8 fl_dir = policy_to_flow_dir(dir);
 	u32 sk_sid;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
@@ -1055,7 +1056,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
 	struct flowi fl;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
-- 
cgit v1.2.3


From eb9c7ebe6980c41cf6ae889e301c3b49f473ee9f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:06:30 -0800
Subject: [NETFILTER]: Handle NAT in IPsec policy checks

Handle NAT of decapsulated IPsec packets by reconstructing the struct flowi
of the original packet from the conntrack information for IPsec policy
checks.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h              | 16 +++++++++++
 net/dccp/ipv4.c                        |  2 +-
 net/ipv4/netfilter.c                   |  3 ++
 net/ipv4/netfilter/ip_nat_standalone.c | 50 ++++++++++++++++++++++++++++++++--
 net/xfrm/xfrm_policy.c                 |  2 ++
 5 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 79bb977afeac..84506dfa1f37 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -274,6 +274,20 @@ struct nf_queue_rerouter {
 extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer);
 extern int nf_unregister_queue_rerouter(int pf);
 
+#include <net/flow.h>
+extern void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family)
+{
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+	void (*decodefn)(struct sk_buff *, struct flowi *);
+
+	if (family == AF_INET && (decodefn = ip_nat_decode_session) != NULL)
+		decodefn(skb, fl);
+#endif
+}
+
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
 extern struct proc_dir_entry *proc_net_netfilter;
@@ -282,6 +296,8 @@ extern struct proc_dir_entry *proc_net_netfilter;
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {}
 #endif /*CONFIG_NETFILTER*/
 
 #endif /*__KERNEL__*/
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 23ba177c1150..00f983226672 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -986,6 +986,7 @@ int dccp_v4_rcv(struct sk_buff *skb)
 
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
+	nf_reset(skb);
 
 	return sk_receive_skb(sk, skb);
 
@@ -1099,7 +1100,6 @@ int dccp_v4_destroy_sock(struct sock *sk)
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 	}
-	nf_reset(skb);
 
 	/* Clean up a referenced DCCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash != NULL)
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c637a1cbd23..3321092b0914 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -86,6 +86,9 @@ int ip_route_me_harder(struct sk_buff **pskb)
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
+void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(ip_nat_decode_session);
+
 /*
  * Extra routing may needed on local out, as the QUEUE target never
  * returns control to the table.
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index b518697af4db..8b8a1f00bbf4 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -55,6 +55,44 @@
 			         : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN"  \
 				    : "*ERROR*")))
 
+#ifdef CONFIG_XFRM
+static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	struct ip_conntrack *ct;
+	struct ip_conntrack_tuple *t;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	unsigned long statusbit;
+
+	ct = ip_conntrack_get(skb, &ctinfo);
+	if (ct == NULL)
+		return;
+	dir = CTINFO2DIR(ctinfo);
+	t = &ct->tuplehash[dir].tuple;
+
+	if (dir == IP_CT_DIR_ORIGINAL)
+		statusbit = IPS_DST_NAT;
+	else
+		statusbit = IPS_SRC_NAT;
+
+	if (ct->status & statusbit) {
+		fl->fl4_dst = t->dst.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_dport = t->dst.u.tcp.port;
+	}
+
+	statusbit ^= IPS_NAT_MASK;
+
+	if (ct->status & statusbit) {
+		fl->fl4_src = t->src.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_sport = t->src.u.tcp.port;
+	}
+}
+#endif
+		
 static unsigned int
 ip_nat_fn(unsigned int hooknum,
 	  struct sk_buff **pskb,
@@ -330,10 +368,14 @@ static int init_or_cleanup(int init)
 
 	if (!init) goto cleanup;
 
+#ifdef CONFIG_XFRM
+	BUG_ON(ip_nat_decode_session != NULL);
+	ip_nat_decode_session = nat_decode_session;
+#endif
 	ret = ip_nat_rule_init();
 	if (ret < 0) {
 		printk("ip_nat_init: can't setup rules.\n");
-		goto cleanup_nothing;
+		goto cleanup_decode_session;
 	}
 	ret = nf_register_hook(&ip_nat_in_ops);
 	if (ret < 0) {
@@ -381,7 +423,11 @@ static int init_or_cleanup(int init)
 	nf_unregister_hook(&ip_nat_in_ops);
  cleanup_rule_init:
 	ip_nat_rule_cleanup();
- cleanup_nothing:
+ cleanup_decode_session:
+#ifdef CONFIG_XFRM
+	ip_nat_decode_session = NULL;
+	synchronize_net();
+#endif
 	return ret;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f2edc9225b6a..59614a994b4e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -22,6 +22,7 @@
 #include <linux/workqueue.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
+#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -985,6 +986,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
+	nf_nat_decode_session(skb, &fl, family);
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
 
-- 
cgit v1.2.3


From e16a8f0b8c53312beb1d8b52e463aae79aa809c7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Jan 2006 23:06:48 -0800
Subject: [NETFILTER]: Add ipt_policy/ip6t_policy matches

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_policy.h  |  52 +++++++++
 include/linux/netfilter_ipv6/ip6t_policy.h |  52 +++++++++
 net/ipv4/netfilter/Kconfig                 |  10 ++
 net/ipv4/netfilter/Makefile                |   1 +
 net/ipv4/netfilter/ipt_policy.c            | 170 ++++++++++++++++++++++++++++
 net/ipv6/netfilter/Kconfig                 |  10 ++
 net/ipv6/netfilter/Makefile                |   1 +
 net/ipv6/netfilter/ip6t_policy.c           | 175 +++++++++++++++++++++++++++++
 8 files changed, 471 insertions(+)
 create mode 100644 include/linux/netfilter_ipv4/ipt_policy.h
 create mode 100644 include/linux/netfilter_ipv6/ip6t_policy.h
 create mode 100644 net/ipv4/netfilter/ipt_policy.c
 create mode 100644 net/ipv6/netfilter/ip6t_policy.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h
new file mode 100644
index 000000000000..7fd1bec453f1
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IPT_POLICY_H
+#define _IPT_POLICY_H
+
+#define IPT_POLICY_MAX_ELEM	4
+
+enum ipt_policy_flags
+{
+	IPT_POLICY_MATCH_IN	= 0x1,
+	IPT_POLICY_MATCH_OUT	= 0x2,
+	IPT_POLICY_MATCH_NONE	= 0x4,
+	IPT_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ipt_policy_modes
+{
+	IPT_POLICY_MODE_TRANSPORT,
+	IPT_POLICY_MODE_TUNNEL
+};
+
+struct ipt_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ipt_policy_elem
+{
+	u_int32_t	saddr;
+	u_int32_t	smask;
+	u_int32_t	daddr;
+	u_int32_t	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ipt_policy_spec	match;
+	struct ipt_policy_spec	invert;
+};
+
+struct ipt_policy_info
+{
+	struct ipt_policy_elem pol[IPT_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IPT_POLICY_H */
diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h
new file mode 100644
index 000000000000..5a93afcd2ff1
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IP6T_POLICY_H
+#define _IP6T_POLICY_H
+
+#define IP6T_POLICY_MAX_ELEM	4
+
+enum ip6t_policy_flags
+{
+	IP6T_POLICY_MATCH_IN		= 0x1,
+	IP6T_POLICY_MATCH_OUT		= 0x2,
+	IP6T_POLICY_MATCH_NONE		= 0x4,
+	IP6T_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ip6t_policy_modes
+{
+	IP6T_POLICY_MODE_TRANSPORT,
+	IP6T_POLICY_MODE_TUNNEL
+};
+
+struct ip6t_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ip6t_policy_elem
+{
+	struct in6_addr	saddr;
+	struct in6_addr	smask;
+	struct in6_addr	daddr;
+	struct in6_addr	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ip6t_policy_spec	match;
+	struct ip6t_policy_spec	invert;
+};
+
+struct ip6t_policy_info
+{
+	struct ip6t_policy_elem pol[IP6T_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IP6T_POLICY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 88a60650e6b8..a9893ec03e02 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -487,6 +487,16 @@ config IP_NF_MATCH_STRING
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_MATCH_POLICY
+       tristate "IPsec policy match support"
+       depends on IP_NF_IPTABLES && XFRM
+       help
+         Policy matching allows you to match packets based on the
+         IPsec policy that was used during decapsulation/will
+         be used during encapsulation.
+
+         To compile it as a module, choose M here.  If unsure, say N.
+
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index d0a447e520a2..549b01a648b3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
 obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c
new file mode 100644
index 000000000000..709debcc69c9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_policy.c
@@ -0,0 +1,170 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e)
+{
+#define MATCH(x,y)	(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+	return MATCH(saddr, x->props.saddr.a4 & e->smask) &&
+	       MATCH(daddr, x->id.daddr.a4 & e->dmask) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo, int offset, int *hotdrop)
+{
+	const struct ipt_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IPT_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IPT_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IPT_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ipt_policy_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ipt_policy: matchsize %u != %zu\n",
+		       matchsize, IPT_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ipt_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
+	    && info->flags & IPT_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ipt_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
+	    && info->flags & IPT_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ipt_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IPT_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ipt_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 04912f9b35c3..105dd69ee9fb 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -179,6 +179,16 @@ config IP6_NF_MATCH_PHYSDEV
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_POLICY
+	tristate "IPsec policy match support"
+	depends on IP6_NF_IPTABLES && XFRM
+	help
+	  Policy matching allows you to match packets based on the
+	  IPsec policy that was used during decapsulation/will
+	  be used during encapsulation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # The targets
 config IP6_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 9ab5b2ca1f59..c0c809b426e8 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o
 obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
 obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
 obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
 obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o
 obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c
new file mode 100644
index 000000000000..13fedad48c1d
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_policy.c
@@ -0,0 +1,175 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e)
+{
+#define MATCH_ADDR(x,y,z)	(!e->match.x || \
+				 ((ip6_masked_addrcmp((z), &e->x, &e->y)) == 0) ^ e->invert.x)
+#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+	
+	return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) &&
+	       MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo,
+		 int offset,
+		 unsigned int protoff,
+		 int *hotdrop)
+{
+	const struct ip6t_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IP6T_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IP6T_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IP6T_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ip6t_ip6 *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ip6t_policy_info *info = matchinfo;
+
+	if (matchsize != IP6T_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ip6t_policy: matchsize %u != %zu\n",
+		       matchsize, IP6T_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ip6t_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN)
+	    && info->flags & IP6T_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ip6t_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT)
+	    && info->flags & IP6T_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ip6t_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IP6T_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ip6t_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ip6t_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ip6t_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);
-- 
cgit v1.2.3


From f53b61d8c385140fe7f09e0c9187ae813ee9f330 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sat, 7 Jan 2006 12:50:27 -0800
Subject: [NETFILTER]: Add dummy nf_hook{_thresh}() when NETFILTER is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 84506dfa1f37..4cf6088625c1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -295,7 +295,22 @@ extern struct proc_dir_entry *proc_net_netfilter;
 
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
+static inline int nf_hook_thresh(int pf, unsigned int hook,
+				 struct sk_buff **pskb,
+				 struct net_device *indev,
+				 struct net_device *outdev,
+				 int (*okfn)(struct sk_buff *), int thresh)
+{
+	return okfn(*pskb);
+}
+static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct sk_buff *))
+{
+	return okfn(*pskb);
+}
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+struct flowi;
 static inline void
 nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {}
 #endif /*CONFIG_NETFILTER*/
-- 
cgit v1.2.3


From b792de39d892e06b18ddea85be076bae123d6bf6 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olh@suse.de>
Date: Sun, 8 Jan 2006 01:00:32 -0800
Subject: [PATCH] Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41.

Fix compilation with CONFIG_MEMORY_HOTPLUG=y and gcc41.
Also remove unneeded declations, add a public function.

drivers/base/memory.c:53: error: static declaration of 'register_memory_notifier' follows non-static declaration
include/linux/memory.h:85: error: previous declaration of 'register_memory_notifier' was here
drivers/base/memory.c:58: error: static declaration of 'unregister_memory_notifier' follows non-static declaration
include/linux/memory.h:86: error: previous declaration of 'unregister_memory_notifier' was here
drivers/base/memory.c:68: error: static declaration of 'register_memory' follows non-static declaration
include/linux/memory.h:73: error: previous declaration of 'register_memory' was here

Signed-off-by: Olaf Hering <olh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index dc4081b6f161..e251dc43d0f5 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -70,21 +70,15 @@ static inline void unregister_memory_notifier(struct notifier_block *nb)
 {
 }
 #else
-extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
 extern int register_new_memory(struct mem_section *);
 extern int unregister_memory_section(struct mem_section *);
 extern int memory_dev_init(void);
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
+extern int remove_memory_block(unsigned long, struct mem_section *, int);
 
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
-extern int invalidate_phys_mapping(unsigned long, unsigned long);
 struct notifier_block;
 
-extern int register_memory_notifier(struct notifier_block *nb);
-extern void unregister_memory_notifier(struct notifier_block *nb);
-
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #define hotplug_memory_notifier(fn, pri) {			\
-- 
cgit v1.2.3


From f9f7500521b25dbf1aba476b81230489ad8e2c4b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 8 Jan 2006 01:00:33 -0800
Subject: [PATCH] slab: remove unused align parameter from alloc_percpu

__alloc_percpu and alloc_percpu both take an 'align' argument which is
completely ignored.  snmp6_mib_init() in net/ipv6/af_inet6.c attempts to use
it, but it will be ignored.  Therefore, remove the 'align' argument and fixup
the lone caller.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Acked-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/percpu.h | 7 +++----
 mm/slab.c              | 3 +--
 net/ipv6/af_inet6.c    | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index fb8d2d24e4bb..20317d88deba 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -33,14 +33,14 @@ struct percpu_data {
         (__typeof__(ptr))__p->ptrs[(cpu)];	\
 })
 
-extern void *__alloc_percpu(size_t size, size_t align);
+extern void *__alloc_percpu(size_t size);
 extern void free_percpu(const void *);
 
 #else /* CONFIG_SMP */
 
 #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static inline void *__alloc_percpu(size_t size, size_t align)
+static inline void *__alloc_percpu(size_t size)
 {
 	void *ret = kmalloc(size, GFP_KERNEL);
 	if (ret)
@@ -55,7 +55,6 @@ static inline void free_percpu(const void *ptr)
 #endif /* CONFIG_SMP */
 
 /* Simple wrapper for the common case: zeros memory. */
-#define alloc_percpu(type) \
-	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+#define alloc_percpu(type)	((type *)(__alloc_percpu(sizeof(type))))
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c460..eb70fddf2059 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2944,9 +2944,8 @@ EXPORT_SYMBOL(__kmalloc);
  * Objects should be dereferenced using the per_cpu_ptr macro only.
  *
  * @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
  */
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
 {
 	int i;
 	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 68afc53be662..25c3fe5005d9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -689,11 +689,11 @@ snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 	if (ptr == NULL)
 		return -EINVAL;
 
-	ptr[0] = __alloc_percpu(mibsize, mibalign);
+	ptr[0] = __alloc_percpu(mibsize);
 	if (!ptr[0])
 		goto err0;
 
-	ptr[1] = __alloc_percpu(mibsize, mibalign);
+	ptr[1] = __alloc_percpu(mibsize);
 	if (!ptr[1])
 		goto err1;
 
-- 
cgit v1.2.3


From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:00:39 -0800
Subject: [PATCH] drop-pagecache

Add /proc/sys/vm/drop_caches.  When written to, this will cause the kernel to
discard as much pagecache and/or reclaimable slab objects as it can.  THis
operation requires root permissions.

It won't drop dirty data, so the user should run `sync' first.

Caveats:

a) Holds inode_lock for exorbitant amounts of time.

b) Needs to be taught about NUMA nodes: propagate these all the way through
   so the discarding can be controlled on a per-node basis.

This is a debugging feature: useful for getting consistent results between
filesystem benchmarks.  We could possibly put it under a config option, but
it's less than 300 bytes.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/proc.txt | 17 ++++++++++
 Documentation/sysctl/vm.txt        |  3 +-
 fs/Makefile                        |  2 +-
 fs/drop_caches.c                   | 68 ++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h                 |  7 ++++
 include/linux/sysctl.h             |  1 +
 kernel/sysctl.c                    | 10 ++++++
 mm/truncate.c                      |  1 -
 mm/vmscan.c                        |  3 +-
 9 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 fs/drop_caches.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index d4773565ea2f..a4dcf42c2fd9 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1302,6 +1302,23 @@ VM has token based thrashing control mechanism and uses the token to prevent
 unnecessary page faults in thrashing situation. The unit of the value is
 second. The value would be useful to tune thrashing behavior.
 
+drop_caches
+-----------
+
+Writing to this will cause the kernel to drop clean caches, dentries and
+inodes from memory, causing that memory to become free.
+
+To free pagecache:
+	echo 1 > /proc/sys/vm/drop_caches
+To free dentries and inodes:
+	echo 2 > /proc/sys/vm/drop_caches
+To free pagecache, dentries and inodes:
+	echo 3 > /proc/sys/vm/drop_caches
+
+As this is a non-destructive operation and dirty objects are not freeable, the
+user should run `sync' first.
+
+
 2.5 /proc/sys/dev - Device specific parameters
 ----------------------------------------------
 
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 2f1aae32a5d9..89ba1a42a17d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -26,12 +26,13 @@ Currently, these files are in /proc/sys/vm:
 - min_free_kbytes
 - laptop_mode
 - block_dump
+- drop-caches
 
 ==============================================================
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
 dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout:
+block_dump, swap_token_timeout, drop-caches:
 
 See Documentation/filesystems/proc.txt
 
diff --git a/fs/Makefile b/fs/Makefile
index 73676111ebbe..35e9aec608e4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o
+		ioprio.o pnode.o drop_caches.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
new file mode 100644
index 000000000000..4e4762389bdc
--- /dev/null
+++ b/fs/drop_caches.c
@@ -0,0 +1,68 @@
+/*
+ * Implement the manual drop-all-pagecache function
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/sysctl.h>
+#include <linux/gfp.h>
+
+/* A global variable is a bit ugly, but it keeps the code simple */
+int sysctl_drop_caches;
+
+static void drop_pagecache_sb(struct super_block *sb)
+{
+	struct inode *inode;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+			continue;
+		invalidate_inode_pages(inode->i_mapping);
+	}
+	spin_unlock(&inode_lock);
+}
+
+void drop_pagecache(void)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			drop_pagecache_sb(sb);
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+
+void drop_slab(void)
+{
+	int nr_objects;
+
+	do {
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+	} while (nr_objects > 10);
+}
+
+int drop_caches_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (write) {
+		if (sysctl_drop_caches & 1)
+			drop_pagecache();
+		if (sysctl_drop_caches & 2)
+			drop_slab();
+	}
+	return 0;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc01fff3aa01..83c651f25188 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1036,5 +1036,12 @@ int in_gate_area_no_task(unsigned long addr);
 /* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
 #define OOM_DISABLE -17
 
+int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages);
+void drop_pagecache(void);
+void drop_slab(void);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a9b80fc7f0f3..4cd267fe87ec 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -180,6 +180,7 @@ enum
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a85047bb5739..8dcf6fd5b0f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -68,6 +68,7 @@ extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
+extern int sysctl_drop_caches;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -774,6 +775,15 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 	},
+	{
+		.ctl_name	= VM_DROP_PAGECACHE,
+		.procname	= "drop_caches",
+		.data		= &sysctl_drop_caches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= drop_caches_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
 	{
 		.ctl_name	= VM_MIN_FREE_KBYTES,
 		.procname	= "min_free_kbytes",
diff --git a/mm/truncate.c b/mm/truncate.c
index 7dee32745901..b1a463d0fe71 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -249,7 +249,6 @@ unlock:
 				break;
 		}
 		pagevec_release(&pvec);
-		cond_resched();
 	}
 	return ret;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be8235fb1939..428c5801d4b4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
 	int ret = 0;
-- 
cgit v1.2.3


From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001
From: Rohit Seth <rohit.seth@intel.com>
Date: Sun, 8 Jan 2006 01:00:40 -0800
Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable

As recently there has been lot of traffic on the right values for batch and
high water marks for per_cpu_pagelists.  This patch makes these two
variables configurable through /proc interface.

A new tunable /proc/sys/vm/percpu_pagelist_fraction is added.  This entry
controls the fraction of pages at most in each zone that are allocated for
each per cpu page list.  The min value for this is 8.  It means that we
don't allow more than 1/8th of pages in each zone to be allocated in any
single per_cpu_pagelist.

The batch value of each per cpu pagelist is also updated as a result.  It
is set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)

Signed-off-by: Rohit Seth <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 17 ++++++++++++++++
 include/linux/mmzone.h      |  2 ++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 12 +++++++++++
 mm/page_alloc.c             | 49 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89ba1a42a17d..6910c0136f8d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number
 of kilobytes free.  The VM uses this number to compute a pages_min
 value for each lowmem zone in the system.  Each lowmem zone gets 
 a number of reserved free pages based proportionally on its size.
+
+==============================================================
+
+percpu_pagelist_fraction
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list.  The min value for this is 8.  It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist.  This entry only changes the value
+of hot per cpu pagelists.  User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result.  It is
+set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero.  Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c34f4a2c62f8..2a89c132ba9c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4cd267fe87ec..7f472127b7b5 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -181,6 +181,7 @@ enum
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
+	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dcf6fd5b0f9..03b0598f2369 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
+extern int percpu_pagelist_fraction;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
+static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
@@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
+		.procname	= "percpu_pagelist_fraction",
+		.data		= &percpu_pagelist_fraction,
+		.maxlen		= sizeof(percpu_pagelist_fraction),
+		.mode		= 0644,
+		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_percpu_pagelist_fract,
+	},
 #ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5eeeadd9f66a..2c46f697e8ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
 
@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
 			goto bad;
 
 		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			 	(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long  high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3


From 15316ba81aee6775d6079fb46c66c801989e7d10 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sun, 8 Jan 2006 01:00:43 -0800
Subject: [PATCH] add schedule_on_each_cpu()

swap migration's isolate_lru_page() currently uses an IPI to notify other
processors that the lru caches need to be drained if the page cannot be
found on the LRU.  The IPI interrupt may interrupt a processor that is just
processing lru requests and cause a race condition.

This patch introduces a new function run_on_each_cpu() that uses the
keventd() to run the LRU draining on each processor.  Processors disable
preemption when dealing the LRU caches (these are per processor) and thus
executing LRU draining from another process is safe.

Thanks to Lee Schermerhorn <lee.schermerhorn@hp.com> for finding this race
condition.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ac39d04d027c..86b111300231 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -65,6 +65,7 @@ extern int FASTCALL(schedule_work(struct work_struct *work));
 extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay));
 
 extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay);
+extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2bd5aee1c736..62d47220696a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -419,6 +419,25 @@ int schedule_delayed_work_on(int cpu,
 	return ret;
 }
 
+int schedule_on_each_cpu(void (*func) (void *info), void *info)
+{
+	int cpu;
+	struct work_struct *work;
+
+	work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
+
+	if (!work)
+		return -ENOMEM;
+	for_each_online_cpu(cpu) {
+		INIT_WORK(work + cpu, func, info);
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				work + cpu);
+	}
+	flush_workqueue(keventd_wq);
+	kfree(work);
+	return 0;
+}
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);
-- 
cgit v1.2.3


From 21eac81f252fe31c3cf64b805a1e8652192f3a3b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:45 -0800
Subject: [PATCH] Swap Migration V5: LRU operations

This is the start of the `swap migration' patch series.

Swap migration allows the moving of the physical location of pages between
nodes in a numa system while the process is running.  This means that the
virtual addresses that the process sees do not change.  However, the system
rearranges the physical location of those pages.

The main intent of page migration patches here is to reduce the latency of
memory access by moving pages near to the processor where the process
accessing that memory is running.

The patchset allows a process to manually relocate the node on which its
pages are located through the MF_MOVE and MF_MOVE_ALL options while
setting a new memory policy.

The pages of process can also be relocated from another process using the
sys_migrate_pages() function call.  Requires CAP_SYS_ADMIN.  The migrate_pages
function call takes two sets of nodes and moves pages of a process that are
located on the from nodes to the destination nodes.

Manual migration is very useful if for example the scheduler has relocated a
process to a processor on a distant node.  A batch scheduler or an
administrator can detect the situation and move the pages of the process
nearer to the new processor.

sys_migrate_pages() could be used on non-numa machines as well, to force all
of a particualr process's pages out to swap, if someone thinks that's useful.

Larger installations usually partition the system using cpusets into sections
of nodes.  Paul has equipped cpusets with the ability to move pages when a
task is moved to another cpuset.  This allows automatic control over locality
of a process.  If a task is moved to a new cpuset then also all its pages are
moved with it so that the performance of the process does not sink
dramatically (as is the case today).

Swap migration works by simply evicting the page.  The pages must be faulted
back in.  The pages are then typically reallocated by the system near the node
where the process is executing.

For swap migration the destination of the move is controlled by the allocation
policy.  Cpusets set the allocation policy before calling sys_migrate_pages()
in order to move the pages as intended.

No allocation policy changes are performed for sys_migrate_pages().  This
means that the pages may not faulted in to the specified nodes if no
allocation policy was set by other means.  The pages will just end up near the
node where the fault occurred.

There's another patch series in the pipeline which implements "direct
migration".

The direct migration patchset extends the migration functionality to avoid
going through swap.  The destination node of the relation is controllable
during the actual moving of pages.  The crutch of using the allocation policy
to relocate is not necessary and the pages are moved directly to the target.
Its also faster since swap is not used.

And sys_migrate_pages() can then move pages directly to the specified node.
Implement functions to isolate pages from the LRU and put them back later.

This patch:

An earlier implementation was provided by Hirokazu Takahashi
<taka@valinux.co.jp> and IWAMOTO Toshihiro <iwamoto@valinux.co.jp> for the
memory hotplug project.

From: Magnus

This breaks out isolate_lru_page() and putpack_lru_page().  Needed for swap
migration.

Signed-off-by: Magnus Damm <magnus.damm@gmail.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h |  22 ++++++++++
 include/linux/swap.h      |   3 ++
 mm/vmscan.c               | 100 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 112 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 47762ca695a5..49cc68af01f8 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -38,3 +38,25 @@ del_page_from_lru(struct zone *zone, struct page *page)
 		zone->nr_inactive--;
 	}
 }
+
+/*
+ * Isolate one page from the LRU lists.
+ *
+ * - zone->lru_lock must be held
+ */
+static inline int __isolate_lru_page(struct page *page)
+{
+	if (unlikely(!TestClearPageLRU(page)))
+		return 0;
+
+	if (get_page_testone(page)) {
+		/*
+		 * It is being freed elsewhere
+		 */
+		__put_page(page);
+		SetPageLRU(page);
+		return -ENOENT;
+	}
+
+	return 1;
+}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 556617bcf7ac..a49112536c02 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,6 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+extern int isolate_lru_page(struct page *p);
+extern int putback_lru_pages(struct list_head *l);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 428c5801d4b4..261a56ee11b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		if (!TestClearPageLRU(page))
-			BUG();
-		list_del(&page->lru);
-		if (get_page_testone(page)) {
-			/*
-			 * It is being freed elsewhere
-			 */
-			__put_page(page);
-			SetPageLRU(page);
-			list_add(&page->lru, src);
-			continue;
-		} else {
-			list_add(&page->lru, dst);
+		switch (__isolate_lru_page(page)) {
+		case 1:
+			/* Succeeded to isolate page */
+			list_move(&page->lru, dst);
 			nr_taken++;
+			break;
+		case -ENOENT:
+			/* Not possible to isolate */
+			list_move(&page->lru, src);
+			break;
+		default:
+			BUG();
 		}
 	}
 
@@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	return nr_taken;
 }
 
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list. Do necessary cache draining if the
+ * page is not on the LRU lists yet.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ * -ENOENT = page is being freed elsewhere.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int rc = 0;
+	struct zone *zone = page_zone(page);
+
+redo:
+	spin_lock_irq(&zone->lru_lock);
+	rc = __isolate_lru_page(page);
+	if (rc == 1) {
+		if (PageActive(page))
+			del_page_from_active_list(zone, page);
+		else
+			del_page_from_inactive_list(zone, page);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	if (rc == 0) {
+		/*
+		 * Maybe this page is still waiting for a cpu to drain it
+		 * from one of the lru lists?
+		 */
+		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+		if (rc == 0 && PageLRU(page))
+			goto redo;
+	}
+	return rc;
+}
+
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
@@ -679,6 +719,40 @@ done:
 	pagevec_release(&pvec);
 }
 
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
-- 
cgit v1.2.3


From 930d915252edda7042c944ed3c30194a2f9fe163 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:47 -0800
Subject: [PATCH] Swap Migration V5: PF_SWAPWRITE to allow writing to swap

Add PF_SWAPWRITE to control a processes permission to write to swap.

- Use PF_SWAPWRITE in may_write_to_queue() instead of checking for kswapd
  and pdflush

- Set PF_SWAPWRITE flag for kswapd and pdflush

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 +
 mm/pdflush.c          | 2 +-
 mm/vmscan.c           | 6 ++----
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7da33619d5d0..a74662077d60 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
 
 static int __pdflush(struct pdflush_work *my_work)
 {
-	current->flags |= PF_FLUSHER;
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
 	my_work->fn = NULL;
 	my_work->who = current;
 	INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 261a56ee11b6..6c30a8c59795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -268,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page)
 
 static int may_write_to_queue(struct backing_dev_info *bdi)
 {
-	if (current_is_kswapd())
-		return 1;
-	if (current_is_pdflush())	/* This is unlikely, but why not... */
+	if (current->flags & PF_SWAPWRITE)
 		return 1;
 	if (!bdi_write_congested(bdi))
 		return 1;
@@ -1299,7 +1297,7 @@ static int kswapd(void *p)
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 
 	order = 0;
 	for ( ; ; ) {
-- 
cgit v1.2.3


From 49d2e9cc4544369635cd6f4ef6d5bb0f757079a7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:48 -0800
Subject: [PATCH] Swap Migration V5: migrate_pages() function

This adds the basic page migration function with a minimal implementation that
only allows the eviction of pages to swap space.

Page eviction and migration may be useful to migrate pages, to suspend
programs or for remapping single pages (useful for faulty pages or pages with
soft ECC failures)

The process is as follows:

The function wanting to migrate pages must first build a list of pages to be
migrated or evicted and take them off the lru lists via isolate_lru_page().
isolate_lru_page determines that a page is freeable based on the LRU bit set.

Then the actual migration or swapout can happen by calling migrate_pages().

migrate_pages does its best to migrate or swapout the pages and does multiple
passes over the list.  Some pages may only be swappable if they are not dirty.
 migrate_pages may start writing out dirty pages in the initial passes over
the pages.  However, migrate_pages may not be able to migrate or evict all
pages for a variety of reasons.

The remaining pages may be returned to the LRU lists using putback_lru_pages().

Changelog V4->V5:
- Use the lru caches to return pages to the LRU

Changelog V3->V4:
- Restructure code so that applying patches to support full migration does
  require minimal changes. Rename swapout_pages() to migrate_pages().

Changelog V2->V3:
- Extract common code from shrink_list() and swapout_pages()

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: "Michael Kerrisk" <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |   2 +
 mm/vmscan.c          | 214 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 182 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a49112536c02..893096e67bdb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,6 +178,8 @@ extern int vm_swappiness;
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 
+extern int migrate_pages(struct list_head *l, struct list_head *t);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6c30a8c59795..a537a7f16357 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -373,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (!mapping)
+		return 0;		/* truncate got there first */
+
+	write_lock_irq(&mapping->tree_lock);
+
+	/*
+	 * The non-racy check for busy page.  It is critical to check
+	 * PageDirty _after_ making sure that the page is freeable and
+	 * not in use by anybody. 	(pagecache + us == 2)
+	 */
+	if (unlikely(page_count(page) != 2))
+		goto cannot_free;
+	smp_rmb();
+	if (unlikely(PageDirty(page)))
+		goto cannot_free;
+
+	if (PageSwapCache(page)) {
+		swp_entry_t swap = { .val = page_private(page) };
+		__delete_from_swap_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
+		swap_free(swap);
+		__put_page(page);	/* The pagecache ref */
+		return 1;
+	}
+
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	__put_page(page);
+	return 1;
+
+cannot_free:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
@@ -504,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 				goto free_it;
 		}
 
-		if (!mapping)
-			goto keep_locked;	/* truncate got there first */
-
-		write_lock_irq(&mapping->tree_lock);
-
-		/*
-		 * The non-racy check for busy page.  It is critical to check
-		 * PageDirty _after_ making sure that the page is freeable and
-		 * not in use by anybody. 	(pagecache + us == 2)
-		 */
-		if (unlikely(page_count(page) != 2))
-			goto cannot_free;
-		smp_rmb();
-		if (unlikely(PageDirty(page)))
-			goto cannot_free;
-
-#ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page_private(page) };
-			__delete_from_swap_cache(page);
-			write_unlock_irq(&mapping->tree_lock);
-			swap_free(swap);
-			__put_page(page);	/* The pagecache ref */
-			goto free_it;
-		}
-#endif /* CONFIG_SWAP */
-
-		__remove_from_page_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
-		__put_page(page);
+		if (!remove_mapping(mapping, page))
+			goto keep_locked;
 
 free_it:
 		unlock_page(page);
@@ -542,10 +551,6 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
-cannot_free:
-		write_unlock_irq(&mapping->tree_lock);
-		goto keep_locked;
-
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;
@@ -563,6 +568,147 @@ keep:
 	return reclaimed;
 }
 
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ *
+ * return codes:
+ *	0 = complete
+ *	1 = retry
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return 1;
+}
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because t has become empty
+ * or no retryable pages exist anymore.
+ *
+ * SIMPLIFIED VERSION: This implementation of migrate_pages
+ * is only swapping out pages and never touches the second
+ * list. The direct migration patchset
+ * extends this function to avoid the use of swap.
+ */
+int migrate_pages(struct list_head *l, struct list_head *t)
+{
+	int retry;
+	LIST_HEAD(failed);
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		cond_resched();
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we use
+		 * lock_page to have a higher chance of acquiring the lock.
+		 */
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto retry_later;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0)
+			wait_on_page_writeback(page);
+		else
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				goto retry_later;
+			}
+
+#ifdef CONFIG_SWAP
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page)) {
+				unlock_page(page);
+				list_move(&page->lru, &failed);
+				nr_failed++;
+				continue;
+			}
+		}
+#endif /* CONFIG_SWAP */
+
+		/*
+		 * Page is properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		if (swap_page(page)) {
+retry_later:
+			retry++;
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	if (!list_empty(&failed))
+		list_splice(&failed, l);
+
+	return nr_failed + retry;
+}
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
-- 
cgit v1.2.3


From 7cbe34cf86c673503b177ff47cfa2c7030dabb50 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sun, 8 Jan 2006 01:00:49 -0800
Subject: [PATCH] Swap Migration V5: Add CONFIG_MIGRATION for page migration
 support

Include page migration if the system is NUMA or having a memory model that
allows distinct areas of memory (SPARSEMEM, DISCONTIGMEM).

And:
- Only include lru_add_drain_per_cpu if building for an SMP system.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  2 ++
 mm/Kconfig           |  7 +++++++
 mm/vmscan.c          | 20 +++++++++++---------
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 893096e67bdb..117add066f00 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,7 +178,9 @@ extern int vm_swappiness;
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 
+#ifdef CONFIG_MIGRATION
 extern int migrate_pages(struct list_head *l, struct list_head *t);
+#endif
 
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
diff --git a/mm/Kconfig b/mm/Kconfig
index b3db11f137e0..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
 	default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+	depends on SWAP
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a537a7f16357..58270aea669a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -568,6 +568,7 @@ keep:
 	return reclaimed;
 }
 
+#ifdef CONFIG_MIGRATION
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -656,8 +657,9 @@ redo:
 
 		/*
 		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we use
-		 * lock_page to have a higher chance of acquiring the lock.
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
 		 */
 		if (pass > 2)
 			lock_page(page);
@@ -669,15 +671,15 @@ redo:
 		 * Only wait on writeback if we have already done a pass where
 		 * we we may have triggered writeouts for lots of pages.
 		 */
-		if (pass > 0)
+		if (pass > 0) {
 			wait_on_page_writeback(page);
-		else
+		} else {
 			if (PageWriteback(page)) {
 				unlock_page(page);
 				goto retry_later;
 			}
+		}
 
-#ifdef CONFIG_SWAP
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!add_to_swap(page)) {
 				unlock_page(page);
@@ -686,16 +688,15 @@ redo:
 				continue;
 			}
 		}
-#endif /* CONFIG_SWAP */
 
 		/*
 		 * Page is properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		if (swap_page(page)) {
+		if (!swap_page(page))
+			continue;
 retry_later:
-			retry++;
-		}
+		retry++;
 	}
 	if (retry && pass++ < 10)
 		goto redo;
@@ -708,6 +709,7 @@ retry_later:
 
 	return nr_failed + retry;
 }
+#endif
 
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
-- 
cgit v1.2.3


From dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:50 -0800
Subject: [PATCH] Swap Migration V5: MPOL_MF_MOVE interface

Add page migration support via swap to the NUMA policy layer

This patch adds page migration support to the NUMA policy layer.  An
additional flag MPOL_MF_MOVE is introduced for mbind.  If MPOL_MF_MOVE is
specified then pages that do not conform to the memory policy will be evicted
from memory.  When they get pages back in new pages will be allocated
following the numa policy.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |   3 +
 mm/mempolicy.c            | 155 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 138 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ed00b278cb93..05443a766cb8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -22,6 +22,9 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
+#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
 
 #ifdef __KERNEL__
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..9cc6d962831d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,14 @@
 #include <linux/init.h>
 #include <linux/compat.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+/* Internal MPOL_MF_xxx flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -174,9 +179,59 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
+/* Check if we are the only process mapping the page in question */
+static inline int single_mm_mapping(struct mm_struct *mm,
+			struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int rc = 1;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+		if (mm != vma->vm_mm) {
+			rc = 0;
+			goto out;
+		}
+out:
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+/*
+ * Add a page to be migrated to the pagelist
+ */
+static void migrate_page_add(struct vm_area_struct *vma,
+	struct page *page, struct list_head *pagelist, unsigned long flags)
+{
+	/*
+	 * Avoid migrating a page that is shared by others and not writable.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
+	    mapping_writably_mapped(page->mapping) ||
+	    single_mm_mapping(vma->vm_mm, page->mapping)) {
+		int rc = isolate_lru_page(page);
+
+		if (rc == 1)
+			list_add(&page->lru, pagelist);
+		/*
+		 * If the isolate attempt was not successful then we just
+		 * encountered an unswappable page. Something must be wrong.
+	 	 */
+		WARN_ON(rc == 0);
+	}
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
@@ -193,15 +248,21 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		nid = page_to_nid(page);
-		if (!node_isset(nid, *nodes))
-			break;
+		if (!node_isset(nid, *nodes)) {
+			if (pagelist)
+				migrate_page_add(vma, page, pagelist, flags);
+			else
+				break;
+		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -211,14 +272,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,14 +292,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, nodemask_t *nodes)
+		unsigned long addr, unsigned long end,
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -245,16 +312,31 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes,
+				    flags, pagelist))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (
+		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+		return 0;
+	return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    nodemask_t *nodes, unsigned long flags)
+		const nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -264,17 +346,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return ERR_PTR(-EFAULT);
-		if (prev && prev->vm_end < vma->vm_start)
-			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+			if (!vma->vm_next && vma->vm_end < end)
+				return ERR_PTR(-EFAULT);
+			if (prev && prev->vm_end < vma->vm_start)
+				return ERR_PTR(-EFAULT);
+		}
+		if (!is_vm_hugetlb_page(vma) &&
+		    ((flags & MPOL_MF_STRICT) ||
+		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+				vma_migratable(vma)))) {
 			unsigned long endvma = vma->vm_end;
+
 			if (endvma > end)
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma, start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes,
+						flags, pagelist);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
@@ -348,33 +437,59 @@ long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	LIST_HEAD(pagelist);
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+	    || mode > MPOL_MAX)
 		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
+
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
+
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
+
 	if (end < start)
 		return -EINVAL;
 	if (end == start)
 		return 0;
+
 	if (mpol_check_policy(mode, nmask))
 		return -EINVAL;
+
 	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 			mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nmask, flags);
+	vma = check_range(mm, start, end, nmask, flags,
+	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
-	if (!IS_ERR(vma))
+	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
+		if (!list_empty(&pagelist))
+			migrate_pages(&pagelist, NULL);
+		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			err = -EIO;
+	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);
+
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
-- 
cgit v1.2.3


From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:51 -0800
Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface

sys_migrate_pages implementation using swap based page migration

This is the original API proposed by Ray Bryant in his posts during the first
half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org.

The intent of sys_migrate is to migrate memory of a process.  A process may
have migrated to another node.  Memory was allocated optimally for the prior
context.  sys_migrate_pages allows to shift the memory to the new node.

sys_migrate_pages is also useful if the processes available memory nodes have
changed through cpuset operations to manually move the processes memory.  Paul
Jackson is working on an automated mechanism that will allow an automatic
migration if the cpuset of a process is changed.  However, a user may decide
to manually control the migration.

This implementation is put into the policy layer since it uses concepts and
functions that are also needed for mbind and friends.  The patch also provides
a do_migrate_pages function that may be useful for cpusets to automatically
move memory.  sys_migrate_pages does not modify policies in contrast to Ray's
implementation.

The current code here is based on the swap based page migration capability and
thus is not able to preserve the physical layout relative to it containing
nodeset (which may be a cpuset).  When direct page migration becomes available
then the implementation needs to be changed to do a isomorphic move of pages
between different nodesets.  The current implementation simply evicts all
pages in source nodeset that are not in the target nodeset.

Patch supports ia64, i386 and x86_64.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |  1 +
 arch/ia64/kernel/entry.S         |  1 +
 arch/x86_64/ia32/ia32entry.S     |  1 +
 include/asm-i386/unistd.h        |  3 +-
 include/asm-ia64/unistd.h        |  3 +-
 include/asm-x86_64/ia32_unistd.h |  3 +-
 include/asm-x86_64/unistd.h      |  4 +-
 include/linux/mempolicy.h        |  3 ++
 include/linux/syscalls.h         |  2 +
 kernel/sys_ni.c                  |  1 +
 mm/mempolicy.c                   | 94 +++++++++++++++++++++++++++++++++++++++-
 11 files changed, 111 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index f7ba4acc20ec..6ff3e5243226 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -293,3 +293,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init
 	.long sys_inotify_add_watch
 	.long sys_inotify_rm_watch
+	.long sys_migrate_pages
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0741b066b98f..7a6ffd613789 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1600,5 +1600,6 @@ sys_call_table:
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
+	data8 sys_migrate_pages			// 1280
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index df0773c9bdbe..1f0ff5adc80e 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,7 @@ ia32_sys_call_table:
 	.quad sys_inotify_init
 	.quad sys_inotify_add_watch
 	.quad sys_inotify_rm_watch
+	.quad sys_migrate_pages
 ia32_syscall_end:		
 	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
 		.quad ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fe38b9a96233..481c3c0ea720 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,9 @@
 #define __NR_inotify_init	291
 #define __NR_inotify_add_watch	292
 #define __NR_inotify_rm_watch	293
+#define __NR_migrate_pages	294
 
-#define NR_syscalls 294
+#define NR_syscalls 295
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 2bf543493cb8..962f9bd1bdff 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -269,12 +269,13 @@
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
+#define __NR_migrate_pages		1280
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			256 /* length of syscall table */
+#define NR_syscalls			270 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec3868d..e8843362a6cc 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,8 @@
 #define __NR_ia32_inotify_init		291
 #define __NR_ia32_inotify_add_watch	292
 #define __NR_ia32_inotify_rm_watch	293
+#define __NR_ia32_migrate_pages		294
 
-#define IA32_NR_syscalls 294	/* must be > than biggest syscall! */
+#define IA32_NR_syscalls 295	/* must be > than biggest syscall! */
 
 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150bce0c..e6f896161c11 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init)
 __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
 #define __NR_inotify_rm_watch	255
 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_migrate_pages	256
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
 
-#define __NR_syscall_max __NR_inotify_rm_watch
+#define __NR_syscall_max __NR_migrate_pages
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05443a766cb8..3e61e829681d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -162,6 +162,9 @@ static inline void check_highest_zone(int k)
 		policy_zone = k;
 }
 
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
+
 #else
 
 struct mempolicy {};
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1db91d..e910d1a481df 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 					unsigned long maxnode);
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+			const unsigned long __user *from, const unsigned long __user *to);
 
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..7a8bc7f60d91 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
+cond_syscall(sys_migrate_pages);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9cc6d962831d..20d5ad39fa41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	nodes_complement(nodes, nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	if (!list_empty(&pagelist)) {
+		migrate_pages(&pagelist, NULL);
+		if (!list_empty(&pagelist))
+			count = putback_lru_pages(&pagelist);
+	}
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */
 
 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
+/* Macro needed until Paul implements this function in kernel/cpusets.c */
+#define cpuset_mems_allowed(task) node_online_map
+
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser priviledges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 				unsigned long __user *nmask,
-- 
cgit v1.2.3


From 8419c3181086c86664e8246bc997afc2e4ffba4f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:52 -0800
Subject: [PATCH] SwapMig: CONFIG_MIGRATION fixes

Move move_to_lru, putback_lru_pages and isolate_lru in section surrounded by
CONFIG_MIGRATION saving some codesize for single processor kernels.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |   3 +-
 mm/vmscan.c          | 152 +++++++++++++++++++++++++--------------------------
 2 files changed, 77 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 117add066f00..997d838f0e70 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,10 +175,9 @@ extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
+#ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
-
-#ifdef CONFIG_MIGRATION
 extern int migrate_pages(struct list_head *l, struct list_head *t);
 #endif
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 58270aea669a..daed4a73b761 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -569,6 +569,40 @@ keep:
 }
 
 #ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -709,6 +743,48 @@ retry_later:
 
 	return nr_failed + retry;
 }
+
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list. Do necessary cache draining if the
+ * page is not on the LRU lists yet.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ * -ENOENT = page is being freed elsewhere.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int rc = 0;
+	struct zone *zone = page_zone(page);
+
+redo:
+	spin_lock_irq(&zone->lru_lock);
+	rc = __isolate_lru_page(page);
+	if (rc == 1) {
+		if (PageActive(page))
+			del_page_from_active_list(zone, page);
+		else
+			del_page_from_inactive_list(zone, page);
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	if (rc == 0) {
+		/*
+		 * Maybe this page is still waiting for a cpu to drain it
+		 * from one of the lru lists?
+		 */
+		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+		if (rc == 0 && PageLRU(page))
+			goto redo;
+	}
+	return rc;
+}
 #endif
 
 /*
@@ -758,48 +834,6 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	return nr_taken;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
- */
-int isolate_lru_page(struct page *page)
-{
-	int rc = 0;
-	struct zone *zone = page_zone(page);
-
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
-	}
-	return rc;
-}
-
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
@@ -865,40 +899,6 @@ done:
 	pagevec_release(&pvec);
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	list_del(&page->lru);
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU
- *
- * returns the number of pages put back.
- */
-int putback_lru_pages(struct list_head *l)
-{
-	struct page *page;
-	struct page *page2;
-	int count = 0;
-
-	list_for_each_entry_safe(page, page2, l, lru) {
-		move_to_lru(page);
-		count++;
-	}
-	return count;
-}
-
 /*
  * This moves pages from the active list to the inactive list.
  *
-- 
cgit v1.2.3


From 1480a540c98525640174a7eadd712378fcd6fd63 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:53 -0800
Subject: [PATCH] SwapMig: add_to_swap() avoid atomic allocations

Add gfp_mask to add_to_swap

add_to_swap does allocations with GFP_ATOMIC in order not to interfere with
swapping.  During migration we may have use add_to_swap extensively which may
lead to out of memory errors.

This patch makes add_to_swap take a parameter that specifies the gfp mask.
The page migration code can then make add_to_swap use GFP_KERNEL.

Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 2 +-
 mm/swap_state.c      | 4 ++--
 mm/vmscan.c          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 997d838f0e70..eb591eaad1b7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -198,7 +198,7 @@ extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
 extern struct address_space swapper_space;
 #define total_swapcache_pages  swapper_space.nrpages
 extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *);
+extern int add_to_swap(struct page *, gfp_t);
 extern void __delete_from_swap_cache(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern int move_to_swap_cache(struct page *, swp_entry_t);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fc2aecb70a95..7b09ac503fec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
  * Allocate swap space for the page and add the page to the
  * swap cache.  Caller needs to hold the page lock. 
  */
-int add_to_swap(struct page * page)
+int add_to_swap(struct page * page, gfp_t gfp_mask)
 {
 	swp_entry_t entry;
 	int err;
@@ -166,7 +166,7 @@ int add_to_swap(struct page * page)
 		 * Add it to the swap cache and mark it dirty
 		 */
 		err = __add_to_swap_cache(page, entry,
-				GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
+				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
 		switch (err) {
 		case 0:				/* Success */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index daed4a73b761..5393b093a87b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -458,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page))
+			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
 #endif /* CONFIG_SWAP */
@@ -715,7 +715,7 @@ redo:
 		}
 
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
 				unlock_page(page);
 				list_move(&page->lru, &failed);
 				nr_failed++;
-- 
cgit v1.2.3


From d498471133ff1f9586a06820beaeebc575fe2814 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:00:55 -0800
Subject: [PATCH] SwapMig: Extend parameters for migrate_pages()

Extend the parameters of migrate_pages() to allow the caller control over the
fate of successfully migrated or impossible to migrate pages.

Swap migration and direct migration will have the same interface after this
patch so that patches can be independently applied to the policy layer and the
core migration code.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  3 ++-
 mm/mempolicy.c       | 27 ++++++++++++++++++++++-----
 mm/vmscan.c          | 17 ++++++++---------
 3 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index eb591eaad1b7..389d1c382e20 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,7 +178,8 @@ extern int vm_swappiness;
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
-extern int migrate_pages(struct list_head *l, struct list_head *t);
+extern int migrate_pages(struct list_head *l, struct list_head *t,
+		struct list_head *moved, struct list_head *failed);
 #endif
 
 #ifdef CONFIG_MMU
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 20d5ad39fa41..30bdafba52d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -429,6 +429,19 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+static int swap_pages(struct list_head *pagelist)
+{
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int n;
+
+	n = migrate_pages(pagelist, NULL, &moved, &failed);
+	putback_lru_pages(&failed);
+	putback_lru_pages(&moved);
+
+	return n;
+}
+
 long do_mbind(unsigned long start, unsigned long len,
 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -481,10 +494,13 @@ long do_mbind(unsigned long start, unsigned long len,
 	      (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma)) {
+		int nr_failed = 0;
+
 		err = mbind_range(vma, start, end, new);
 		if (!list_empty(&pagelist))
-			migrate_pages(&pagelist, NULL);
-		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
+			nr_failed = swap_pages(&pagelist);
+
+		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
 	if (!list_empty(&pagelist))
@@ -635,11 +651,12 @@ int do_migrate_pages(struct mm_struct *mm,
 	down_read(&mm->mmap_sem);
 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
 	if (!list_empty(&pagelist)) {
-		migrate_pages(&pagelist, NULL);
-		if (!list_empty(&pagelist))
-			count = putback_lru_pages(&pagelist);
+		count = swap_pages(&pagelist);
+		putback_lru_pages(&pagelist);
 	}
+
 	up_read(&mm->mmap_sem);
 	return count;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 73ba4046ed27..5eecb514ccea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -670,10 +670,10 @@ retry:
  * list. The direct migration patchset
  * extends this function to avoid the use of swap.
  */
-int migrate_pages(struct list_head *l, struct list_head *t)
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
 {
 	int retry;
-	LIST_HEAD(failed);
 	int nr_failed = 0;
 	int pass = 0;
 	struct page *page;
@@ -686,12 +686,12 @@ int migrate_pages(struct list_head *l, struct list_head *t)
 redo:
 	retry = 0;
 
-	list_for_each_entry_safe(page, page2, l, lru) {
+	list_for_each_entry_safe(page, page2, from, lru) {
 		cond_resched();
 
 		if (page_count(page) == 1) {
 			/* page was freed from under us. So we are done. */
-			move_to_lru(page);
+			list_move(&page->lru, moved);
 			continue;
 		}
 		/*
@@ -722,7 +722,7 @@ redo:
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!add_to_swap(page, GFP_KERNEL)) {
 				unlock_page(page);
-				list_move(&page->lru, &failed);
+				list_move(&page->lru, failed);
 				nr_failed++;
 				continue;
 			}
@@ -732,8 +732,10 @@ redo:
 		 * Page is properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		if (!swap_page(page))
+		if (!swap_page(page)) {
+			list_move(&page->lru, moved);
 			continue;
+		}
 retry_later:
 		retry++;
 	}
@@ -743,9 +745,6 @@ retry_later:
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
-	if (!list_empty(&failed))
-		list_splice(&failed, l);
-
 	return nr_failed + retry;
 }
 
-- 
cgit v1.2.3


From 45b07ef31d1182d2cfde7711327e3afb268bb1ac Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:00:56 -0800
Subject: [PATCH] cpusets: swap migration interface

Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.

It defaults to false (file contains "0").  It can be set true by writing
"1" to the file.

If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.

Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 25 +++++++++++++++++++++++++
 include/linux/mempolicy.h |  7 +++++++
 kernel/cpuset.c           | 38 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index a09a8eb80665..e2d9afc30d2d 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -192,6 +192,7 @@ containing the following files describing that cpuset:
 
  - cpus: list of CPUs in that cpuset
  - mems: list of Memory Nodes in that cpuset
+ - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
  - tasks: list of tasks (by pid) attached to that cpuset
@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset.  This is done to avoid
 impacting the scheduler code in the kernel with a check for changes
 in a tasks processor placement.
 
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'mems' subsequently changes.
+If the cpuset flag file 'memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the tasks new cpuset.  Depending on the implementation,
+this migration may either be done by swapping the page out,
+so that the next time the page is referenced, it will be paged
+into the tasks new cpuset, usually on the node where it was
+referenced, or this migration may be done by directly copying
+the pages from the tasks previous cpuset to the new cpuset,
+where possible to the same node, relative to the new cpuset,
+as the node that held the page, relative to the old cpuset.
+Also if 'memory_migrate' is set true, then if that cpusets
+'mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'mems',
+will be moved to nodes in the new setting of 'mems.'  Again,
+depending on the implementation, this might be done by swapping,
+or by direct copying.  In either case, pages that were not in
+the tasks prior cpuset, or in the cpusets prior 'mems' setting,
+will not be moved.
+
 There is an exception to the above.  If hotplug functionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
 then the kernel will automatically update the cpus_allowed of all
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3e61e829681d..66247eff24a0 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 }
 
+static inline int do_migrate_pages(struct mm_struct *mm,
+			const nodemask_t *from_nodes,
+			const nodemask_t *to_nodes, int flags)
+{
+	return 0;
+}
+
 static inline void check_highest_zone(int k)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..f63383e01ec7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,7 @@ struct cpuset {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
 	CS_NOTIFY_ON_RELEASE
 } cpuset_flagbits_t;
@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs)
 	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -602,16 +608,24 @@ static void refresh_mems(void)
 	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
 		struct cpuset *cs;
 		nodemask_t oldmem = current->mems_allowed;
+		int migrate;
 
 		down(&callback_sem);
 		task_lock(current);
 		cs = current->cpuset;
+		migrate = is_memory_migrate(cs);
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(current);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed))
+		if (!nodes_equal(oldmem, current->mems_allowed)) {
 			numa_policy_rebind(&oldmem, &current->mems_allowed);
+			if (migrate) {
+				do_migrate_pages(current->mm, &oldmem,
+					&current->mems_allowed,
+					MPOL_MF_MOVE_ALL);
+			}
+		}
 	}
 }
 
@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *						CS_NOTIFY_ON_RELEASE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
+	nodemask_t from, to;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	from = oldcs->mems_allowed;
+	to = cs->mems_allowed;
+
 	up(&callback_sem);
+	if (is_memory_migrate(cs))
+		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
 		check_for_release(oldcs, ppathbuf);
@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 typedef enum {
 	FILE_ROOT,
 	FILE_DIR,
+	FILE_MEMORY_MIGRATE,
 	FILE_CPULIST,
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_NOTIFY_ON_RELEASE:
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
+	case FILE_MEMORY_MIGRATE:
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_NOTIFY_ON_RELEASE:
 		*s++ = notify_on_release(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_MIGRATE:
+		*s++ = is_memory_migrate(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = {
 	.private = FILE_NOTIFY_ON_RELEASE,
 };
 
+static struct cftype cft_memory_migrate = {
+	.name = "memory_migrate",
+	.private = FILE_MEMORY_MIGRATE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
-- 
cgit v1.2.3


From 152194aaa6266d71dfee57882a23def339ef17a4 Mon Sep 17 00:00:00 2001
From: Avishay Traeger <atraeger@cs.sunysb.edu>
Date: Sun, 8 Jan 2006 01:00:58 -0800
Subject: [PATCH] set_page_count() macro safety

Fix set_page_count() macro to handle complex arguments.

Signed-off-by: Avishay Traeger <atraeger@cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 83c651f25188..7ff54242c5d7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -308,7 +308,7 @@ struct page {
  */
 #define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, v - 1)
+#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
-- 
cgit v1.2.3


From 48fce3429df84a94766fbbc845fa8450d0715b48 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 8 Jan 2006 01:01:03 -0800
Subject: [PATCH] mempolicies: unexport get_vma_policy()

Since the numa_maps functionality is now in mempolicy.c we no longer need to
export get_vma_policy().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 3 ---
 mm/mempolicy.c            | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 66247eff24a0..05fddd5bee5d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -144,9 +144,6 @@ void mpol_free_shared_policy(struct shared_policy *p);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
-struct mempolicy *get_vma_policy(struct task_struct *task,
-			struct vm_area_struct *vma, unsigned long addr);
-
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4c0510e9e7f6..4b077ec6c005 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -935,8 +935,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
 
-- 
cgit v1.2.3


From 22fc6eccbf4ce4eb6265e6ada7b50a7b9cc57d05 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Sun, 8 Jan 2006 01:01:27 -0800
Subject: [PATCH] Change maxaligned_in_smp alignemnt macros to
 internodealigned_in_smp macros

____cacheline_maxaligned_in_smp is currently used to align critical structures
and avoid false sharing.  It uses per-arch L1_CACHE_SHIFT_MAX and people find
L1_CACHE_SHIFT_MAX useless.

However, we have been using ____cacheline_maxaligned_in_smp to align
structures on the internode cacheline size.  As per Andi's suggestion,
following patch kills ____cacheline_maxaligned_in_smp and introduces
INTERNODE_CACHE_SHIFT, which defaults to L1_CACHE_SHIFT for all arches.
Arches needing L3/Internode cacheline alignment can define
INTERNODE_CACHE_SHIFT in the arch asm/cache.h.  Patch replaces
____cacheline_maxaligned_in_smp with ____cacheline_internodealigned_in_smp

With this patch, L1_CACHE_SHIFT_MAX can be killed

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/init_task.c   |  2 +-
 arch/i386/kernel/irq.c         |  2 +-
 arch/x86_64/kernel/init_task.c |  2 +-
 include/linux/cache.h          | 17 +++++++++++++----
 include/linux/ide.h            |  2 +-
 include/linux/mmzone.h         |  4 ++--
 include/linux/rcupdate.h       |  2 +-
 kernel/rcupdate.c              |  4 ++--
 mm/sparse.c                    |  4 ++--
 9 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c
index 9caa8e8db80c..cff95d10a4d8 100644
--- a/arch/i386/kernel/init_task.c
+++ b/arch/i386/kernel/init_task.c
@@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task);
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 1a201a932865..f3a9c78c4a24 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -19,7 +19,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 
-DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
 #ifndef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index e0ba5c1043fd..ce31d904d601 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task);
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */ 
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
 
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 0b7ecf3af78a..ffe52210fc4f 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -45,12 +45,21 @@
 #endif /* CONFIG_SMP */
 #endif
 
-#if !defined(____cacheline_maxaligned_in_smp)
+/*
+ * The maximum alignment needed for some critical structures
+ * These could be inter-node cacheline sizes/L3 cacheline
+ * size etc.  Define this in asm/cache.h for your arch
+ */
+#ifndef INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_SHIFT L1_CACHE_SHIFT
+#endif
+
+#if !defined(____cacheline_internodealigned_in_smp)
 #if defined(CONFIG_SMP)
-#define ____cacheline_maxaligned_in_smp \
-	__attribute__((__aligned__(1 << (L1_CACHE_SHIFT_MAX))))
+#define ____cacheline_internodealigned_in_smp \
+	__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))
 #else
-#define ____cacheline_maxaligned_in_smp
+#define ____cacheline_internodealigned_in_smp
 #endif
 #endif
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 7b6a6a58e465..4dd6694963c0 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -801,7 +801,7 @@ typedef struct hwif_s {
 	unsigned dma;
 
 	void (*led_act)(void *data, int rw);
-} ____cacheline_maxaligned_in_smp ide_hwif_t;
+} ____cacheline_internodealigned_in_smp ide_hwif_t;
 
 /*
  *  internal ide interrupt handler type
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2a89c132ba9c..7e4ae6ab1977 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,7 +38,7 @@ struct pglist_data;
 #if defined(CONFIG_SMP)
 struct zone_padding {
 	char x[0];
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 #define ZONE_PADDING(name)	struct zone_padding name;
 #else
 #define ZONE_PADDING(name)
@@ -233,7 +233,7 @@ struct zone {
 	 * rarely used fields:
 	 */
 	char			*name;
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 
 
 /*
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a471f3bb713e..51747cd88d1a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -65,7 +65,7 @@ struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
 	int	next_pending;	/* Is the next batch already waiting?         */
-} ____cacheline_maxaligned_in_smp;
+} ____cacheline_internodealigned_in_smp;
 
 /* Is batch a before batch b ? */
 static inline int rcu_batch_before(long a, long b)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48d3bce465b8..c9afc61240e4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -61,9 +61,9 @@ struct rcu_state {
 	                              /* for current batch to proceed.        */
 };
 
-static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
-static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2d..0a51f36ba3a1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
  */
 #ifdef CONFIG_SPARSEMEM_EXTREME
 struct mem_section *mem_section[NR_SECTION_ROOTS]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #else
 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
-	____cacheline_maxaligned_in_smp;
+	____cacheline_internodealigned_in_smp;
 #endif
 EXPORT_SYMBOL(mem_section);
 
-- 
cgit v1.2.3


From e56d090310d7625ecb43a1eeebd479f04affb48b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 8 Jan 2006 01:01:37 -0800
Subject: [PATCH] RCU signal handling

RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked
instead of tasklist_lock read-locked.  This is a scalability improvement on
SMP and a preemption-latency improvement under PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: William Irwin <wli@holomorphy.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  4 +--
 include/linux/sched.h | 32 +++++++++++++++--
 kernel/exit.c         |  1 -
 kernel/fork.c         | 10 +++++-
 kernel/pid.c          | 22 ++++++------
 kernel/rcupdate.c     |  1 +
 kernel/sched.c        |  7 ++++
 kernel/signal.c       | 97 +++++++++++++++++++++++++++++++++++++++++++--------
 8 files changed, 143 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index e75a9548da8e..e9650cd22a3b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -760,7 +760,7 @@ no_thread_group:
 		spin_lock(&oldsighand->siglock);
 		spin_lock(&newsighand->siglock);
 
-		current->sighand = newsighand;
+		rcu_assign_pointer(current->sighand, newsighand);
 		recalc_sigpending();
 
 		spin_unlock(&newsighand->siglock);
@@ -768,7 +768,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			kmem_cache_free(sighand_cachep, oldsighand);
+			sighand_free(oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a74662077d60..a6af77e9b4cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/seccomp.h>
+#include <linux/rcupdate.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -350,8 +351,16 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	struct rcu_head		rcu;
 };
 
+extern void sighand_free_cb(struct rcu_head *rhp);
+
+static inline void sighand_free(struct sighand_struct *sp)
+{
+	call_rcu(&sp->rcu, sighand_free_cb);
+}
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -844,6 +853,7 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+	struct rcu_head rcu;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -867,8 +877,26 @@ static inline int pid_alive(struct task_struct *p)
 extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+
+static inline int get_task_struct_rcu(struct task_struct *t)
+{
+	int oldusage;
+
+	do {
+		oldusage = atomic_read(&t->usage);
+		if (oldusage == 0)
+			return 0;
+	} while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage);
+	return 1;
+}
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (atomic_dec_and_test(&t->usage))
+		call_rcu(&t->rcu, __put_task_struct_cb);
+}
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index ee515683b92d..c73a7eb26de3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,7 +72,6 @@ repeat:
 		__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	__exit_sighand(p);
 	/*
 	 * Note that the fastpath in sys_times depends on __exit_signal having
 	 * updated the counters before a task is removed from the tasklist of
diff --git a/kernel/fork.c b/kernel/fork.c
index fb8572a42297..7fe3adfa65cb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -743,6 +743,14 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
+void sighand_free_cb(struct rcu_head *rhp)
+{
+	struct sighand_struct *sp;
+
+	sp = container_of(rhp, struct sighand_struct, rcu);
+	kmem_cache_free(sighand_cachep, sp);
+}
+
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 		return 0;
 	}
 	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
-	tsk->sighand = sig;
+	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
 	spin_lock_init(&sig->siglock);
diff --git a/kernel/pid.c b/kernel/pid.c
index edba31c681ac..1acc07246991 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
 	struct hlist_node *elem;
 	struct pid *pid;
 
-	hlist_for_each_entry(pid, elem,
+	hlist_for_each_entry_rcu(pid, elem,
 			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
 
 	task_pid = &task->pids[type];
 	pid = find_pid(type, nr);
+	task_pid->nr = nr;
 	if (pid == NULL) {
-		hlist_add_head(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
 		INIT_LIST_HEAD(&task_pid->pid_list);
+		hlist_add_head_rcu(&task_pid->pid_chain,
+				   &pid_hash[type][pid_hashfn(nr)]);
 	} else {
 		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
 	}
-	task_pid->nr = nr;
 
 	return 0;
 }
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type)
 
 	pid = &task->pids[type];
 	if (!hlist_unhashed(&pid->pid_chain)) {
-		hlist_del(&pid->pid_chain);
 
-		if (list_empty(&pid->pid_list))
+		if (list_empty(&pid->pid_list)) {
 			nr = pid->nr;
-		else {
+			hlist_del_rcu(&pid->pid_chain);
+		} else {
 			pid_next = list_entry(pid->pid_list.next,
 						struct pid, pid_list);
 			/* insert next pid from pid_list to hash */
-			hlist_add_head(&pid_next->pid_chain,
-				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+			hlist_replace_rcu(&pid->pid_chain,
+					  &pid_next->pid_chain);
 		}
 	}
 
-	list_del(&pid->pid_list);
+	list_del_rcu(&pid->pid_list);
 	pid->nr = 0;
 
 	return nr;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c9afc61240e4..0a669bd2f6d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94cc29e..92733091154c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
diff --git a/kernel/signal.c b/kernel/signal.c
index d7611f189ef7..64737c72dadd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
+		sighand_free(sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
 {
 	write_lock_irq(&tasklist_lock);
-	__exit_sighand(tsk);
+	rcu_read_lock();
+	if (tsk->sighand != NULL) {
+		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
+		spin_lock(&sighand->siglock);
+		__exit_sighand(tsk);
+		spin_unlock(&sighand->siglock);
+	}
+	rcu_read_unlock();
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -345,12 +352,14 @@ void exit_sighand(struct task_struct *tsk)
 void __exit_signal(struct task_struct *tsk)
 {
 	struct signal_struct * sig = tsk->signal;
-	struct sighand_struct * sighand = tsk->sighand;
+	struct sighand_struct * sighand;
 
 	if (!sig)
 		BUG();
 	if (!atomic_read(&sig->count))
 		BUG();
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count)) {
@@ -358,6 +367,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 	} else {
@@ -389,9 +399,11 @@ void __exit_signal(struct task_struct *tsk)
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
 		sig->sched_time += tsk->sched_time;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		sig = NULL;	/* Marker for below.  */
 	}
+	rcu_read_unlock();
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
@@ -1080,18 +1092,28 @@ void zap_other_threads(struct task_struct *p)
 }
 
 /*
- * Must be called with the tasklist_lock held for reading!
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
+	struct sighand_struct *sp;
 	int ret;
 
+retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && p->sighand) {
-		spin_lock_irqsave(&p->sighand->siglock, flags);
+	if (!ret && sig && (sp = p->sighand)) {
+		if (!get_task_struct_rcu(p))
+			return -ESRCH;
+		spin_lock_irqsave(&sp->siglock, flags);
+		if (p->sighand != sp) {
+			spin_unlock_irqrestore(&sp->siglock, flags);
+			put_task_struct(p);
+			goto retry;
+		}
 		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		spin_unlock_irqrestore(&sp->siglock, flags);
+		put_task_struct(p);
 	}
 
 	return ret;
@@ -1136,14 +1158,21 @@ int
 kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int error;
+	int acquired_tasklist_lock = 0;
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
+	if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+		read_lock(&tasklist_lock);
+		acquired_tasklist_lock = 1;
+	}
 	p = find_task_by_pid(pid);
 	error = -ESRCH;
 	if (p)
 		error = group_send_sig_info(sig, info, p);
-	read_unlock(&tasklist_lock);
+	if (unlikely(acquired_tasklist_lock))
+		read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1355,16 +1384,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
 	unsigned long flags;
 	int ret = 0;
+	struct sighand_struct *sh;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	/*
+	 * The rcu based delayed sighand destroy makes it possible to
+	 * run this without tasklist lock held. The task struct itself
+	 * cannot go away as create_timer did get_task_struct().
+	 *
+	 * We return -1, when the task is marked exiting, so
+	 * posix_timer_event can redirect it to the group leader
+	 */
+	rcu_read_lock();
 
 	if (unlikely(p->flags & PF_EXITING)) {
 		ret = -1;
 		goto out_err;
 	}
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
+retry:
+	sh = rcu_dereference(p->sighand);
+
+	spin_lock_irqsave(&sh->siglock, flags);
+	if (p->sighand != sh) {
+		/* We raced with exec() in a multithreaded process... */
+		spin_unlock_irqrestore(&sh->siglock, flags);
+		goto retry;
+	}
+
+	/*
+	 * We do the check here again to handle the following scenario:
+	 *
+	 * CPU 0		CPU 1
+	 * send_sigqueue
+	 * check PF_EXITING
+	 * interrupt		exit code running
+	 *			__exit_signal
+	 *			lock sighand->siglock
+	 *			unlock sighand->siglock
+	 * lock sh->siglock
+	 * add(tsk->pending) 	flush_sigqueue(tsk->pending)
+	 *
+	 */
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out;
+	}
 
 	if (unlikely(!list_empty(&q->list))) {
 		/*
@@ -1388,9 +1455,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 		signal_wake_up(p, sig == SIGKILL);
 
 out:
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	spin_unlock_irqrestore(&sh->siglock, flags);
 out_err:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -1402,7 +1469,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
 	read_lock(&tasklist_lock);
+	/* Since it_lock is held, p->sighand cannot be NULL. */
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
@@ -1436,7 +1505,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	read_unlock(&tasklist_lock);
-	return(ret);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From d4829cd5b4bd1ea58ba1bebad44d562f4027c290 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Sun, 8 Jan 2006 01:01:39 -0800
Subject: [PATCH] remove get_task_struct_rcu()

The latest set of signal-RCU patches does not use get_task_struct_rcu().
Attached is a patch that removes it.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a6af77e9b4cf..20bd70749104 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -878,18 +878,6 @@ extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
-static inline int get_task_struct_rcu(struct task_struct *t)
-{
-	int oldusage;
-
-	do {
-		oldusage = atomic_read(&t->usage);
-		if (oldusage == 0)
-			return 0;
-	} while (cmpxchg(&t->usage.counter, oldusage, oldusage+1) != oldusage);
-	return 1;
-}
-
 extern void __put_task_struct_cb(struct rcu_head *rhp);
 
 static inline void put_task_struct(struct task_struct *t)
-- 
cgit v1.2.3


From 10cef6029502915bdb3cf0821d425cf9dc30c817 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Sun, 8 Jan 2006 01:01:45 -0800
Subject: [PATCH] slob: introduce the SLOB allocator

configurable replacement for slab allocator

This adds a CONFIG_SLAB option under CONFIG_EMBEDDED.  When CONFIG_SLAB is
disabled, the kernel falls back to using the 'SLOB' allocator.

SLOB is a traditional K&R/UNIX allocator with a SLAB emulation layer,
similar to the original Linux kmalloc allocator that SLAB replaced.  It's
signicantly smaller code and is more memory efficient.  But like all
similar allocators, it scales poorly and suffers from fragmentation more
than SLAB, so it's only appropriate for small systems.

It's been tested extensively in the Linux-tiny tree.  I've also
stress-tested it with make -j 8 compiles on a 3G SMP+PREEMPT box (not
recommended).

Here's a comparison for otherwise identical builds, showing SLOB saving
nearly half a megabyte of RAM:

$ size vmlinux*
   text    data     bss     dec     hex filename
3336372  529360  190812 4056544  3de5e0 vmlinux-slab
3323208  527948  190684 4041840  3dac70 vmlinux-slob

$ size mm/{slab,slob}.o
   text    data     bss     dec     hex filename
  13221     752      48   14021    36c5 mm/slab.o
   1896      52       8    1956     7a4 mm/slob.o

/proc/meminfo:
                  SLAB          SLOB      delta
MemTotal:        27964 kB      27980 kB     +16 kB
MemFree:         24596 kB      25092 kB    +496 kB
Buffers:            36 kB         36 kB       0 kB
Cached:           1188 kB       1188 kB       0 kB
SwapCached:          0 kB          0 kB       0 kB
Active:            608 kB        600 kB      -8 kB
Inactive:          808 kB        812 kB      +4 kB
HighTotal:           0 kB          0 kB       0 kB
HighFree:            0 kB          0 kB       0 kB
LowTotal:        27964 kB      27980 kB     +16 kB
LowFree:         24596 kB      25092 kB    +496 kB
SwapTotal:           0 kB          0 kB       0 kB
SwapFree:            0 kB          0 kB       0 kB
Dirty:               4 kB         12 kB      +8 kB
Writeback:           0 kB          0 kB       0 kB
Mapped:            560 kB        556 kB      -4 kB
Slab:             1756 kB          0 kB   -1756 kB
CommitLimit:     13980 kB      13988 kB      +8 kB
Committed_AS:     4208 kB       4208 kB       0 kB
PageTables:         28 kB         28 kB       0 kB
VmallocTotal:  1007312 kB    1007312 kB       0 kB
VmallocUsed:        48 kB         48 kB       0 kB
VmallocChunk:  1007264 kB    1007264 kB       0 kB

(this work has been sponsored in part by CELF)

From: Ingo Molnar <mingo@elte.hu>

   Fix 32-bitness bugs in mm/slob.c.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c  |   4 +
 include/linux/slab.h |  35 +++++
 init/Kconfig         |  13 ++
 mm/Makefile          |   4 +-
 mm/slob.c            | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 mm/slob.c

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5b6b0b6038a7..63bf6c00fa0c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -323,6 +323,7 @@ static struct file_operations proc_modules_operations = {
 };
 #endif
 
+#ifdef CONFIG_SLAB
 extern struct seq_operations slabinfo_op;
 extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
 static int slabinfo_open(struct inode *inode, struct file *file)
@@ -336,6 +337,7 @@ static struct file_operations proc_slabinfo_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 static int show_stat(struct seq_file *p, void *v)
 {
@@ -600,7 +602,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
+#ifdef CONFIG_SLAB
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+#endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d1ea4051b996..1fb77a9cc148 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -53,6 +53,8 @@ typedef struct kmem_cache kmem_cache_t;
 #define SLAB_CTOR_ATOMIC	0x002UL		/* tell constructor it can't sleep */
 #define	SLAB_CTOR_VERIFY	0x004UL		/* tell constructor it's a verify call */
 
+#ifndef CONFIG_SLOB
+
 /* prototypes */
 extern void __init kmem_cache_init(void);
 
@@ -134,6 +136,39 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 extern int FASTCALL(kmem_cache_reap(int));
 extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr));
 
+#else /* CONFIG_SLOB */
+
+/* SLOB allocator routines */
+
+void kmem_cache_init(void);
+struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags);
+struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t,
+	unsigned long,
+	void (*)(void *, struct kmem_cache *, unsigned long),
+	void (*)(void *, struct kmem_cache *, unsigned long));
+int kmem_cache_destroy(struct kmem_cache *c);
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *c, void *b);
+const char *kmem_cache_name(struct kmem_cache *);
+void *kmalloc(size_t size, gfp_t flags);
+void *kzalloc(size_t size, gfp_t flags);
+void kfree(const void *m);
+unsigned int ksize(const void *m);
+unsigned int kmem_cache_size(struct kmem_cache *c);
+
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+{
+	return kzalloc(n * size, flags);
+}
+
+#define kmem_cache_shrink(d) (0)
+#define kmem_cache_reap(a)
+#define kmem_ptr_validate(a, b) (0)
+#define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
+#define kmalloc_node(s, f, n) kmalloc(s, f)
+
+#endif /* CONFIG_SLOB */
+
 /* System wide caches */
 extern kmem_cache_t	*vm_area_cachep;
 extern kmem_cache_t	*names_cachep;
diff --git a/init/Kconfig b/init/Kconfig
index ba42f3793a84..0c9932f9f06b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -380,6 +380,15 @@ config CC_ALIGN_JUMPS
 	  no dummy operations need be executed.
 	  Zero means use compiler's default.
 
+config SLAB
+	default y
+	bool "Use full SLAB allocator" if EMBEDDED
+	help
+	  Disabling this replaces the advanced SLAB allocator and
+	  kmalloc support with the drastically simpler SLOB allocator.
+	  SLOB is more space efficient but does not scale well and is
+	  more susceptible to fragmentation.
+
 endmenu		# General setup
 
 config TINY_SHMEM
@@ -391,6 +400,10 @@ config BASE_SMALL
 	default 0 if BASE_FULL
 	default 1 if !BASE_FULL
 
+config SLOB
+	default !SLAB
+	bool
+
 menu "Loadable module support"
 
 config MODULES
diff --git a/mm/Makefile b/mm/Makefile
index 74c85ddc9176..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o slab.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000000..1c240c4b71d9
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is 8 bytes on x86, though it's perhaps possible to reduce
+ * this to 4 if it's deemed worth the effort. The slob heap is a
+ * singly-linked list of pages from __get_free_page, grown on demand
+ * and allocation from the heap is currently first-fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * __get_free_pages directly so that it can return page-aligned blocks
+ * and keeps a linked list of such pages and their orders. These
+ * objects are detected in kfree() by their page alignment.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with
+ * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
+ * set, in which case the low-level allocator will fragment blocks to
+ * create the proper alignment. Again, objects of page-size or greater
+ * are allocated by calling __get_free_pages. As SLAB objects know
+ * their size, no separate size bookkeeping is necessary and there is
+ * essentially no allocation space overhead.
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+
+struct slob_block {
+	int units;
+	struct slob_block *next;
+};
+typedef struct slob_block slob_t;
+
+#define SLOB_UNIT sizeof(slob_t)
+#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
+#define SLOB_ALIGN L1_CACHE_BYTES
+
+struct bigblock {
+	int order;
+	void *pages;
+	struct bigblock *next;
+};
+typedef struct bigblock bigblock_t;
+
+static slob_t arena = { .next = &arena, .units = 1 };
+static slob_t *slobfree = &arena;
+static bigblock_t *bigblocks;
+static DEFINE_SPINLOCK(slob_lock);
+static DEFINE_SPINLOCK(block_lock);
+
+static void slob_free(void *b, int size);
+
+static void *slob_alloc(size_t size, gfp_t gfp, int align)
+{
+	slob_t *prev, *cur, *aligned = 0;
+	int delta = 0, units = SLOB_UNITS(size);
+	unsigned long flags;
+
+	spin_lock_irqsave(&slob_lock, flags);
+	prev = slobfree;
+	for (cur = prev->next; ; prev = cur, cur = cur->next) {
+		if (align) {
+			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+			delta = aligned - cur;
+		}
+		if (cur->units >= units + delta) { /* room enough? */
+			if (delta) { /* need to fragment head to align? */
+				aligned->units = cur->units - delta;
+				aligned->next = cur->next;
+				cur->next = aligned;
+				cur->units = delta;
+				prev = cur;
+				cur = aligned;
+			}
+
+			if (cur->units == units) /* exact fit? */
+				prev->next = cur->next; /* unlink */
+			else { /* fragment */
+				prev->next = cur + units;
+				prev->next->units = cur->units - units;
+				prev->next->next = cur->next;
+				cur->units = units;
+			}
+
+			slobfree = prev;
+			spin_unlock_irqrestore(&slob_lock, flags);
+			return cur;
+		}
+		if (cur == slobfree) {
+			spin_unlock_irqrestore(&slob_lock, flags);
+
+			if (size == PAGE_SIZE) /* trying to shrink arena? */
+				return 0;
+
+			cur = (slob_t *)__get_free_page(gfp);
+			if (!cur)
+				return 0;
+
+			slob_free(cur, PAGE_SIZE);
+			spin_lock_irqsave(&slob_lock, flags);
+			cur = slobfree;
+		}
+	}
+}
+
+static void slob_free(void *block, int size)
+{
+	slob_t *cur, *b = (slob_t *)block;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (size)
+		b->units = SLOB_UNITS(size);
+
+	/* Find reinsertion point */
+	spin_lock_irqsave(&slob_lock, flags);
+	for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
+		if (cur >= cur->next && (b > cur || b < cur->next))
+			break;
+
+	if (b + b->units == cur->next) {
+		b->units += cur->next->units;
+		b->next = cur->next->next;
+	} else
+		b->next = cur->next;
+
+	if (cur + cur->units == b) {
+		cur->units += b->units;
+		cur->next = b->next;
+	} else
+		cur->next = b;
+
+	slobfree = cur;
+
+	spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+static int FASTCALL(find_order(int size));
+static int fastcall find_order(int size)
+{
+	int order = 0;
+	for ( ; size > 4096 ; size >>=1)
+		order++;
+	return order;
+}
+
+void *kmalloc(size_t size, gfp_t gfp)
+{
+	slob_t *m;
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (size < PAGE_SIZE - SLOB_UNIT) {
+		m = slob_alloc(size + SLOB_UNIT, gfp, 0);
+		return m ? (void *)(m + 1) : 0;
+	}
+
+	bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
+	if (!bb)
+		return 0;
+
+	bb->order = find_order(size);
+	bb->pages = (void *)__get_free_pages(gfp, bb->order);
+
+	if (bb->pages) {
+		spin_lock_irqsave(&block_lock, flags);
+		bb->next = bigblocks;
+		bigblocks = bb;
+		spin_unlock_irqrestore(&block_lock, flags);
+		return bb->pages;
+	}
+
+	slob_free(bb, sizeof(bigblock_t));
+	return 0;
+}
+
+EXPORT_SYMBOL(kmalloc);
+
+void kfree(const void *block)
+{
+	bigblock_t *bb, **last = &bigblocks;
+	unsigned long flags;
+
+	if (!block)
+		return;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {
+		/* might be on the big block list */
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
+			if (bb->pages == block) {
+				*last = bb->next;
+				spin_unlock_irqrestore(&block_lock, flags);
+				free_pages((unsigned long)block, bb->order);
+				slob_free(bb, sizeof(bigblock_t));
+				return;
+			}
+		}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	slob_free((slob_t *)block - 1, 0);
+	return;
+}
+
+EXPORT_SYMBOL(kfree);
+
+unsigned int ksize(const void *block)
+{
+	bigblock_t *bb;
+	unsigned long flags;
+
+	if (!block)
+		return 0;
+
+	if (!((unsigned long)block & (PAGE_SIZE-1))) {
+		spin_lock_irqsave(&block_lock, flags);
+		for (bb = bigblocks; bb; bb = bb->next)
+			if (bb->pages == block) {
+				spin_unlock_irqrestore(&slob_lock, flags);
+				return PAGE_SIZE << bb->order;
+			}
+		spin_unlock_irqrestore(&block_lock, flags);
+	}
+
+	return ((slob_t *)block - 1)->units * SLOB_UNIT;
+}
+
+struct kmem_cache {
+	unsigned int size, align;
+	const char *name;
+	void (*ctor)(void *, struct kmem_cache *, unsigned long);
+	void (*dtor)(void *, struct kmem_cache *, unsigned long);
+};
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+	size_t align, unsigned long flags,
+	void (*ctor)(void*, struct kmem_cache *, unsigned long),
+	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+{
+	struct kmem_cache *c;
+
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+
+	if (c) {
+		c->name = name;
+		c->size = size;
+		c->ctor = ctor;
+		c->dtor = dtor;
+		/* ignore alignment unless it's forced */
+		c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+		if (c->align < align)
+			c->align = align;
+	}
+
+	return c;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+int kmem_cache_destroy(struct kmem_cache *c)
+{
+	slob_free(c, sizeof(struct kmem_cache));
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+	void *b;
+
+	if (c->size < PAGE_SIZE)
+		b = slob_alloc(c->size, flags, c->align);
+	else
+		b = (void *)__get_free_pages(flags, find_order(c->size));
+
+	if (c->ctor)
+		c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
+
+	return b;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+void kmem_cache_free(struct kmem_cache *c, void *b)
+{
+	if (c->dtor)
+		c->dtor(b, c, 0);
+
+	if (c->size < PAGE_SIZE)
+		slob_free(b, c->size);
+	else
+		free_pages((unsigned long)b, find_order(c->size));
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+unsigned int kmem_cache_size(struct kmem_cache *c)
+{
+	return c->size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *c)
+{
+	return c->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+static struct timer_list slob_timer = TIMER_INITIALIZER(
+	(void (*)(unsigned long))kmem_cache_init, 0, 0);
+
+void kmem_cache_init(void)
+{
+	void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
+
+	if (p)
+		free_page((unsigned long)p);
+
+	mod_timer(&slob_timer, jiffies + HZ);
+}
+
+atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+#ifdef CONFIG_SMP
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+	int i;
+	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
+
+	if (!pdata)
+		return NULL;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+		if (!pdata->ptrs[i])
+			goto unwind_oom;
+		memset(pdata->ptrs[i], 0, size);
+	}
+
+	/* Catch derefs w/o wrappers */
+	return (void *) (~(unsigned long) pdata);
+
+unwind_oom:
+	while (--i >= 0) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(pdata->ptrs[i]);
+	}
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL(__alloc_percpu);
+
+void
+free_percpu(const void *objp)
+{
+	int i;
+	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(p->ptrs[i]);
+	}
+	kfree(p);
+}
+EXPORT_SYMBOL(free_percpu);
+
+#endif
-- 
cgit v1.2.3


From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:47 -0800
Subject: [PATCH] cpuset: mempolicy one more nodemask conversion

Finish converting mm/mempolicy.c from bitmaps to nodemasks.  The previous
conversion had left one routine using bitmaps, since it involved a
corresponding change to kernel/cpuset.c

Fix that interface by replacing with a simple macro that calls nodes_subset(),
or if !CONFIG_CPUSET, returns (1).

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <christoph@lameter.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  5 +++--
 kernel/cpuset.c        | 10 ----------
 mm/mempolicy.c         |  5 ++---
 3 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 6e2deef96b34..8b21786490ee 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,7 +21,8 @@ extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
-void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
+#define cpuset_nodes_subset_current_mems_allowed(nodes) \
+		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
@@ -42,7 +43,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_current_mems_allowed(void) {}
-static inline void cpuset_restrict_to_mems_allowed(unsigned long *nodes) {}
+#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
 
 static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f63383e01ec7..6503c6da4c4f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1749,16 +1749,6 @@ done:
 		refresh_mems();
 }
 
-/**
- * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
- * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
- */
-void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
-{
-	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
-							MAX_NUMNODES);
-}
-
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7051fe450e96..9dea2b8a7d48 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -387,10 +387,9 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	/* Update current mems_allowed */
 	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes->bits);
+	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
+		return -EINVAL;
 	return mpol_check_policy(mode, nodes);
 }
 
-- 
cgit v1.2.3


From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:49 -0800
Subject: [PATCH] cpuset: memory pressure meter

Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.

This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.

This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.

This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure.  It's up to the batch
manager or other user code to decide what to do about it and take action.

==> Unless this feature is enabled by writing "1" to the special file
    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
    code of __alloc_pages() for this metric reduces to simply noticing
    that the cpuset_memory_pressure_enabled flag is zero.  So only
    systems that enable this feature will compute the metric.

Why a per-cpuset, running average:

    Because this meter is per-cpuset, rather than per-task or mm, the
    system load imposed by a batch scheduler monitoring this metric is
    sharply reduced on large systems, because a scan of the tasklist can be
    avoided on each set of queries.

    Because this meter is a running average, instead of an accumulating
    counter, a batch scheduler can detect memory pressure with a single
    read, instead of having to read and accumulate results for a period of
    time.

    Because this meter is per-cpuset rather than per-task or mm, the
    batch scheduler can obtain the key information, memory pressure in a
    cpuset, with a single read, rather than having to query and accumulate
    results over all the (dynamically changing) set of tasks in the cpuset.

A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.

A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  11 +++
 kernel/cpuset.c        | 193 ++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/page_alloc.c        |   1 +
 3 files changed, 203 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8b21786490ee..736d73801cb6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,6 +26,15 @@ void cpuset_update_current_mems_allowed(void);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
+
+#define cpuset_memory_pressure_bump() 				\
+	do {							\
+		if (cpuset_memory_pressure_enabled)		\
+			__cpuset_memory_pressure_bump();	\
+	} while (0)
+extern int cpuset_memory_pressure_enabled;
+extern void __cpuset_memory_pressure_bump(void);
+
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -60,6 +69,8 @@ static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	return 1;
 }
 
+static inline void cpuset_memory_pressure_bump(void) {}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 							char *buffer)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6503c6da4c4f..5a06fef669f8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,15 @@
 
 #define CPUSET_SUPER_MAGIC 		0x27e0eb
 
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
+
 struct cpuset {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
@@ -80,7 +89,9 @@ struct cpuset {
 	 * Copy of global cpuset_mems_generation as of the most
 	 * recent time this cpuset changed its mems_allowed.
 	 */
-	 int mems_generation;
+	int mems_generation;
+
+	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* bits in struct cpuset flags field */
@@ -149,7 +160,7 @@ static struct cpuset top_cpuset = {
 };
 
 static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
 
 /*
  * We have two global cpuset semaphores below.  They can nest.
@@ -806,6 +817,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	return retval;
 }
 
+/*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		cpuset_memory_pressure_enabled = 1;
+	else
+		cpuset_memory_pressure_enabled = 0;
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -847,6 +871,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	return 0;
 }
 
+/*
+ * Frequency meter - How fast is some event occuring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *   fmeter_init() - initialize a frequency meter.
+ *   fmeter_markevent() - called each time the event happens.
+ *   fmeter_getrate() - returns the recent rate of such events.
+ *   fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter.  If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds.  At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000.  At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event.  At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
+#define FM_SCALE 1000		/* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+	fmp->cnt = 0;
+	fmp->val = 0;
+	fmp->time = 0;
+	spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t ticks = now - fmp->time;
+
+	if (ticks == 0)
+		return;
+
+	ticks = min(FM_MAXTICKS, ticks);
+	while (ticks-- > 0)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+	spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int val;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	val = fmp->val;
+	spin_unlock(&fmp->lock);
+	return val;
+}
+
 /*
  * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
@@ -931,6 +1053,8 @@ typedef enum {
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_NOTIFY_ON_RELEASE,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		retval = update_memory_pressure_enabled(cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE:
+		retval = -EACCES;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE:
+		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = {
 	.private = FILE_MEMORY_MIGRATE,
 };
 
+static struct cftype cft_memory_pressure_enabled = {
+	.name = "memory_pressure_enabled",
+	.private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static struct cftype cft_memory_pressure = {
+	.name = "memory_pressure",
+	.private = FILE_MEMORY_PRESSURE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
 
@@ -1580,6 +1729,7 @@ int __init cpuset_init(void)
 	top_cpuset.cpus_allowed = CPU_MASK_ALL;
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
+	fmeter_init(&top_cpuset.fmeter);
 	atomic_inc(&cpuset_mems_generation);
 	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
 
@@ -1601,6 +1751,9 @@ int __init cpuset_init(void)
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	err = cpuset_populate_dir(root);
+	/* memory_pressure_enabled is in root cpuset only */
+	if (err == 0)
+		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
 out:
 	return err;
 }
@@ -1890,6 +2043,42 @@ done:
 	return overlap;
 }
 
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure".  Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+	struct cpuset *cs;
+
+	task_lock(current);
+	cs = current->cpuset;
+	fmeter_markevent(&cs->fmeter);
+	task_unlock(current);
+}
+
 /*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad3d0202cdef..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -976,6 +976,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
-- 
cgit v1.2.3


From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:54 -0800
Subject: [PATCH] cpuset: combine refresh_mems and update_mems

The important code paths through alloc_pages_current() and alloc_page_vma(),
by which most kernel page allocations go, both called
cpuset_update_current_mems_allowed(), which in turn called refresh_mems().
-Both- of these latter two routines did a tasklock, got the tasks cpuset
pointer, and checked for out of date cpuset->mems_generation.

That was a silly duplication of code and waste of CPU cycles on an important
code path.

Consolidated those two routines into a single routine, called
cpuset_update_task_memory_state(), since it updates more than just
mems_allowed.

Changed all callers of either routine to call the new consolidated routine.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  4 +--
 kernel/cpuset.c        | 95 ++++++++++++++++++++++----------------------------
 mm/mempolicy.c         | 10 +++---
 3 files changed, 48 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 736d73801cb6..1feebf16ab08 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,7 +20,7 @@ extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
 extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_current_mems_allowed(void);
+void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
 		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
@@ -51,7 +51,7 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 }
 
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_current_mems_allowed(void) {}
+static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
 
 static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d9349cc48b95..e9917d71628a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -584,13 +584,26 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 	BUG_ON(!nodes_intersects(*pmask, node_online_map));
 }
 
-/*
- * Refresh current tasks mems_allowed and mems_generation from current
- * tasks cpuset.
+/**
+ * cpuset_update_task_memory_state - update task memory placement
  *
- * Call without callback_sem or task_lock() held.  May be called with
- * or without manage_sem held.  Will acquire task_lock() and might
- * acquire callback_sem during call.
+ * If the current tasks cpusets mems_allowed changed behind our
+ * backs, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
+ *
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_sem or task_lock() held.  May be called
+ * with or without manage_sem held.  Except in early boot or
+ * an exiting task, when tsk->cpuset is NULL, this routine will
+ * acquire task_lock().  We don't need to use task_lock to guard
+ * against another task changing a non-NULL cpuset pointer to NULL,
+ * as that is only done by a task on itself, and if the current task
+ * is here, it is not simultaneously in the exit code NULL'ing its
+ * cpuset pointer.  This routine also might acquire callback_sem and
+ * current->mm->mmap_sem during call.
  *
  * The task_lock() is required to dereference current->cpuset safely.
  * Without it, we could pick up the pointer value of current->cpuset
@@ -605,32 +618,36 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * task has been modifying its cpuset.
  */
 
-static void refresh_mems(void)
+void cpuset_update_task_memory_state()
 {
 	int my_cpusets_mem_gen;
+	struct task_struct *tsk = current;
+	struct cpuset *cs = tsk->cpuset;
 
-	task_lock(current);
-	my_cpusets_mem_gen = current->cpuset->mems_generation;
-	task_unlock(current);
+	if (unlikely(!cs))
+		return;
+
+	task_lock(tsk);
+	my_cpusets_mem_gen = cs->mems_generation;
+	task_unlock(tsk);
 
-	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
-		struct cpuset *cs;
-		nodemask_t oldmem = current->mems_allowed;
+	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
+		nodemask_t oldmem = tsk->mems_allowed;
 		int migrate;
 
 		down(&callback_sem);
-		task_lock(current);
-		cs = current->cpuset;
+		task_lock(tsk);
+		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		migrate = is_memory_migrate(cs);
-		guarantee_online_mems(cs, &current->mems_allowed);
-		current->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(current);
+		guarantee_online_mems(cs, &tsk->mems_allowed);
+		tsk->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(tsk);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed)) {
-			numa_policy_rebind(&oldmem, &current->mems_allowed);
+		numa_policy_rebind(&oldmem, &tsk->mems_allowed);
+		if (!nodes_equal(oldmem, tsk->mems_allowed)) {
 			if (migrate) {
-				do_migrate_pages(current->mm, &oldmem,
-					&current->mems_allowed,
+				do_migrate_pages(tsk->mm, &oldmem,
+					&tsk->mems_allowed,
 					MPOL_MF_MOVE_ALL);
 			}
 		}
@@ -1630,7 +1647,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 		return -ENOMEM;
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1688,7 +1705,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	/* the vfs holds both inode->i_sem already */
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	if (atomic_read(&cs->count) > 0) {
 		up(&manage_sem);
 		return -EBUSY;
@@ -1872,36 +1889,6 @@ void cpuset_init_current_mems_allowed(void)
 	current->mems_allowed = NODE_MASK_ALL;
 }
 
-/**
- * cpuset_update_current_mems_allowed - update mems parameters to new values
- *
- * If the current tasks cpusets mems_allowed changed behind our backs,
- * update current->mems_allowed and mems_generation to the new value.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Unless exiting, it will acquire
- * task_lock().  Also might acquire callback_sem during call to
- * refresh_mems().
- */
-
-void cpuset_update_current_mems_allowed(void)
-{
-	struct cpuset *cs;
-	int need_to_refresh = 0;
-
-	task_lock(current);
-	cs = current->cpuset;
-	if (!cs)
-		goto done;
-	if (current->cpuset_mems_generation != cs->mems_generation)
-		need_to_refresh = 1;
-done:
-	task_unlock(current);
-	if (need_to_refresh)
-		refresh_mems();
-}
-
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9dea2b8a7d48..515bfeee027e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -387,7 +387,7 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	if (!nodes)
 		return 0;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 		return -EINVAL;
 	return mpol_check_policy(mode, nodes);
@@ -461,7 +461,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -1089,7 +1089,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
-	cpuset_update_current_mems_allowed();
+	cpuset_update_task_memory_state();
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1115,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  *
- *	Don't call cpuset_update_current_mems_allowed() unless
+ *	Don't call cpuset_update_task_memory_state() unless
  *	1) it's ok to take cpuset_sem (can WAIT), and
  *	2) allocating for current task (not interrupt).
  */
@@ -1124,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	struct mempolicy *pol = current->mempolicy;
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_current_mems_allowed();
+		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
-- 
cgit v1.2.3


From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:55 -0800
Subject: [PATCH] cpuset: implement cpuset_mems_allowed

Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code
needed, to obtain the mems_allowed vector of a cpuset, and replaced the
workaround in sys_migrate_pages() to call this new method.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  8 +++++++-
 kernel/cpuset.c        | 29 ++++++++++++++++++++++++++---
 mm/mempolicy.c         |  3 ---
 3 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1feebf16ab08..37d2dd7ca3e9 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,7 +18,8 @@ extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
 extern void cpuset_exit(struct task_struct *p);
-extern cpumask_t cpuset_cpus_allowed(const struct task_struct *p);
+extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
+extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
@@ -50,6 +51,11 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
 	return cpu_possible_map;
 }
 
+static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
+{
+	return node_possible_map;
+}
+
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e9917d71628a..0d0dbbd6560a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1871,14 +1871,14 @@ void cpuset_exit(struct task_struct *tsk)
  * tasks cpuset.
  **/
 
-cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
 {
 	cpumask_t mask;
 
 	down(&callback_sem);
-	task_lock((struct task_struct *)tsk);
+	task_lock(tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
-	task_unlock((struct task_struct *)tsk);
+	task_unlock(tsk);
 	up(&callback_sem);
 
 	return mask;
@@ -1889,6 +1889,29 @@ void cpuset_init_current_mems_allowed(void)
 	current->mems_allowed = NODE_MASK_ALL;
 }
 
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk.  Guaranteed to return some non-empty
+ * subset of node_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+	nodemask_t mask;
+
+	down(&callback_sem);
+	task_lock(tsk);
+	guarantee_online_mems(tsk->cpuset, &mask);
+	task_unlock(tsk);
+	up(&callback_sem);
+
+	return mask;
+}
+
 /**
  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
  * @zl: the zonelist to be checked
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 515bfeee027e..34d566ac147f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -772,9 +772,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }
 
-/* Macro needed until Paul implements this function in kernel/cpusets.c */
-#define cpuset_mems_allowed(task) node_online_map
-
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		const unsigned long __user *old_nodes,
 		const unsigned long __user *new_nodes)
-- 
cgit v1.2.3


From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:56 -0800
Subject: [PATCH] cpuset: numa_policy_rebind cleanup

Cleanup, reorganize and make more robust the mempolicy.c code to rebind
mempolicies relative to the containing cpuset after a tasks memory placement
changes.

The real motivator for this cleanup patch is to lay more groundwork for the
upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's
after the containing cpuset memory placement changes.

NUMA mempolicies are constrained by the cpuset their task is a member of.
When either (1) a task is moved to a different cpuset, or (2) the 'mems'
mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded
node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to
be recalculated, relative to their new cpuset placement.

The old code used an unreliable method of determining what was the old
mems_allowed constraining the mempolicy.  It just looked at the tasks
mems_allowed value.  This sort of worked with the present code, that just
rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken,
referring to the old nodes.  But in an upcoming patch, the vma mempolicies
will be rebound as well.  Then the order in which the various task and vma
mempolicies are updated will no longer be deterministic, and one can no longer
count on the task->mems_allowed holding the old value for as long as needed.
It's not even clear if the current code was guaranteed to work reliably for
task mempolicies.

So I added a mems_allowed field to each mempolicy, stating exactly what
mems_allowed the policy is relative to, and updated synchronously and reliably
anytime that the mempolicy is rebound.

Also removed a useless wrapper routine, numa_policy_rebind(), and had its
caller, cpuset_update_task_memory_state(), call directly to the rewritten
policy_rebind() routine, and made that rebind routine extern instead of
static, and added a "mpol_" prefix to its name, making it
mpol_rebind_policy().

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 12 ++++++++++--
 kernel/cpuset.c           |  2 +-
 mm/mempolicy.c            | 31 +++++++++++++++++++------------
 3 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05fddd5bee5d..74357cb9bc7c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -68,6 +68,7 @@ struct mempolicy {
 		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
+	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
 };
 
 /*
@@ -146,7 +147,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
+extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
+extern void mpol_rebind_task(struct task_struct *tsk,
+					const nodemask_t *new);
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
@@ -221,7 +224,12 @@ static inline void numa_default_policy(void)
 {
 }
 
-static inline void numa_policy_rebind(const nodemask_t *old,
+static inline void mpol_rebind_policy(struct mempolicy *pol,
+					const nodemask_t *new)
+{
+}
+
+static inline void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0d0dbbd6560a..8f764de3a9e7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -643,7 +643,7 @@ void cpuset_update_task_memory_state()
 		tsk->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(tsk);
 		up(&callback_sem);
-		numa_policy_rebind(&oldmem, &tsk->mems_allowed);
+		mpol_rebind_task(tsk, &tsk->mems_allowed);
 		if (!nodes_equal(oldmem, tsk->mems_allowed)) {
 			if (migrate) {
 				do_migrate_pages(tsk->mm, &oldmem,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 34d566ac147f..c39bd86f4ea0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -180,6 +180,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		break;
 	}
 	policy->policy = mode;
+	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 	return policy;
 }
 
@@ -1411,25 +1412,31 @@ void numa_default_policy(void)
 }
 
 /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-							const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
+	nodemask_t *mpolmask;
 	nodemask_t tmp;
 
 	if (!pol)
 		return;
+	mpolmask = &pol->cpuset_mems_allowed;
+	if (nodes_equal(*mpolmask, *newmask))
+		return;
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		current->il_next = node_remap(current->il_next, *old, *new);
+		*mpolmask = *newmask;
+		current->il_next = node_remap(current->il_next,
+						*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-								*old, *new);
+						*mpolmask, *newmask);
+		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1439,7 +1446,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
 			node_set((*z)->zone_pgdat->node_id, nodes);
-		nodes_remap(tmp, nodes, *old, *new);
+		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
 		zonelist = bind_zonelist(&nodes);
@@ -1454,6 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
+		*mpolmask = *newmask;
 		break;
 	}
 	default:
@@ -1463,14 +1471,13 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
 }
 
 /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
- *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
  */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 {
-	rebind_policy(current->mempolicy, old, new);
+	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
 /*
-- 
cgit v1.2.3


From 202f72d5d1b5c2c084f63ef996c736d208b447b5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:57 -0800
Subject: [PATCH] cpuset: number_of_cpusets optimization

Easy little optimization hack to avoid actually having to call
cpuset_zone_allowed() and check mems_allowed, in the main page allocation
routine, __alloc_pages().  This saves several CPU cycles per page allocation
on systems not using cpusets.

A counter is updated each time a cpuset is created or removed, and whenever
there is only one cpuset in the system, it must be the root cpuset, which
contains all CPUs and all Memory Nodes.  In that case, when the counter is
one, all allocations are allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h | 10 +++++++++-
 kernel/cpuset.c        | 12 +++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 37d2dd7ca3e9..34081c168af5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_CPUSETS
 
+extern int number_of_cpusets;	/* How many cpusets are defined in system? */
+
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
@@ -25,7 +27,13 @@ void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
 		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
+
+extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
+static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
+}
+
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 
 #define cpuset_memory_pressure_bump() 				\
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8f764de3a9e7..6004719f26ee 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,13 @@
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
 
+/*
+ * Tracks how many cpusets are currently defined in system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets;
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -1664,6 +1671,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	number_of_cpusets++;
 	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
@@ -1726,6 +1734,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
+	number_of_cpusets--;
 	up(&callback_sem);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
@@ -1769,6 +1778,7 @@ int __init cpuset_init(void)
 	root->d_inode->i_nlink++;
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
+	number_of_cpusets = 1;
 	err = cpuset_populate_dir(root);
 	/* memory_pressure_enabled is in root cpuset only */
 	if (err == 0)
@@ -1982,7 +1992,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  **/
 
-int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-- 
cgit v1.2.3


From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:01:59 -0800
Subject: [PATCH] cpuset: rebind vma mempolicies fix

Fix more of longstanding bug in cpuset/mempolicy interaction.

NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset
to just the Memory Nodes allowed by that cpuset.  The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.

When a tasks cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
tasks mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.

An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.

This fix rebinds mempolicies attached to vma's (address ranges in a tasks
address space.) Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired.  The tasks mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpusets mems_generation.

Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan.  In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock).  So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.

Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound.  A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a tasks mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.

When a task is moved to a different cpuset, it is easier, as there is only one
task involved.  It's mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.

It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice.  This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 18 ++++++++++
 kernel/cpuset.c           | 90 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/mempolicy.c            | 29 +++++++++++++++
 3 files changed, 137 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 74357cb9bc7c..c7ac77e873b3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -150,6 +150,16 @@ extern void numa_policy_init(void);
 extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
+extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
+
+#ifdef CONFIG_CPUSET
+#define current_cpuset_is_being_rebound() \
+				(cpuset_being_rebound == current->cpuset)
+#else
+#define current_cpuset_is_being_rebound() 0
+#endif
+
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
@@ -165,6 +175,8 @@ static inline void check_highest_zone(int k)
 int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
+extern void *cpuset_being_rebound;	/* Trigger mpol_copy vma rebind */
+
 #else
 
 struct mempolicy {};
@@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk,
 {
 }
 
+static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+}
+
+#define set_cpuset_being_rebound(x) do {} while (0)
+
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
  * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
  */
 
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
+	struct task_struct *g, *p;
+	struct mm_struct **mmarray;
+	int i, n, ntasks;
+	int fudge;
 	int retval;
 
 	trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
 	up(&callback_sem);
 
+	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
+
+	fudge = 10;				/* spare mmarray[] slots */
+	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
+	retval = -ENOMEM;
+
+	/*
+	 * Allocate mmarray[] to hold mm reference for each task
+	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
+	 * few more lines of code, we can retry until we get a big
+	 * enough mmarray[] w/o using GFP_ATOMIC.
+	 */
+	while (1) {
+		ntasks = atomic_read(&cs->count);	/* guess */
+		ntasks += fudge;
+		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+		if (!mmarray)
+			goto done;
+		write_lock_irq(&tasklist_lock);		/* block fork */
+		if (atomic_read(&cs->count) <= ntasks)
+			break;				/* got enough */
+		write_unlock_irq(&tasklist_lock);	/* try again */
+		kfree(mmarray);
+	}
+
+	n = 0;
+
+	/* Load up mmarray[] with mm reference for each task in cpuset. */
+	do_each_thread(g, p) {
+		struct mm_struct *mm;
+
+		if (n >= ntasks) {
+			printk(KERN_WARNING
+				"Cpuset mempolicy rebind incomplete.\n");
+			continue;
+		}
+		if (p->cpuset != cs)
+			continue;
+		mm = get_task_mm(p);
+		if (!mm)
+			continue;
+		mmarray[n++] = mm;
+	} while_each_thread(g, p);
+	write_unlock_irq(&tasklist_lock);
+
+	/*
+	 * Now that we've dropped the tasklist spinlock, we can
+	 * rebind the vma mempolicies of each mm in mmarray[] to their
+	 * new cpuset, and release that mm.  The mpol_rebind_mm()
+	 * call takes mmap_sem, which we couldn't take while holding
+	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
+	 * cpuset_being_rebound check will catch such forks, and rebind
+	 * their vma mempolicies too.  Because we still hold the global
+	 * cpuset manage_sem, we know that no other rebind effort will
+	 * be contending for the global variable cpuset_being_rebound.
+	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+	 * is idempotent.
+	 */
+	for (i = 0; i < n; i++) {
+		struct mm_struct *mm = mmarray[i];
+
+		mpol_rebind_mm(mm, &cs->mems_allowed);
+		mmput(mm);
+	}
+
+	/* We're done rebinding vma's to this cpusets new mems_allowed. */
+	kfree(mmarray);
+	set_cpuset_being_rebound(NULL);
+	retval = 0;
 done:
 	return retval;
 }
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	struct cpuset *oldcs;
 	cpumask_t cpus;
 	nodemask_t from, to;
+	struct mm_struct *mm;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	to = cs->mems_allowed;
 
 	up(&callback_sem);
+
+	mm = get_task_mm(tsk);
+	if (mm) {
+		mpol_rebind_mm(mm, &to);
+		mmput(mm);
+	}
+
 	if (is_memory_migrate(cs))
 		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c39bd86f4ea0..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
 {
@@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	if (current_cpuset_is_being_rebound()) {
+		nodemask_t mems = cpuset_mems_allowed(current);
+		mpol_rebind_policy(old, &mems);
+	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
@@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
+/*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+	struct vm_area_struct *vma;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		mpol_rebind_policy(vma->vm_policy, new);
+	up_write(&mm->mmap_sem);
+}
+
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
-- 
cgit v1.2.3


From c417f0242ebe578924a30d4e53d35b5059fed4e7 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 8 Jan 2006 01:02:01 -0800
Subject: [PATCH] cpuset: remove test for null cpuset from alloc code path

Remove a couple of more lines of code from the cpuset hooks in the page
allocation code path.

There was a check for a NULL cpuset pointer in the routine
cpuset_update_task_memory_state() that was only needed during system boot,
after the memory subsystem was initialized, before the cpuset subsystem was
initialized, to catch a NULL task->cpuset pointer.

Add a cpuset_init_early() routine, just before the mem_init() call in
init/main.c, that sets up just enough of the init tasks cpuset structure to
render cpuset_update_task_memory_state() calls harmless.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  2 ++
 init/main.c            |  1 +
 kernel/cpuset.c        | 22 ++++++++++++++++------
 3 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 34081c168af5..c472f972bd6d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,6 +16,7 @@
 
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
+extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_fork(struct task_struct *p);
@@ -49,6 +50,7 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
 #else /* !CONFIG_CPUSETS */
 
+static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 static inline void cpuset_fork(struct task_struct *p) {}
diff --git a/init/main.c b/init/main.c
index 2ed3638deec7..afe5eb84ad52 100644
--- a/init/main.c
+++ b/init/main.c
@@ -512,6 +512,7 @@ asmlinkage void __init start_kernel(void)
 	}
 #endif
 	vfs_caches_init_early();
+	cpuset_init_early();
 	mem_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cf8203a5fa71..fc949e4a625c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -603,9 +603,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Do not call this routine if in_interrupt().
  *
  * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Except in early boot or
- * an exiting task, when tsk->cpuset is NULL, this routine will
- * acquire task_lock().  We don't need to use task_lock to guard
+ * with or without manage_sem held.  Doesn't need task_lock to guard
  * against another task changing a non-NULL cpuset pointer to NULL,
  * as that is only done by a task on itself, and if the current task
  * is here, it is not simultaneously in the exit code NULL'ing its
@@ -631,9 +629,6 @@ void cpuset_update_task_memory_state()
 	struct task_struct *tsk = current;
 	struct cpuset *cs = tsk->cpuset;
 
-	if (unlikely(!cs))
-		return;
-
 	task_lock(tsk);
 	my_cpusets_mem_gen = cs->mems_generation;
 	task_unlock(tsk);
@@ -1836,6 +1831,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	return 0;
 }
 
+/*
+ * cpuset_init_early - just enough so that the calls to
+ * cpuset_update_task_memory_state() in early init code
+ * are harmless.
+ */
+
+int __init cpuset_init_early(void)
+{
+	struct task_struct *tsk = current;
+
+	tsk->cpuset = &top_cpuset;
+	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	return 0;
+}
+
 /**
  * cpuset_init - initialize cpusets at system boot
  *
-- 
cgit v1.2.3


From de25968cc87cc5b76d09de8b4cbddc8f24fcf5f7 Mon Sep 17 00:00:00 2001
From: Tim Schmielau <tim@physik3.uni-rostock.de>
Date: Sun, 8 Jan 2006 01:02:05 -0800
Subject: [PATCH] fix more missing includes

Include fixes for 2.6.14-git11.  Should allow to remove sched.h from
module.h on i386, x86_64, arm, ia64, ppc, ppc64, and s390.  Probably more
to come since I haven't yet checked the other archs.

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/common/scoop.c                   | 1 +
 arch/arm/mach-realview/localtimer.c       | 1 +
 arch/mips/sgi-ip27/ip27-berr.c            | 1 +
 drivers/char/agp/sworks-agp.c             | 1 +
 drivers/infiniband/hw/mthca/mthca_dev.h   | 1 +
 drivers/infiniband/ulp/srp/ib_srp.c       | 1 +
 drivers/macintosh/windfarm_smu_controls.c | 1 +
 drivers/macintosh/windfarm_smu_sensors.c  | 1 +
 drivers/mtd/onenand/generic.c             | 1 +
 drivers/mtd/rfd_ftl.c                     | 1 +
 drivers/pci/hotplug/pciehp.h              | 1 +
 drivers/pci/hotplug/pciehp_hpc.c          | 3 +++
 drivers/rapidio/rio-scan.c                | 2 ++
 drivers/rapidio/rio-sysfs.c               | 1 +
 drivers/rapidio/rio.c                     | 1 +
 drivers/usb/host/ohci-au1xxx.c            | 1 +
 drivers/usb/host/ohci-lh7a404.c           | 1 +
 drivers/usb/host/ohci-ppc-soc.c           | 1 +
 include/linux/rio_drv.h                   | 1 +
 19 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c
index b6de43e73699..a2dfe0b0f1ec 100644
--- a/arch/arm/common/scoop.c
+++ b/arch/arm/common/scoop.c
@@ -13,6 +13,7 @@
 
 #include <linux/device.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <asm/io.h>
 #include <asm/hardware/scoop.h>
diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c
index c9d7c596b200..caf6b8bb6c95 100644
--- a/arch/arm/mach-realview/localtimer.c
+++ b/arch/arm/mach-realview/localtimer.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/smp.h>
+#include <linux/jiffies.h>
 
 #include <asm/mach/time.h>
 #include <asm/hardware/arm_twd.h>
diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c
index 07631a97670b..ce907eda221b 100644
--- a/arch/mips/sgi-ip27/ip27-berr.c
+++ b/arch/mips/sgi-ip27/ip27-berr.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/signal.h>	/* for SIGBUS */
+#include <linux/sched.h>	/* schow_regs(), force_sig() */
 
 #include <asm/module.h>
 #include <asm/sn/addrs.h>
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c
index 3f8f7fa6b0ff..268f78d926d3 100644
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>
 #include <linux/agp_backend.h>
 #include "agp.h"
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 497ff794ef6a..795b379260bf 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -43,6 +43,7 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/timer.h>
 #include <asm/semaphore.h>
 
 #include "mthca_provider.h"
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ee9fe226ae99..dd488d3cffa9 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -39,6 +39,7 @@
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/random.h>
+#include <linux/jiffies.h>
 
 #include <asm/atomic.h>
 
diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c
index 2c3158c81ff2..4d811600bdab 100644
--- a/drivers/macintosh/windfarm_smu_controls.c
+++ b/drivers/macintosh/windfarm_smu_controls.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/completion.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/io.h>
diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c
index b558cc209d49..1a00d9c75a23 100644
--- a/drivers/macintosh/windfarm_smu_sensors.c
+++ b/drivers/macintosh/windfarm_smu_sensors.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/completion.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
 #include <asm/io.h>
diff --git a/drivers/mtd/onenand/generic.c b/drivers/mtd/onenand/generic.c
index 45c077d0f063..af06a80f44de 100644
--- a/drivers/mtd/onenand/generic.c
+++ b/drivers/mtd/onenand/generic.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c
index 20ce212638fc..a3e00a4635a5 100644
--- a/drivers/mtd/rfd_ftl.c
+++ b/drivers/mtd/rfd_ftl.c
@@ -18,6 +18,7 @@
 #include <linux/mtd/blktrans.h>
 #include <linux/mtd/mtd.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include <linux/jiffies.h>
 
 #include <asm/types.h>
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 6a61b9f286e1..0aac6a61337d 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -32,6 +32,7 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
+#include <linux/sched.h>		/* signal_pending() */
 #include <linux/pcieport_if.h>
 #include "pci_hotplug.h"
 
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 0b8b26beb163..ac1e495c314e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -30,6 +30,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/signal.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 
diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 4f7ed4bd3be9..94e30fe4b8f3 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -24,6 +24,8 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 
 #include "rio.h"
 
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 30a11436e241..bef9316e95df 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -15,6 +15,7 @@
 #include <linux/rio.h>
 #include <linux/rio_drv.h>
 #include <linux/stat.h>
+#include <linux/sched.h>	/* for capable() */
 
 #include "rio.h"
 
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index 3ca1011ceaac..5e382470faa2 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -23,6 +23,7 @@
 #include <linux/rio_regs.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 
 #include "rio.h"
 
diff --git a/drivers/usb/host/ohci-au1xxx.c b/drivers/usb/host/ohci-au1xxx.c
index d9cf3b327d96..77cd6ac07e3c 100644
--- a/drivers/usb/host/ohci-au1xxx.c
+++ b/drivers/usb/host/ohci-au1xxx.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 #include <asm/mach-au1x00/au1000.h>
 
diff --git a/drivers/usb/host/ohci-lh7a404.c b/drivers/usb/host/ohci-lh7a404.c
index 3959ccc88332..0020ed7a39d0 100644
--- a/drivers/usb/host/ohci-lh7a404.c
+++ b/drivers/usb/host/ohci-lh7a404.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 #include <asm/hardware.h>
 
diff --git a/drivers/usb/host/ohci-ppc-soc.c b/drivers/usb/host/ohci-ppc-soc.c
index 2ec6a78bd65e..b2a8dfa48870 100644
--- a/drivers/usb/host/ohci-ppc-soc.c
+++ b/drivers/usb/host/ohci-ppc-soc.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/signal.h>
 
 /* configure so an HC device and id are always provided */
 /* always called with process context; sleeping is OK */
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index 3bd7cce19e26..157d7e3236b5 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -21,6 +21,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/device.h>
+#include <linux/string.h>
 #include <linux/rio.h>
 
 extern int __rio_local_read_config_32(struct rio_mport *port, u32 offset,
-- 
cgit v1.2.3


From 705b6c7b34f2621f95f606d0e683daa10cdb8eb9 Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Sun, 8 Jan 2006 01:02:06 -0800
Subject: [PATCH] new driver synclink_gt

New character device driver for the SyncLink GT and SyncLink AC families of
synchronous and asynchronous serial adapters

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/Kconfig       |    8 +
 drivers/char/Makefile      |    1 +
 drivers/char/synclink_gt.c | 4501 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/synclink.h   |    9 +-
 4 files changed, 4518 insertions(+), 1 deletion(-)
 create mode 100644 drivers/char/synclink_gt.c

(limited to 'include/linux')

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index b1b34edcd70c..dd7e6901c575 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -220,6 +220,14 @@ config SYNCLINKMP
 	  The module will be called synclinkmp.  If you want to do that, say M
 	  here.
 
+config SYNCLINK_GT
+	tristate "SyncLink GT/AC support"
+	depends on SERIAL_NONSTANDARD
+	help
+	  Support for SyncLink GT and SyncLink AC families of
+	  synchronous and asynchronous serial adapters
+	  manufactured by Microgate Systems, Ltd. (www.microgate.com)
+
 config N_HDLC
 	tristate "HDLC line discipline support"
 	depends on SERIAL_NONSTANDARD
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 4aeae687e88a..d973d14d8f7f 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_RISCOM8)		+= riscom8.o
 obj-$(CONFIG_ISI)		+= isicom.o
 obj-$(CONFIG_SYNCLINK)		+= synclink.o
 obj-$(CONFIG_SYNCLINKMP)	+= synclinkmp.o
+obj-$(CONFIG_SYNCLINK_GT)	+= synclink_gt.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
 obj-$(CONFIG_SX)		+= sx.o generic_serial.o
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
new file mode 100644
index 000000000000..2b9cde94e2f7
--- /dev/null
+++ b/drivers/char/synclink_gt.c
@@ -0,0 +1,4501 @@
+/*
+ * $Id: synclink_gt.c,v 4.20 2005/11/08 19:51:55 paulkf Exp $
+ *
+ * Device driver for Microgate SyncLink GT serial adapters.
+ *
+ * written by Paul Fulghum for Microgate Corporation
+ * paulkf@microgate.com
+ *
+ * Microgate and SyncLink are trademarks of Microgate Corporation
+ *
+ * This code is released under the GNU General Public License (GPL)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * DEBUG OUTPUT DEFINITIONS
+ *
+ * uncomment lines below to enable specific types of debug output
+ *
+ * DBGINFO   information - most verbose output
+ * DBGERR    serious errors
+ * DBGBH     bottom half service routine debugging
+ * DBGISR    interrupt service routine debugging
+ * DBGDATA   output receive and transmit data
+ * DBGTBUF   output transmit DMA buffers and registers
+ * DBGRBUF   output receive DMA buffers and registers
+ */
+
+#define DBGINFO(fmt) if (debug_level >= DEBUG_LEVEL_INFO) printk fmt
+#define DBGERR(fmt) if (debug_level >= DEBUG_LEVEL_ERROR) printk fmt
+#define DBGBH(fmt) if (debug_level >= DEBUG_LEVEL_BH) printk fmt
+#define DBGISR(fmt) if (debug_level >= DEBUG_LEVEL_ISR) printk fmt
+#define DBGDATA(info, buf, size, label) if (debug_level >= DEBUG_LEVEL_DATA) trace_block((info), (buf), (size), (label))
+//#define DBGTBUF(info) dump_tbufs(info)
+//#define DBGRBUF(info) dump_rbufs(info)
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/ioctl.h>
+#include <linux/termios.h>
+#include <linux/bitops.h>
+#include <linux/workqueue.h>
+#include <linux/hdlc.h>
+
+#include <asm/serial.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/dma.h>
+#include <asm/types.h>
+#include <asm/uaccess.h>
+
+#include "linux/synclink.h"
+
+#ifdef CONFIG_HDLC_MODULE
+#define CONFIG_HDLC 1
+#endif
+
+/*
+ * module identification
+ */
+static char *driver_name     = "SyncLink GT";
+static char *driver_version  = "$Revision: 4.20 $";
+static char *tty_driver_name = "synclink_gt";
+static char *tty_dev_prefix  = "ttySLG";
+MODULE_LICENSE("GPL");
+#define MGSL_MAGIC 0x5401
+#define MAX_DEVICES 12
+
+static struct pci_device_id pci_table[] = {
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_GT4_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PCI_VENDOR_ID_MICROGATE, SYNCLINK_AC_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{0,}, /* terminate list */
+};
+MODULE_DEVICE_TABLE(pci, pci_table);
+
+static int  init_one(struct pci_dev *dev,const struct pci_device_id *ent);
+static void remove_one(struct pci_dev *dev);
+static struct pci_driver pci_driver = {
+	.name		= "synclink_gt",
+	.id_table	= pci_table,
+	.probe		= init_one,
+	.remove		= __devexit_p(remove_one),
+};
+
+static int pci_registered;
+
+/*
+ * module configuration and status
+ */
+static struct slgt_info *slgt_device_list;
+static int slgt_device_count;
+
+static int ttymajor;
+static int debug_level;
+static int maxframe[MAX_DEVICES];
+static int dosyncppp[MAX_DEVICES];
+
+module_param(ttymajor, int, 0);
+module_param(debug_level, int, 0);
+module_param_array(maxframe, int, NULL, 0);
+module_param_array(dosyncppp, int, NULL, 0);
+
+MODULE_PARM_DESC(ttymajor, "TTY major device number override: 0=auto assigned");
+MODULE_PARM_DESC(debug_level, "Debug syslog output: 0=disabled, 1 to 5=increasing detail");
+MODULE_PARM_DESC(maxframe, "Maximum frame size used by device (4096 to 65535)");
+MODULE_PARM_DESC(dosyncppp, "Enable synchronous net device, 0=disable 1=enable");
+
+/*
+ * tty support and callbacks
+ */
+#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
+
+static struct tty_driver *serial_driver;
+
+static int  open(struct tty_struct *tty, struct file * filp);
+static void close(struct tty_struct *tty, struct file * filp);
+static void hangup(struct tty_struct *tty);
+static void set_termios(struct tty_struct *tty, struct termios *old_termios);
+
+static int  write(struct tty_struct *tty, const unsigned char *buf, int count);
+static void put_char(struct tty_struct *tty, unsigned char ch);
+static void send_xchar(struct tty_struct *tty, char ch);
+static void wait_until_sent(struct tty_struct *tty, int timeout);
+static int  write_room(struct tty_struct *tty);
+static void flush_chars(struct tty_struct *tty);
+static void flush_buffer(struct tty_struct *tty);
+static void tx_hold(struct tty_struct *tty);
+static void tx_release(struct tty_struct *tty);
+
+static int  ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg);
+static int  read_proc(char *page, char **start, off_t off, int count,int *eof, void *data);
+static int  chars_in_buffer(struct tty_struct *tty);
+static void throttle(struct tty_struct * tty);
+static void unthrottle(struct tty_struct * tty);
+static void set_break(struct tty_struct *tty, int break_state);
+
+/*
+ * generic HDLC support and callbacks
+ */
+#ifdef CONFIG_HDLC
+#define dev_to_port(D) (dev_to_hdlc(D)->priv)
+static void hdlcdev_tx_done(struct slgt_info *info);
+static void hdlcdev_rx(struct slgt_info *info, char *buf, int size);
+static int  hdlcdev_init(struct slgt_info *info);
+static void hdlcdev_exit(struct slgt_info *info);
+#endif
+
+
+/*
+ * device specific structures, macros and functions
+ */
+
+#define SLGT_MAX_PORTS 4
+#define SLGT_REG_SIZE  256
+
+/*
+ * DMA buffer descriptor and access macros
+ */
+struct slgt_desc
+{
+	unsigned short count;
+	unsigned short status;
+	unsigned int pbuf;  /* physical address of data buffer */
+	unsigned int next;  /* physical address of next descriptor */
+
+	/* driver book keeping */
+	char *buf;          /* virtual  address of data buffer */
+    	unsigned int pdesc; /* physical address of this descriptor */
+	dma_addr_t buf_dma_addr;
+};
+
+#define set_desc_buffer(a,b) (a).pbuf = cpu_to_le32((unsigned int)(b))
+#define set_desc_next(a,b) (a).next   = cpu_to_le32((unsigned int)(b))
+#define set_desc_count(a,b)(a).count  = cpu_to_le16((unsigned short)(b))
+#define set_desc_eof(a,b)  (a).status = cpu_to_le16((b) ? (le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0))
+#define desc_count(a)      (le16_to_cpu((a).count))
+#define desc_status(a)     (le16_to_cpu((a).status))
+#define desc_complete(a)   (le16_to_cpu((a).status) & BIT15)
+#define desc_eof(a)        (le16_to_cpu((a).status) & BIT2)
+#define desc_crc_error(a)  (le16_to_cpu((a).status) & BIT1)
+#define desc_abort(a)      (le16_to_cpu((a).status) & BIT0)
+#define desc_residue(a)    ((le16_to_cpu((a).status) & 0x38) >> 3)
+
+struct _input_signal_events {
+	int ri_up;
+	int ri_down;
+	int dsr_up;
+	int dsr_down;
+	int dcd_up;
+	int dcd_down;
+	int cts_up;
+	int cts_down;
+};
+
+/*
+ * device instance data structure
+ */
+struct slgt_info {
+	void *if_ptr;		/* General purpose pointer (used by SPPP) */
+
+	struct slgt_info *next_device;	/* device list link */
+
+	int magic;
+	int flags;
+
+	char device_name[25];
+	struct pci_dev *pdev;
+
+	int port_count;  /* count of ports on adapter */
+	int adapter_num; /* adapter instance number */
+	int port_num;    /* port instance number */
+
+	/* array of pointers to port contexts on this adapter */
+	struct slgt_info *port_array[SLGT_MAX_PORTS];
+
+	int			count;		/* count of opens */
+	int			line;		/* tty line instance number */
+	unsigned short		close_delay;
+	unsigned short		closing_wait;	/* time to wait before closing */
+
+	struct mgsl_icount	icount;
+
+	struct tty_struct 	*tty;
+	int			timeout;
+	int			x_char;		/* xon/xoff character */
+	int			blocked_open;	/* # of blocked opens */
+	unsigned int		read_status_mask;
+	unsigned int 		ignore_status_mask;
+
+	wait_queue_head_t	open_wait;
+	wait_queue_head_t	close_wait;
+
+	wait_queue_head_t	status_event_wait_q;
+	wait_queue_head_t	event_wait_q;
+	struct timer_list	tx_timer;
+	struct timer_list	rx_timer;
+
+	spinlock_t lock;	/* spinlock for synchronizing with ISR */
+
+	struct work_struct task;
+	u32 pending_bh;
+	int bh_requested;
+	int bh_running;
+
+	int isr_overflow;
+	int irq_requested;	/* nonzero if IRQ requested */
+	int irq_occurred;	/* for diagnostics use */
+
+	/* device configuration */
+
+	unsigned int bus_type;
+	unsigned int irq_level;
+	unsigned long irq_flags;
+
+	unsigned char __iomem * reg_addr;  /* memory mapped registers address */
+	u32 phys_reg_addr;
+	u32 reg_offset;
+	int reg_addr_requested;
+
+	MGSL_PARAMS params;       /* communications parameters */
+	u32 idle_mode;
+	u32 max_frame_size;       /* as set by device config */
+
+	unsigned int raw_rx_size;
+	unsigned int if_mode;
+
+	/* device status */
+
+	int rx_enabled;
+	int rx_restart;
+
+	int tx_enabled;
+	int tx_active;
+
+	unsigned char signals;    /* serial signal states */
+	unsigned int init_error;  /* initialization error */
+
+	unsigned char *tx_buf;
+	int tx_count;
+
+	char flag_buf[MAX_ASYNC_BUFFER_SIZE];
+	char char_buf[MAX_ASYNC_BUFFER_SIZE];
+	BOOLEAN drop_rts_on_tx_done;
+	struct	_input_signal_events	input_signal_events;
+
+	int dcd_chkcount;	/* check counts to prevent */
+	int cts_chkcount;	/* too many IRQs if a signal */
+	int dsr_chkcount;	/* is floating */
+	int ri_chkcount;
+
+	char *bufs;		/* virtual address of DMA buffer lists */
+	dma_addr_t bufs_dma_addr; /* physical address of buffer descriptors */
+
+	unsigned int rbuf_count;
+	struct slgt_desc *rbufs;
+	unsigned int rbuf_current;
+	unsigned int rbuf_index;
+
+	unsigned int tbuf_count;
+	struct slgt_desc *tbufs;
+	unsigned int tbuf_current;
+	unsigned int tbuf_start;
+
+	unsigned char *tmp_rbuf;
+	unsigned int tmp_rbuf_count;
+
+	/* SPPP/Cisco HDLC device parts */
+
+	int netcount;
+	int dosyncppp;
+	spinlock_t netlock;
+#ifdef CONFIG_HDLC
+	struct net_device *netdev;
+#endif
+
+};
+
+static MGSL_PARAMS default_params = {
+	.mode            = MGSL_MODE_HDLC,
+	.loopback        = 0,
+	.flags           = HDLC_FLAG_UNDERRUN_ABORT15,
+	.encoding        = HDLC_ENCODING_NRZI_SPACE,
+	.clock_speed     = 0,
+	.addr_filter     = 0xff,
+	.crc_type        = HDLC_CRC_16_CCITT,
+	.preamble_length = HDLC_PREAMBLE_LENGTH_8BITS,
+	.preamble        = HDLC_PREAMBLE_PATTERN_NONE,
+	.data_rate       = 9600,
+	.data_bits       = 8,
+	.stop_bits       = 1,
+	.parity          = ASYNC_PARITY_NONE
+};
+
+
+#define BH_RECEIVE  1
+#define BH_TRANSMIT 2
+#define BH_STATUS   4
+#define IO_PIN_SHUTDOWN_LIMIT 100
+
+#define DMABUFSIZE 256
+#define DESC_LIST_SIZE 4096
+
+#define MASK_PARITY  BIT1
+#define MASK_FRAMING BIT2
+#define MASK_BREAK   BIT3
+#define MASK_OVERRUN BIT4
+
+#define GSR   0x00 /* global status */
+#define TDR   0x80 /* tx data */
+#define RDR   0x80 /* rx data */
+#define TCR   0x82 /* tx control */
+#define TIR   0x84 /* tx idle */
+#define TPR   0x85 /* tx preamble */
+#define RCR   0x86 /* rx control */
+#define VCR   0x88 /* V.24 control */
+#define CCR   0x89 /* clock control */
+#define BDR   0x8a /* baud divisor */
+#define SCR   0x8c /* serial control */
+#define SSR   0x8e /* serial status */
+#define RDCSR 0x90 /* rx DMA control/status */
+#define TDCSR 0x94 /* tx DMA control/status */
+#define RDDAR 0x98 /* rx DMA descriptor address */
+#define TDDAR 0x9c /* tx DMA descriptor address */
+
+#define RXIDLE      BIT14
+#define RXBREAK     BIT14
+#define IRQ_TXDATA  BIT13
+#define IRQ_TXIDLE  BIT12
+#define IRQ_TXUNDER BIT11 /* HDLC */
+#define IRQ_RXDATA  BIT10
+#define IRQ_RXIDLE  BIT9  /* HDLC */
+#define IRQ_RXBREAK BIT9  /* async */
+#define IRQ_RXOVER  BIT8
+#define IRQ_DSR     BIT7
+#define IRQ_CTS     BIT6
+#define IRQ_DCD     BIT5
+#define IRQ_RI      BIT4
+#define IRQ_ALL     0x3ff0
+#define IRQ_MASTER  BIT0
+
+#define slgt_irq_on(info, mask) \
+	wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) | (mask)))
+#define slgt_irq_off(info, mask) \
+	wr_reg16((info), SCR, (unsigned short)(rd_reg16((info), SCR) & ~(mask)))
+
+static __u8  rd_reg8(struct slgt_info *info, unsigned int addr);
+static void  wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value);
+static __u16 rd_reg16(struct slgt_info *info, unsigned int addr);
+static void  wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value);
+static __u32 rd_reg32(struct slgt_info *info, unsigned int addr);
+static void  wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value);
+
+static void  msc_set_vcr(struct slgt_info *info);
+
+static int  startup(struct slgt_info *info);
+static int  block_til_ready(struct tty_struct *tty, struct file * filp,struct slgt_info *info);
+static void shutdown(struct slgt_info *info);
+static void program_hw(struct slgt_info *info);
+static void change_params(struct slgt_info *info);
+
+static int  register_test(struct slgt_info *info);
+static int  irq_test(struct slgt_info *info);
+static int  loopback_test(struct slgt_info *info);
+static int  adapter_test(struct slgt_info *info);
+
+static void reset_adapter(struct slgt_info *info);
+static void reset_port(struct slgt_info *info);
+static void async_mode(struct slgt_info *info);
+static void hdlc_mode(struct slgt_info *info);
+
+static void rx_stop(struct slgt_info *info);
+static void rx_start(struct slgt_info *info);
+static void reset_rbufs(struct slgt_info *info);
+static void free_rbufs(struct slgt_info *info, unsigned int first, unsigned int last);
+static void rdma_reset(struct slgt_info *info);
+static int  rx_get_frame(struct slgt_info *info);
+static int  rx_get_buf(struct slgt_info *info);
+
+static void tx_start(struct slgt_info *info);
+static void tx_stop(struct slgt_info *info);
+static void tx_set_idle(struct slgt_info *info);
+static unsigned int free_tbuf_count(struct slgt_info *info);
+static void reset_tbufs(struct slgt_info *info);
+static void tdma_reset(struct slgt_info *info);
+static void tx_load(struct slgt_info *info, const char *buf, unsigned int count);
+
+static void get_signals(struct slgt_info *info);
+static void set_signals(struct slgt_info *info);
+static void enable_loopback(struct slgt_info *info);
+static void set_rate(struct slgt_info *info, u32 data_rate);
+
+static int  bh_action(struct slgt_info *info);
+static void bh_handler(void* context);
+static void bh_transmit(struct slgt_info *info);
+static void isr_serial(struct slgt_info *info);
+static void isr_rdma(struct slgt_info *info);
+static void isr_txeom(struct slgt_info *info, unsigned short status);
+static void isr_tdma(struct slgt_info *info);
+static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs);
+
+static int  alloc_dma_bufs(struct slgt_info *info);
+static void free_dma_bufs(struct slgt_info *info);
+static int  alloc_desc(struct slgt_info *info);
+static void free_desc(struct slgt_info *info);
+static int  alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count);
+static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count);
+
+static int  alloc_tmp_rbuf(struct slgt_info *info);
+static void free_tmp_rbuf(struct slgt_info *info);
+
+static void tx_timeout(unsigned long context);
+static void rx_timeout(unsigned long context);
+
+/*
+ * ioctl handlers
+ */
+static int  get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount);
+static int  get_params(struct slgt_info *info, MGSL_PARAMS __user *params);
+static int  set_params(struct slgt_info *info, MGSL_PARAMS __user *params);
+static int  get_txidle(struct slgt_info *info, int __user *idle_mode);
+static int  set_txidle(struct slgt_info *info, int idle_mode);
+static int  tx_enable(struct slgt_info *info, int enable);
+static int  tx_abort(struct slgt_info *info);
+static int  rx_enable(struct slgt_info *info, int enable);
+static int  modem_input_wait(struct slgt_info *info,int arg);
+static int  wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr);
+static int  tiocmget(struct tty_struct *tty, struct file *file);
+static int  tiocmset(struct tty_struct *tty, struct file *file,
+		     unsigned int set, unsigned int clear);
+static void set_break(struct tty_struct *tty, int break_state);
+static int  get_interface(struct slgt_info *info, int __user *if_mode);
+static int  set_interface(struct slgt_info *info, int if_mode);
+
+/*
+ * driver functions
+ */
+static void add_device(struct slgt_info *info);
+static void device_init(int adapter_num, struct pci_dev *pdev);
+static int  claim_resources(struct slgt_info *info);
+static void release_resources(struct slgt_info *info);
+
+/*
+ * DEBUG OUTPUT CODE
+ */
+#ifndef DBGINFO
+#define DBGINFO(fmt)
+#endif
+#ifndef DBGERR
+#define DBGERR(fmt)
+#endif
+#ifndef DBGBH
+#define DBGBH(fmt)
+#endif
+#ifndef DBGISR
+#define DBGISR(fmt)
+#endif
+
+#ifdef DBGDATA
+static void trace_block(struct slgt_info *info, const char *data, int count, const char *label)
+{
+	int i;
+	int linecount;
+	printk("%s %s data:\n",info->device_name, label);
+	while(count) {
+		linecount = (count > 16) ? 16 : count;
+		for(i=0; i < linecount; i++)
+			printk("%02X ",(unsigned char)data[i]);
+		for(;i<17;i++)
+			printk("   ");
+		for(i=0;i<linecount;i++) {
+			if (data[i]>=040 && data[i]<=0176)
+				printk("%c",data[i]);
+			else
+				printk(".");
+		}
+		printk("\n");
+		data  += linecount;
+		count -= linecount;
+	}
+}
+#else
+#define DBGDATA(info, buf, size, label)
+#endif
+
+#ifdef DBGTBUF
+static void dump_tbufs(struct slgt_info *info)
+{
+	int i;
+	printk("tbuf_current=%d\n", info->tbuf_current);
+	for (i=0 ; i < info->tbuf_count ; i++) {
+		printk("%d: count=%04X status=%04X\n",
+			i, le16_to_cpu(info->tbufs[i].count), le16_to_cpu(info->tbufs[i].status));
+	}
+}
+#else
+#define DBGTBUF(info)
+#endif
+
+#ifdef DBGRBUF
+static void dump_rbufs(struct slgt_info *info)
+{
+	int i;
+	printk("rbuf_current=%d\n", info->rbuf_current);
+	for (i=0 ; i < info->rbuf_count ; i++) {
+		printk("%d: count=%04X status=%04X\n",
+			i, le16_to_cpu(info->rbufs[i].count), le16_to_cpu(info->rbufs[i].status));
+	}
+}
+#else
+#define DBGRBUF(info)
+#endif
+
+static inline int sanity_check(struct slgt_info *info, char *devname, const char *name)
+{
+#ifdef SANITY_CHECK
+	if (!info) {
+		printk("null struct slgt_info for (%s) in %s\n", devname, name);
+		return 1;
+	}
+	if (info->magic != MGSL_MAGIC) {
+		printk("bad magic number struct slgt_info (%s) in %s\n", devname, name);
+		return 1;
+	}
+#else
+	if (!info)
+		return 1;
+#endif
+	return 0;
+}
+
+/**
+ * line discipline callback wrappers
+ *
+ * The wrappers maintain line discipline references
+ * while calling into the line discipline.
+ *
+ * ldisc_receive_buf  - pass receive data to line discipline
+ */
+static void ldisc_receive_buf(struct tty_struct *tty,
+			      const __u8 *data, char *flags, int count)
+{
+	struct tty_ldisc *ld;
+	if (!tty)
+		return;
+	ld = tty_ldisc_ref(tty);
+	if (ld) {
+		if (ld->receive_buf)
+			ld->receive_buf(tty, data, flags, count);
+		tty_ldisc_deref(ld);
+	}
+}
+
+/* tty callbacks */
+
+static int open(struct tty_struct *tty, struct file *filp)
+{
+	struct slgt_info *info;
+	int retval, line;
+	unsigned long flags;
+
+	line = tty->index;
+	if ((line < 0) || (line >= slgt_device_count)) {
+		DBGERR(("%s: open with invalid line #%d.\n", driver_name, line));
+		return -ENODEV;
+	}
+
+	info = slgt_device_list;
+	while(info && info->line != line)
+		info = info->next_device;
+	if (sanity_check(info, tty->name, "open"))
+		return -ENODEV;
+	if (info->init_error) {
+		DBGERR(("%s init error=%d\n", info->device_name, info->init_error));
+		return -ENODEV;
+	}
+
+	tty->driver_data = info;
+	info->tty = tty;
+
+	DBGINFO(("%s open, old ref count = %d\n", info->device_name, info->count));
+
+	/* If port is closing, signal caller to try again */
+	if (tty_hung_up_p(filp) || info->flags & ASYNC_CLOSING){
+		if (info->flags & ASYNC_CLOSING)
+			interruptible_sleep_on(&info->close_wait);
+		retval = ((info->flags & ASYNC_HUP_NOTIFY) ?
+			-EAGAIN : -ERESTARTSYS);
+		goto cleanup;
+	}
+
+	info->tty->low_latency = (info->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+	spin_lock_irqsave(&info->netlock, flags);
+	if (info->netcount) {
+		retval = -EBUSY;
+		spin_unlock_irqrestore(&info->netlock, flags);
+		goto cleanup;
+	}
+	info->count++;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	if (info->count == 1) {
+		/* 1st open on this device, init hardware */
+		retval = startup(info);
+		if (retval < 0)
+			goto cleanup;
+	}
+
+	retval = block_til_ready(tty, filp, info);
+	if (retval) {
+		DBGINFO(("%s block_til_ready rc=%d\n", info->device_name, retval));
+		goto cleanup;
+	}
+
+	retval = 0;
+
+cleanup:
+	if (retval) {
+		if (tty->count == 1)
+			info->tty = NULL; /* tty layer will release tty struct */
+		if(info->count)
+			info->count--;
+	}
+
+	DBGINFO(("%s open rc=%d\n", info->device_name, retval));
+	return retval;
+}
+
+static void close(struct tty_struct *tty, struct file *filp)
+{
+	struct slgt_info *info = tty->driver_data;
+
+	if (sanity_check(info, tty->name, "close"))
+		return;
+	DBGINFO(("%s close entry, count=%d\n", info->device_name, info->count));
+
+	if (!info->count)
+		return;
+
+	if (tty_hung_up_p(filp))
+		goto cleanup;
+
+	if ((tty->count == 1) && (info->count != 1)) {
+		/*
+		 * tty->count is 1 and the tty structure will be freed.
+		 * info->count should be one in this case.
+		 * if it's not, correct it so that the port is shutdown.
+		 */
+		DBGERR(("%s close: bad refcount; tty->count=1, "
+		       "info->count=%d\n", info->device_name, info->count));
+		info->count = 1;
+	}
+
+	info->count--;
+
+	/* if at least one open remaining, leave hardware active */
+	if (info->count)
+		goto cleanup;
+
+	info->flags |= ASYNC_CLOSING;
+
+	/* set tty->closing to notify line discipline to
+	 * only process XON/XOFF characters. Only the N_TTY
+	 * discipline appears to use this (ppp does not).
+	 */
+	tty->closing = 1;
+
+	/* wait for transmit data to clear all layers */
+
+	if (info->closing_wait != ASYNC_CLOSING_WAIT_NONE) {
+		DBGINFO(("%s call tty_wait_until_sent\n", info->device_name));
+		tty_wait_until_sent(tty, info->closing_wait);
+	}
+
+ 	if (info->flags & ASYNC_INITIALIZED)
+ 		wait_until_sent(tty, info->timeout);
+	if (tty->driver->flush_buffer)
+		tty->driver->flush_buffer(tty);
+	tty_ldisc_flush(tty);
+
+	shutdown(info);
+
+	tty->closing = 0;
+	info->tty = NULL;
+
+	if (info->blocked_open) {
+		if (info->close_delay) {
+			msleep_interruptible(jiffies_to_msecs(info->close_delay));
+		}
+		wake_up_interruptible(&info->open_wait);
+	}
+
+	info->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+
+	wake_up_interruptible(&info->close_wait);
+
+cleanup:
+	DBGINFO(("%s close exit, count=%d\n", tty->driver->name, info->count));
+}
+
+static void hangup(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+
+	if (sanity_check(info, tty->name, "hangup"))
+		return;
+	DBGINFO(("%s hangup\n", info->device_name));
+
+	flush_buffer(tty);
+	shutdown(info);
+
+	info->count = 0;
+	info->flags &= ~ASYNC_NORMAL_ACTIVE;
+	info->tty = NULL;
+
+	wake_up_interruptible(&info->open_wait);
+}
+
+static void set_termios(struct tty_struct *tty, struct termios *old_termios)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	DBGINFO(("%s set_termios\n", tty->driver->name));
+
+	/* just return if nothing has changed */
+	if ((tty->termios->c_cflag == old_termios->c_cflag)
+	    && (RELEVANT_IFLAG(tty->termios->c_iflag)
+		== RELEVANT_IFLAG(old_termios->c_iflag)))
+		return;
+
+	change_params(info);
+
+	/* Handle transition to B0 status */
+	if (old_termios->c_cflag & CBAUD &&
+	    !(tty->termios->c_cflag & CBAUD)) {
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
+		spin_lock_irqsave(&info->lock,flags);
+		set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+
+	/* Handle transition away from B0 status */
+	if (!(old_termios->c_cflag & CBAUD) &&
+	    tty->termios->c_cflag & CBAUD) {
+		info->signals |= SerialSignal_DTR;
+ 		if (!(tty->termios->c_cflag & CRTSCTS) ||
+ 		    !test_bit(TTY_THROTTLED, &tty->flags)) {
+			info->signals |= SerialSignal_RTS;
+ 		}
+		spin_lock_irqsave(&info->lock,flags);
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+
+	/* Handle turning off CRTSCTS */
+	if (old_termios->c_cflag & CRTSCTS &&
+	    !(tty->termios->c_cflag & CRTSCTS)) {
+		tty->hw_stopped = 0;
+		tx_release(tty);
+	}
+}
+
+static int write(struct tty_struct *tty,
+		 const unsigned char *buf, int count)
+{
+	int ret = 0;
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "write"))
+		goto cleanup;
+	DBGINFO(("%s write count=%d\n", info->device_name, count));
+
+	if (!tty || !info->tx_buf)
+		goto cleanup;
+
+	if (count > info->max_frame_size) {
+		ret = -EIO;
+		goto cleanup;
+	}
+
+	if (!count)
+		goto cleanup;
+
+	if (info->params.mode == MGSL_MODE_RAW) {
+		unsigned int bufs_needed = (count/DMABUFSIZE);
+		unsigned int bufs_free = free_tbuf_count(info);
+		if (count % DMABUFSIZE)
+			++bufs_needed;
+		if (bufs_needed > bufs_free)
+			goto cleanup;
+	} else {
+		if (info->tx_active)
+			goto cleanup;
+		if (info->tx_count) {
+			/* send accumulated data from send_char() calls */
+			/* as frame and wait before accepting more data. */
+			tx_load(info, info->tx_buf, info->tx_count);
+			goto start;
+		}
+	}
+
+	ret = info->tx_count = count;
+	tx_load(info, buf, count);
+	goto start;
+
+start:
+ 	if (info->tx_count && !tty->stopped && !tty->hw_stopped) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!info->tx_active)
+		 	tx_start(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+ 	}
+
+cleanup:
+	DBGINFO(("%s write rc=%d\n", info->device_name, ret));
+	return ret;
+}
+
+static void put_char(struct tty_struct *tty, unsigned char ch)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "put_char"))
+		return;
+	DBGINFO(("%s put_char(%d)\n", info->device_name, ch));
+	if (!tty || !info->tx_buf)
+		return;
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && (info->tx_count < info->max_frame_size))
+		info->tx_buf[info->tx_count++] = ch;
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+static void send_xchar(struct tty_struct *tty, char ch)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "send_xchar"))
+		return;
+	DBGINFO(("%s send_xchar(%d)\n", info->device_name, ch));
+	info->x_char = ch;
+	if (ch) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!info->tx_enabled)
+		 	tx_start(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+static void wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long orig_jiffies, char_time;
+
+	if (!info )
+		return;
+	if (sanity_check(info, tty->name, "wait_until_sent"))
+		return;
+	DBGINFO(("%s wait_until_sent entry\n", info->device_name));
+	if (!(info->flags & ASYNC_INITIALIZED))
+		goto exit;
+
+	orig_jiffies = jiffies;
+
+	/* Set check interval to 1/5 of estimated time to
+	 * send a character, and make it at least 1. The check
+	 * interval should also be less than the timeout.
+	 * Note: use tight timings here to satisfy the NIST-PCTS.
+	 */
+
+	if (info->params.data_rate) {
+	       	char_time = info->timeout/(32 * 5);
+		if (!char_time)
+			char_time++;
+	} else
+		char_time = 1;
+
+	if (timeout)
+		char_time = min_t(unsigned long, char_time, timeout);
+
+	while (info->tx_active) {
+		msleep_interruptible(jiffies_to_msecs(char_time));
+		if (signal_pending(current))
+			break;
+		if (timeout && time_after(jiffies, orig_jiffies + timeout))
+			break;
+	}
+
+exit:
+	DBGINFO(("%s wait_until_sent exit\n", info->device_name));
+}
+
+static int write_room(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	int ret;
+
+	if (sanity_check(info, tty->name, "write_room"))
+		return 0;
+	ret = (info->tx_active) ? 0 : HDLC_MAX_FRAME_SIZE;
+	DBGINFO(("%s write_room=%d\n", info->device_name, ret));
+	return ret;
+}
+
+static void flush_chars(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "flush_chars"))
+		return;
+	DBGINFO(("%s flush_chars entry tx_count=%d\n", info->device_name, info->tx_count));
+
+	if (info->tx_count <= 0 || tty->stopped ||
+	    tty->hw_stopped || !info->tx_buf)
+		return;
+
+	DBGINFO(("%s flush_chars start transmit\n", info->device_name));
+
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && info->tx_count) {
+		tx_load(info, info->tx_buf,info->tx_count);
+	 	tx_start(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+static void flush_buffer(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "flush_buffer"))
+		return;
+	DBGINFO(("%s flush_buffer\n", info->device_name));
+
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active)
+		info->tx_count = 0;
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	wake_up_interruptible(&tty->write_wait);
+	tty_wakeup(tty);
+}
+
+/*
+ * throttle (stop) transmitter
+ */
+static void tx_hold(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "tx_hold"))
+		return;
+	DBGINFO(("%s tx_hold\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	if (info->tx_enabled && info->params.mode == MGSL_MODE_ASYNC)
+	 	tx_stop(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * release (start) transmitter
+ */
+static void tx_release(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "tx_release"))
+		return;
+	DBGINFO(("%s tx_release\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active && info->tx_count) {
+		tx_load(info, info->tx_buf, info->tx_count);
+	 	tx_start(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * Service an IOCTL request
+ *
+ * Arguments
+ *
+ * 	tty	pointer to tty instance data
+ * 	file	pointer to associated file object for device
+ * 	cmd	IOCTL command code
+ * 	arg	command argument/context
+ *
+ * Return 0 if success, otherwise error code
+ */
+static int ioctl(struct tty_struct *tty, struct file *file,
+		 unsigned int cmd, unsigned long arg)
+{
+	struct slgt_info *info = tty->driver_data;
+	struct mgsl_icount cnow;	/* kernel counter temps */
+	struct serial_icounter_struct __user *p_cuser;	/* user space */
+	unsigned long flags;
+	void __user *argp = (void __user *)arg;
+
+	if (sanity_check(info, tty->name, "ioctl"))
+		return -ENODEV;
+	DBGINFO(("%s ioctl() cmd=%08X\n", info->device_name, cmd));
+
+	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
+	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
+		if (tty->flags & (1 << TTY_IO_ERROR))
+		    return -EIO;
+	}
+
+	switch (cmd) {
+	case MGSL_IOCGPARAMS:
+		return get_params(info, argp);
+	case MGSL_IOCSPARAMS:
+		return set_params(info, argp);
+	case MGSL_IOCGTXIDLE:
+		return get_txidle(info, argp);
+	case MGSL_IOCSTXIDLE:
+		return set_txidle(info, (int)arg);
+	case MGSL_IOCTXENABLE:
+		return tx_enable(info, (int)arg);
+	case MGSL_IOCRXENABLE:
+		return rx_enable(info, (int)arg);
+	case MGSL_IOCTXABORT:
+		return tx_abort(info);
+	case MGSL_IOCGSTATS:
+		return get_stats(info, argp);
+	case MGSL_IOCWAITEVENT:
+		return wait_mgsl_event(info, argp);
+	case TIOCMIWAIT:
+		return modem_input_wait(info,(int)arg);
+	case MGSL_IOCGIF:
+		return get_interface(info, argp);
+	case MGSL_IOCSIF:
+		return set_interface(info,(int)arg);
+	case TIOCGICOUNT:
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		spin_unlock_irqrestore(&info->lock,flags);
+		p_cuser = argp;
+		if (put_user(cnow.cts, &p_cuser->cts) ||
+		    put_user(cnow.dsr, &p_cuser->dsr) ||
+		    put_user(cnow.rng, &p_cuser->rng) ||
+		    put_user(cnow.dcd, &p_cuser->dcd) ||
+		    put_user(cnow.rx, &p_cuser->rx) ||
+		    put_user(cnow.tx, &p_cuser->tx) ||
+		    put_user(cnow.frame, &p_cuser->frame) ||
+		    put_user(cnow.overrun, &p_cuser->overrun) ||
+		    put_user(cnow.parity, &p_cuser->parity) ||
+		    put_user(cnow.brk, &p_cuser->brk) ||
+		    put_user(cnow.buf_overrun, &p_cuser->buf_overrun))
+			return -EFAULT;
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return 0;
+}
+
+/*
+ * proc fs support
+ */
+static inline int line_info(char *buf, struct slgt_info *info)
+{
+	char stat_buf[30];
+	int ret;
+	unsigned long flags;
+
+	ret = sprintf(buf, "%s: IO=%08X IRQ=%d MaxFrameSize=%u\n",
+		      info->device_name, info->phys_reg_addr,
+		      info->irq_level, info->max_frame_size);
+
+	/* output current serial signal states */
+	spin_lock_irqsave(&info->lock,flags);
+	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	stat_buf[0] = 0;
+	stat_buf[1] = 0;
+	if (info->signals & SerialSignal_RTS)
+		strcat(stat_buf, "|RTS");
+	if (info->signals & SerialSignal_CTS)
+		strcat(stat_buf, "|CTS");
+	if (info->signals & SerialSignal_DTR)
+		strcat(stat_buf, "|DTR");
+	if (info->signals & SerialSignal_DSR)
+		strcat(stat_buf, "|DSR");
+	if (info->signals & SerialSignal_DCD)
+		strcat(stat_buf, "|CD");
+	if (info->signals & SerialSignal_RI)
+		strcat(stat_buf, "|RI");
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		ret += sprintf(buf+ret, "\tHDLC txok:%d rxok:%d",
+			       info->icount.txok, info->icount.rxok);
+		if (info->icount.txunder)
+			ret += sprintf(buf+ret, " txunder:%d", info->icount.txunder);
+		if (info->icount.txabort)
+			ret += sprintf(buf+ret, " txabort:%d", info->icount.txabort);
+		if (info->icount.rxshort)
+			ret += sprintf(buf+ret, " rxshort:%d", info->icount.rxshort);
+		if (info->icount.rxlong)
+			ret += sprintf(buf+ret, " rxlong:%d", info->icount.rxlong);
+		if (info->icount.rxover)
+			ret += sprintf(buf+ret, " rxover:%d", info->icount.rxover);
+		if (info->icount.rxcrc)
+			ret += sprintf(buf+ret, " rxcrc:%d", info->icount.rxcrc);
+	} else {
+		ret += sprintf(buf+ret, "\tASYNC tx:%d rx:%d",
+			       info->icount.tx, info->icount.rx);
+		if (info->icount.frame)
+			ret += sprintf(buf+ret, " fe:%d", info->icount.frame);
+		if (info->icount.parity)
+			ret += sprintf(buf+ret, " pe:%d", info->icount.parity);
+		if (info->icount.brk)
+			ret += sprintf(buf+ret, " brk:%d", info->icount.brk);
+		if (info->icount.overrun)
+			ret += sprintf(buf+ret, " oe:%d", info->icount.overrun);
+	}
+
+	/* Append serial signal status to end */
+	ret += sprintf(buf+ret, " %s\n", stat_buf+1);
+
+	ret += sprintf(buf+ret, "\ttxactive=%d bh_req=%d bh_run=%d pending_bh=%x\n",
+		       info->tx_active,info->bh_requested,info->bh_running,
+		       info->pending_bh);
+
+	return ret;
+}
+
+/* Called to print information about devices
+ */
+static int read_proc(char *page, char **start, off_t off, int count,
+		     int *eof, void *data)
+{
+	int len = 0, l;
+	off_t	begin = 0;
+	struct slgt_info *info;
+
+	len += sprintf(page, "synclink_gt driver:%s\n", driver_version);
+
+	info = slgt_device_list;
+	while( info ) {
+		l = line_info(page + len, info);
+		len += l;
+		if (len+begin > off+count)
+			goto done;
+		if (len+begin < off) {
+			begin += len;
+			len = 0;
+		}
+		info = info->next_device;
+	}
+
+	*eof = 1;
+done:
+	if (off >= len+begin)
+		return 0;
+	*start = page + (off-begin);
+	return ((count < begin+len-off) ? count : begin+len-off);
+}
+
+/*
+ * return count of bytes in transmit buffer
+ */
+static int chars_in_buffer(struct tty_struct *tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	if (sanity_check(info, tty->name, "chars_in_buffer"))
+		return 0;
+	DBGINFO(("%s chars_in_buffer()=%d\n", info->device_name, info->tx_count));
+	return info->tx_count;
+}
+
+/*
+ * signal remote device to throttle send data (our receive data)
+ */
+static void throttle(struct tty_struct * tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "throttle"))
+		return;
+	DBGINFO(("%s throttle\n", info->device_name));
+	if (I_IXOFF(tty))
+		send_xchar(tty, STOP_CHAR(tty));
+ 	if (tty->termios->c_cflag & CRTSCTS) {
+		spin_lock_irqsave(&info->lock,flags);
+		info->signals &= ~SerialSignal_RTS;
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+/*
+ * signal remote device to stop throttling send data (our receive data)
+ */
+static void unthrottle(struct tty_struct * tty)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "unthrottle"))
+		return;
+	DBGINFO(("%s unthrottle\n", info->device_name));
+	if (I_IXOFF(tty)) {
+		if (info->x_char)
+			info->x_char = 0;
+		else
+			send_xchar(tty, START_CHAR(tty));
+	}
+ 	if (tty->termios->c_cflag & CRTSCTS) {
+		spin_lock_irqsave(&info->lock,flags);
+		info->signals |= SerialSignal_RTS;
+	 	set_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+}
+
+/*
+ * set or clear transmit break condition
+ * break_state	-1=set break condition, 0=clear
+ */
+static void set_break(struct tty_struct *tty, int break_state)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned short value;
+	unsigned long flags;
+
+	if (sanity_check(info, tty->name, "set_break"))
+		return;
+	DBGINFO(("%s set_break(%d)\n", info->device_name, break_state));
+
+	spin_lock_irqsave(&info->lock,flags);
+	value = rd_reg16(info, TCR);
+ 	if (break_state == -1)
+		value |= BIT6;
+	else
+		value &= ~BIT6;
+	wr_reg16(info, TCR, value);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+#ifdef CONFIG_HDLC
+
+/**
+ * called by generic HDLC layer when protocol selected (PPP, frame relay, etc.)
+ * set encoding and frame check sequence (FCS) options
+ *
+ * dev       pointer to network device structure
+ * encoding  serial encoding setting
+ * parity    FCS setting
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_attach(struct net_device *dev, unsigned short encoding,
+			  unsigned short parity)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned char  new_encoding;
+	unsigned short new_crctype;
+
+	/* return error if TTY interface open */
+	if (info->count)
+		return -EBUSY;
+
+	DBGINFO(("%s hdlcdev_attach\n", info->device_name));
+
+	switch (encoding)
+	{
+	case ENCODING_NRZ:        new_encoding = HDLC_ENCODING_NRZ; break;
+	case ENCODING_NRZI:       new_encoding = HDLC_ENCODING_NRZI_SPACE; break;
+	case ENCODING_FM_MARK:    new_encoding = HDLC_ENCODING_BIPHASE_MARK; break;
+	case ENCODING_FM_SPACE:   new_encoding = HDLC_ENCODING_BIPHASE_SPACE; break;
+	case ENCODING_MANCHESTER: new_encoding = HDLC_ENCODING_BIPHASE_LEVEL; break;
+	default: return -EINVAL;
+	}
+
+	switch (parity)
+	{
+	case PARITY_NONE:            new_crctype = HDLC_CRC_NONE; break;
+	case PARITY_CRC16_PR1_CCITT: new_crctype = HDLC_CRC_16_CCITT; break;
+	case PARITY_CRC32_PR1_CCITT: new_crctype = HDLC_CRC_32_CCITT; break;
+	default: return -EINVAL;
+	}
+
+	info->params.encoding = new_encoding;
+	info->params.crc_type = new_crctype;;
+
+	/* if network interface up, reprogram hardware */
+	if (info->netcount)
+		program_hw(info);
+
+	return 0;
+}
+
+/**
+ * called by generic HDLC layer to send frame
+ *
+ * skb  socket buffer containing HDLC frame
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	struct net_device_stats *stats = hdlc_stats(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlc_xmit\n", dev->name));
+
+	/* stop sending until this frame completes */
+	netif_stop_queue(dev);
+
+	/* copy data to device buffers */
+	info->tx_count = skb->len;
+	tx_load(info, skb->data, skb->len);
+
+	/* update network statistics */
+	stats->tx_packets++;
+	stats->tx_bytes += skb->len;
+
+	/* done with socket buffer, so free it */
+	dev_kfree_skb(skb);
+
+	/* save start time for transmit timeout detection */
+	dev->trans_start = jiffies;
+
+	/* start hardware transmitter if necessary */
+	spin_lock_irqsave(&info->lock,flags);
+	if (!info->tx_active)
+	 	tx_start(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return 0;
+}
+
+/**
+ * called by network layer when interface enabled
+ * claim resources and initialize hardware
+ *
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_open(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	int rc;
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_open\n", dev->name));
+
+	/* generic HDLC layer open processing */
+	if ((rc = hdlc_open(dev)))
+		return rc;
+
+	/* arbitrate between network and tty opens */
+	spin_lock_irqsave(&info->netlock, flags);
+	if (info->count != 0 || info->netcount != 0) {
+		DBGINFO(("%s hdlc_open busy\n", dev->name));
+		spin_unlock_irqrestore(&info->netlock, flags);
+		return -EBUSY;
+	}
+	info->netcount=1;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	/* claim resources and init adapter */
+	if ((rc = startup(info)) != 0) {
+		spin_lock_irqsave(&info->netlock, flags);
+		info->netcount=0;
+		spin_unlock_irqrestore(&info->netlock, flags);
+		return rc;
+	}
+
+	/* assert DTR and RTS, apply hardware settings */
+	info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	program_hw(info);
+
+	/* enable network layer transmit */
+	dev->trans_start = jiffies;
+	netif_start_queue(dev);
+
+	/* inform generic HDLC layer of current DCD status */
+	spin_lock_irqsave(&info->lock, flags);
+	get_signals(info);
+	spin_unlock_irqrestore(&info->lock, flags);
+	hdlc_set_carrier(info->signals & SerialSignal_DCD, dev);
+
+	return 0;
+}
+
+/**
+ * called by network layer when interface is disabled
+ * shutdown hardware and release resources
+ *
+ * dev  pointer to network device structure
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_close(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_close\n", dev->name));
+
+	netif_stop_queue(dev);
+
+	/* shutdown adapter and release resources */
+	shutdown(info);
+
+	hdlc_close(dev);
+
+	spin_lock_irqsave(&info->netlock, flags);
+	info->netcount=0;
+	spin_unlock_irqrestore(&info->netlock, flags);
+
+	return 0;
+}
+
+/**
+ * called by network layer to process IOCTL call to network device
+ *
+ * dev  pointer to network device structure
+ * ifr  pointer to network interface request structure
+ * cmd  IOCTL command code
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	const size_t size = sizeof(sync_serial_settings);
+	sync_serial_settings new_line;
+	sync_serial_settings __user *line = ifr->ifr_settings.ifs_ifsu.sync;
+	struct slgt_info *info = dev_to_port(dev);
+	unsigned int flags;
+
+	DBGINFO(("%s hdlcdev_ioctl\n", dev->name));
+
+	/* return error if TTY interface open */
+	if (info->count)
+		return -EBUSY;
+
+	if (cmd != SIOCWANDEV)
+		return hdlc_ioctl(dev, ifr, cmd);
+
+	switch(ifr->ifr_settings.type) {
+	case IF_GET_IFACE: /* return current sync_serial_settings */
+
+		ifr->ifr_settings.type = IF_IFACE_SYNC_SERIAL;
+		if (ifr->ifr_settings.size < size) {
+			ifr->ifr_settings.size = size; /* data size wanted */
+			return -ENOBUFS;
+		}
+
+		flags = info->params.flags & (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
+
+		switch (flags){
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN): new_line.clock_type = CLOCK_EXT; break;
+		case (HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_INT; break;
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG):    new_line.clock_type = CLOCK_TXINT; break;
+		case (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN): new_line.clock_type = CLOCK_TXFROMRX; break;
+		default: new_line.clock_type = CLOCK_DEFAULT;
+		}
+
+		new_line.clock_rate = info->params.clock_speed;
+		new_line.loopback   = info->params.loopback ? 1:0;
+
+		if (copy_to_user(line, &new_line, size))
+			return -EFAULT;
+		return 0;
+
+	case IF_IFACE_SYNC_SERIAL: /* set sync_serial_settings */
+
+		if(!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (copy_from_user(&new_line, line, size))
+			return -EFAULT;
+
+		switch (new_line.clock_type)
+		{
+		case CLOCK_EXT:      flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_TXCPIN; break;
+		case CLOCK_TXFROMRX: flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_RXCPIN; break;
+		case CLOCK_INT:      flags = HDLC_FLAG_RXC_BRG    | HDLC_FLAG_TXC_BRG;    break;
+		case CLOCK_TXINT:    flags = HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_TXC_BRG;    break;
+		case CLOCK_DEFAULT:  flags = info->params.flags &
+					     (HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					      HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					      HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					      HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN); break;
+		default: return -EINVAL;
+		}
+
+		if (new_line.loopback != 0 && new_line.loopback != 1)
+			return -EINVAL;
+
+		info->params.flags &= ~(HDLC_FLAG_RXC_RXCPIN | HDLC_FLAG_RXC_DPLL |
+					HDLC_FLAG_RXC_BRG    | HDLC_FLAG_RXC_TXCPIN |
+					HDLC_FLAG_TXC_TXCPIN | HDLC_FLAG_TXC_DPLL |
+					HDLC_FLAG_TXC_BRG    | HDLC_FLAG_TXC_RXCPIN);
+		info->params.flags |= flags;
+
+		info->params.loopback = new_line.loopback;
+
+		if (flags & (HDLC_FLAG_RXC_BRG | HDLC_FLAG_TXC_BRG))
+			info->params.clock_speed = new_line.clock_rate;
+		else
+			info->params.clock_speed = 0;
+
+		/* if network interface up, reprogram hardware */
+		if (info->netcount)
+			program_hw(info);
+		return 0;
+
+	default:
+		return hdlc_ioctl(dev, ifr, cmd);
+	}
+}
+
+/**
+ * called by network layer when transmit timeout is detected
+ *
+ * dev  pointer to network device structure
+ */
+static void hdlcdev_tx_timeout(struct net_device *dev)
+{
+	struct slgt_info *info = dev_to_port(dev);
+	struct net_device_stats *stats = hdlc_stats(dev);
+	unsigned long flags;
+
+	DBGINFO(("%s hdlcdev_tx_timeout\n", dev->name));
+
+	stats->tx_errors++;
+	stats->tx_aborted_errors++;
+
+	spin_lock_irqsave(&info->lock,flags);
+	tx_stop(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	netif_wake_queue(dev);
+}
+
+/**
+ * called by device driver when transmit completes
+ * reenable network layer transmit if stopped
+ *
+ * info  pointer to device instance information
+ */
+static void hdlcdev_tx_done(struct slgt_info *info)
+{
+	if (netif_queue_stopped(info->netdev))
+		netif_wake_queue(info->netdev);
+}
+
+/**
+ * called by device driver when frame received
+ * pass frame to network layer
+ *
+ * info  pointer to device instance information
+ * buf   pointer to buffer contianing frame data
+ * size  count of data bytes in buf
+ */
+static void hdlcdev_rx(struct slgt_info *info, char *buf, int size)
+{
+	struct sk_buff *skb = dev_alloc_skb(size);
+	struct net_device *dev = info->netdev;
+	struct net_device_stats *stats = hdlc_stats(dev);
+
+	DBGINFO(("%s hdlcdev_rx\n", dev->name));
+
+	if (skb == NULL) {
+		DBGERR(("%s: can't alloc skb, drop packet\n", dev->name));
+		stats->rx_dropped++;
+		return;
+	}
+
+	memcpy(skb_put(skb, size),buf,size);
+
+	skb->protocol = hdlc_type_trans(skb, info->netdev);
+
+	stats->rx_packets++;
+	stats->rx_bytes += size;
+
+	netif_rx(skb);
+
+	info->netdev->last_rx = jiffies;
+}
+
+/**
+ * called by device driver when adding device instance
+ * do generic HDLC initialization
+ *
+ * info  pointer to device instance information
+ *
+ * returns 0 if success, otherwise error code
+ */
+static int hdlcdev_init(struct slgt_info *info)
+{
+	int rc;
+	struct net_device *dev;
+	hdlc_device *hdlc;
+
+	/* allocate and initialize network and HDLC layer objects */
+
+	if (!(dev = alloc_hdlcdev(info))) {
+		printk(KERN_ERR "%s hdlc device alloc failure\n", info->device_name);
+		return -ENOMEM;
+	}
+
+	/* for network layer reporting purposes only */
+	dev->mem_start = info->phys_reg_addr;
+	dev->mem_end   = info->phys_reg_addr + SLGT_REG_SIZE - 1;
+	dev->irq       = info->irq_level;
+
+	/* network layer callbacks and settings */
+	dev->do_ioctl       = hdlcdev_ioctl;
+	dev->open           = hdlcdev_open;
+	dev->stop           = hdlcdev_close;
+	dev->tx_timeout     = hdlcdev_tx_timeout;
+	dev->watchdog_timeo = 10*HZ;
+	dev->tx_queue_len   = 50;
+
+	/* generic HDLC layer callbacks and settings */
+	hdlc         = dev_to_hdlc(dev);
+	hdlc->attach = hdlcdev_attach;
+	hdlc->xmit   = hdlcdev_xmit;
+
+	/* register objects with HDLC layer */
+	if ((rc = register_hdlc_device(dev))) {
+		printk(KERN_WARNING "%s:unable to register hdlc device\n",__FILE__);
+		free_netdev(dev);
+		return rc;
+	}
+
+	info->netdev = dev;
+	return 0;
+}
+
+/**
+ * called by device driver when removing device instance
+ * do generic HDLC cleanup
+ *
+ * info  pointer to device instance information
+ */
+static void hdlcdev_exit(struct slgt_info *info)
+{
+	unregister_hdlc_device(info->netdev);
+	free_netdev(info->netdev);
+	info->netdev = NULL;
+}
+
+#endif /* ifdef CONFIG_HDLC */
+
+/*
+ * get async data from rx DMA buffers
+ */
+static void rx_async(struct slgt_info *info)
+{
+ 	struct tty_struct *tty = info->tty;
+ 	struct mgsl_icount *icount = &info->icount;
+	unsigned int start, end;
+	unsigned char *p;
+	unsigned char status;
+	struct slgt_desc *bufs = info->rbufs;
+	int i, count;
+
+	start = end = info->rbuf_current;
+
+	while(desc_complete(bufs[end])) {
+		count = desc_count(bufs[end]) - info->rbuf_index;
+		p     = bufs[end].buf + info->rbuf_index;
+
+		DBGISR(("%s rx_async count=%d\n", info->device_name, count));
+		DBGDATA(info, p, count, "rx");
+
+		for(i=0 ; i < count; i+=2, p+=2) {
+			if (tty) {
+				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+					tty_flip_buffer_push(tty);
+				if (tty->flip.count >= TTY_FLIPBUF_SIZE)
+					break;
+				*tty->flip.char_buf_ptr = *p;
+				*tty->flip.flag_buf_ptr = 0;
+			}
+			icount->rx++;
+
+			if ((status = *(p+1) & (BIT9 + BIT8))) {
+				if (status & BIT9)
+					icount->parity++;
+				else if (status & BIT8)
+					icount->frame++;
+				/* discard char if tty control flags say so */
+				if (status & info->ignore_status_mask)
+					continue;
+				if (tty) {
+					if (status & BIT9)
+						*tty->flip.flag_buf_ptr = TTY_PARITY;
+					else if (status & BIT8)
+						*tty->flip.flag_buf_ptr = TTY_FRAME;
+				}
+			}
+			if (tty) {
+				tty->flip.flag_buf_ptr++;
+				tty->flip.char_buf_ptr++;
+				tty->flip.count++;
+			}
+		}
+
+		if (i < count) {
+			/* receive buffer not completed */
+			info->rbuf_index += i;
+			info->rx_timer.expires = jiffies + 1;
+			add_timer(&info->rx_timer);
+			break;
+		}
+
+		info->rbuf_index = 0;
+		free_rbufs(info, end, end);
+
+		if (++end == info->rbuf_count)
+			end = 0;
+
+		/* if entire list searched then no frame available */
+		if (end == start)
+			break;
+	}
+
+	if (tty && tty->flip.count)
+		tty_flip_buffer_push(tty);
+}
+
+/*
+ * return next bottom half action to perform
+ */
+static int bh_action(struct slgt_info *info)
+{
+	unsigned long flags;
+	int rc;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	if (info->pending_bh & BH_RECEIVE) {
+		info->pending_bh &= ~BH_RECEIVE;
+		rc = BH_RECEIVE;
+	} else if (info->pending_bh & BH_TRANSMIT) {
+		info->pending_bh &= ~BH_TRANSMIT;
+		rc = BH_TRANSMIT;
+	} else if (info->pending_bh & BH_STATUS) {
+		info->pending_bh &= ~BH_STATUS;
+		rc = BH_STATUS;
+	} else {
+		/* Mark BH routine as complete */
+		info->bh_running   = 0;
+		info->bh_requested = 0;
+		rc = 0;
+	}
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return rc;
+}
+
+/*
+ * perform bottom half processing
+ */
+static void bh_handler(void* context)
+{
+	struct slgt_info *info = context;
+	int action;
+
+	if (!info)
+		return;
+	info->bh_running = 1;
+
+	while((action = bh_action(info))) {
+		switch (action) {
+		case BH_RECEIVE:
+			DBGBH(("%s bh receive\n", info->device_name));
+			switch(info->params.mode) {
+			case MGSL_MODE_ASYNC:
+				rx_async(info);
+				break;
+			case MGSL_MODE_HDLC:
+				while(rx_get_frame(info));
+				break;
+			case MGSL_MODE_RAW:
+				while(rx_get_buf(info));
+				break;
+			}
+			/* restart receiver if rx DMA buffers exhausted */
+			if (info->rx_restart)
+				rx_start(info);
+			break;
+		case BH_TRANSMIT:
+			bh_transmit(info);
+			break;
+		case BH_STATUS:
+			DBGBH(("%s bh status\n", info->device_name));
+			info->ri_chkcount = 0;
+			info->dsr_chkcount = 0;
+			info->dcd_chkcount = 0;
+			info->cts_chkcount = 0;
+			break;
+		default:
+			DBGBH(("%s unknown action\n", info->device_name));
+			break;
+		}
+	}
+	DBGBH(("%s bh_handler exit\n", info->device_name));
+}
+
+static void bh_transmit(struct slgt_info *info)
+{
+	struct tty_struct *tty = info->tty;
+
+	DBGBH(("%s bh_transmit\n", info->device_name));
+	if (tty) {
+		tty_wakeup(tty);
+		wake_up_interruptible(&tty->write_wait);
+	}
+}
+
+static void dsr_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("dsr_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->dsr_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_DSR);
+		return;
+	}
+	info->icount.dsr++;
+	if (info->signals & SerialSignal_DSR)
+		info->input_signal_events.dsr_up++;
+	else
+		info->input_signal_events.dsr_down++;
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+}
+
+static void cts_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("cts_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->cts_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_CTS);
+		return;
+	}
+	info->icount.cts++;
+	if (info->signals & SerialSignal_CTS)
+		info->input_signal_events.cts_up++;
+	else
+		info->input_signal_events.cts_down++;
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+
+	if (info->flags & ASYNC_CTS_FLOW) {
+		if (info->tty) {
+			if (info->tty->hw_stopped) {
+				if (info->signals & SerialSignal_CTS) {
+		 			info->tty->hw_stopped = 0;
+					info->pending_bh |= BH_TRANSMIT;
+					return;
+				}
+			} else {
+				if (!(info->signals & SerialSignal_CTS))
+		 			info->tty->hw_stopped = 1;
+			}
+		}
+	}
+}
+
+static void dcd_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("dcd_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->dcd_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_DCD);
+		return;
+	}
+	info->icount.dcd++;
+	if (info->signals & SerialSignal_DCD) {
+		info->input_signal_events.dcd_up++;
+	} else {
+		info->input_signal_events.dcd_down++;
+	}
+#ifdef CONFIG_HDLC
+	if (info->netcount)
+		hdlc_set_carrier(info->signals & SerialSignal_DCD, info->netdev);
+#endif
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+
+	if (info->flags & ASYNC_CHECK_CD) {
+		if (info->signals & SerialSignal_DCD)
+			wake_up_interruptible(&info->open_wait);
+		else {
+			if (info->tty)
+				tty_hangup(info->tty);
+		}
+	}
+}
+
+static void ri_change(struct slgt_info *info)
+{
+	get_signals(info);
+	DBGISR(("ri_change %s signals=%04X\n", info->device_name, info->signals));
+	if ((info->ri_chkcount)++ == IO_PIN_SHUTDOWN_LIMIT) {
+		slgt_irq_off(info, IRQ_RI);
+		return;
+	}
+	info->icount.dcd++;
+	if (info->signals & SerialSignal_RI) {
+		info->input_signal_events.ri_up++;
+	} else {
+		info->input_signal_events.ri_down++;
+	}
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+	info->pending_bh |= BH_STATUS;
+}
+
+static void isr_serial(struct slgt_info *info)
+{
+	unsigned short status = rd_reg16(info, SSR);
+
+	DBGISR(("%s isr_serial status=%04X\n", info->device_name, status));
+
+	wr_reg16(info, SSR, status); /* clear pending */
+
+	info->irq_occurred = 1;
+
+	if (info->params.mode == MGSL_MODE_ASYNC) {
+		if (status & IRQ_TXIDLE) {
+			if (info->tx_count)
+				isr_txeom(info, status);
+		}
+		if ((status & IRQ_RXBREAK) && (status & RXBREAK)) {
+			info->icount.brk++;
+			/* process break detection if tty control allows */
+			if (info->tty) {
+				if (!(status & info->ignore_status_mask)) {
+					if (info->read_status_mask & MASK_BREAK) {
+						*info->tty->flip.flag_buf_ptr = TTY_BREAK;
+						if (info->flags & ASYNC_SAK)
+							do_SAK(info->tty);
+					}
+				}
+			}
+		}
+	} else {
+		if (status & (IRQ_TXIDLE + IRQ_TXUNDER))
+			isr_txeom(info, status);
+
+		if (status & IRQ_RXIDLE) {
+			if (status & RXIDLE)
+				info->icount.rxidle++;
+			else
+				info->icount.exithunt++;
+			wake_up_interruptible(&info->event_wait_q);
+		}
+
+		if (status & IRQ_RXOVER)
+			rx_start(info);
+	}
+
+	if (status & IRQ_DSR)
+		dsr_change(info);
+	if (status & IRQ_CTS)
+		cts_change(info);
+	if (status & IRQ_DCD)
+		dcd_change(info);
+	if (status & IRQ_RI)
+		ri_change(info);
+}
+
+static void isr_rdma(struct slgt_info *info)
+{
+	unsigned int status = rd_reg32(info, RDCSR);
+
+	DBGISR(("%s isr_rdma status=%08x\n", info->device_name, status));
+
+	/* RDCSR (rx DMA control/status)
+	 *
+	 * 31..07  reserved
+	 * 06      save status byte to DMA buffer
+	 * 05      error
+	 * 04      eol (end of list)
+	 * 03      eob (end of buffer)
+	 * 02      IRQ enable
+	 * 01      reset
+	 * 00      enable
+	 */
+	wr_reg32(info, RDCSR, status);	/* clear pending */
+
+	if (status & (BIT5 + BIT4)) {
+		DBGISR(("%s isr_rdma rx_restart=1\n", info->device_name));
+		info->rx_restart = 1;
+	}
+	info->pending_bh |= BH_RECEIVE;
+}
+
+static void isr_tdma(struct slgt_info *info)
+{
+	unsigned int status = rd_reg32(info, TDCSR);
+
+	DBGISR(("%s isr_tdma status=%08x\n", info->device_name, status));
+
+	/* TDCSR (tx DMA control/status)
+	 *
+	 * 31..06  reserved
+	 * 05      error
+	 * 04      eol (end of list)
+	 * 03      eob (end of buffer)
+	 * 02      IRQ enable
+	 * 01      reset
+	 * 00      enable
+	 */
+	wr_reg32(info, TDCSR, status);	/* clear pending */
+
+	if (status & (BIT5 + BIT4 + BIT3)) {
+		// another transmit buffer has completed
+		// run bottom half to get more send data from user
+		info->pending_bh |= BH_TRANSMIT;
+	}
+}
+
+static void isr_txeom(struct slgt_info *info, unsigned short status)
+{
+	DBGISR(("%s txeom status=%04x\n", info->device_name, status));
+
+	slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER);
+	tdma_reset(info);
+	reset_tbufs(info);
+	if (status & IRQ_TXUNDER) {
+		unsigned short val = rd_reg16(info, TCR);
+		wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */
+		wr_reg16(info, TCR, val); /* clear reset bit */
+	}
+
+	if (info->tx_active) {
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			if (status & IRQ_TXUNDER)
+				info->icount.txunder++;
+			else if (status & IRQ_TXIDLE)
+				info->icount.txok++;
+		}
+
+		info->tx_active = 0;
+		info->tx_count = 0;
+
+		del_timer(&info->tx_timer);
+
+		if (info->params.mode != MGSL_MODE_ASYNC && info->drop_rts_on_tx_done) {
+			info->signals &= ~SerialSignal_RTS;
+			info->drop_rts_on_tx_done = 0;
+			set_signals(info);
+		}
+
+#ifdef CONFIG_HDLC
+		if (info->netcount)
+			hdlcdev_tx_done(info);
+		else
+#endif
+		{
+			if (info->tty && (info->tty->stopped || info->tty->hw_stopped)) {
+				tx_stop(info);
+				return;
+			}
+			info->pending_bh |= BH_TRANSMIT;
+		}
+	}
+}
+
+/* interrupt service routine
+ *
+ * 	irq	interrupt number
+ * 	dev_id	device ID supplied during interrupt registration
+ * 	regs	interrupted processor context
+ */
+static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	struct slgt_info *info;
+	unsigned int gsr;
+	unsigned int i;
+
+	DBGISR(("slgt_interrupt irq=%d entry\n", irq));
+
+	info = dev_id;
+	if (!info)
+		return IRQ_NONE;
+
+	spin_lock(&info->lock);
+
+	while((gsr = rd_reg32(info, GSR) & 0xffffff00)) {
+		DBGISR(("%s gsr=%08x\n", info->device_name, gsr));
+		info->irq_occurred = 1;
+		for(i=0; i < info->port_count ; i++) {
+			if (info->port_array[i] == NULL)
+				continue;
+			if (gsr & (BIT8 << i))
+				isr_serial(info->port_array[i]);
+			if (gsr & (BIT16 << (i*2)))
+				isr_rdma(info->port_array[i]);
+			if (gsr & (BIT17 << (i*2)))
+				isr_tdma(info->port_array[i]);
+		}
+	}
+
+	for(i=0; i < info->port_count ; i++) {
+		struct slgt_info *port = info->port_array[i];
+
+		if (port && (port->count || port->netcount) &&
+		    port->pending_bh && !port->bh_running &&
+		    !port->bh_requested) {
+			DBGISR(("%s bh queued\n", port->device_name));
+			schedule_work(&port->task);
+			port->bh_requested = 1;
+		}
+	}
+
+	spin_unlock(&info->lock);
+
+	DBGISR(("slgt_interrupt irq=%d exit\n", irq));
+	return IRQ_HANDLED;
+}
+
+static int startup(struct slgt_info *info)
+{
+	DBGINFO(("%s startup\n", info->device_name));
+
+	if (info->flags & ASYNC_INITIALIZED)
+		return 0;
+
+	if (!info->tx_buf) {
+		info->tx_buf = kmalloc(info->max_frame_size, GFP_KERNEL);
+		if (!info->tx_buf) {
+			DBGERR(("%s can't allocate tx buffer\n", info->device_name));
+			return -ENOMEM;
+		}
+	}
+
+	info->pending_bh = 0;
+
+	memset(&info->icount, 0, sizeof(info->icount));
+
+	/* program hardware for current parameters */
+	change_params(info);
+
+	if (info->tty)
+		clear_bit(TTY_IO_ERROR, &info->tty->flags);
+
+	info->flags |= ASYNC_INITIALIZED;
+
+	return 0;
+}
+
+/*
+ *  called by close() and hangup() to shutdown hardware
+ */
+static void shutdown(struct slgt_info *info)
+{
+	unsigned long flags;
+
+	if (!(info->flags & ASYNC_INITIALIZED))
+		return;
+
+	DBGINFO(("%s shutdown\n", info->device_name));
+
+	/* clear status wait queue because status changes */
+	/* can't happen after shutting down the hardware */
+	wake_up_interruptible(&info->status_event_wait_q);
+	wake_up_interruptible(&info->event_wait_q);
+
+	del_timer_sync(&info->tx_timer);
+	del_timer_sync(&info->rx_timer);
+
+	kfree(info->tx_buf);
+	info->tx_buf = NULL;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	tx_stop(info);
+	rx_stop(info);
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+
+ 	if (!info->tty || info->tty->termios->c_cflag & HUPCL) {
+ 		info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS);
+		set_signals(info);
+	}
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	if (info->tty)
+		set_bit(TTY_IO_ERROR, &info->tty->flags);
+
+	info->flags &= ~ASYNC_INITIALIZED;
+}
+
+static void program_hw(struct slgt_info *info)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	rx_stop(info);
+	tx_stop(info);
+
+	if (info->params.mode == MGSL_MODE_HDLC ||
+	    info->params.mode == MGSL_MODE_RAW ||
+	    info->netcount)
+		hdlc_mode(info);
+	else
+		async_mode(info);
+
+	set_signals(info);
+
+	info->dcd_chkcount = 0;
+	info->cts_chkcount = 0;
+	info->ri_chkcount = 0;
+	info->dsr_chkcount = 0;
+
+	slgt_irq_on(info, IRQ_DCD | IRQ_CTS | IRQ_DSR);
+	get_signals(info);
+
+	if (info->netcount ||
+	    (info->tty && info->tty->termios->c_cflag & CREAD))
+		rx_start(info);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+/*
+ * reconfigure adapter based on new parameters
+ */
+static void change_params(struct slgt_info *info)
+{
+	unsigned cflag;
+	int bits_per_char;
+
+	if (!info->tty || !info->tty->termios)
+		return;
+	DBGINFO(("%s change_params\n", info->device_name));
+
+	cflag = info->tty->termios->c_cflag;
+
+	/* if B0 rate (hangup) specified then negate DTR and RTS */
+	/* otherwise assert DTR and RTS */
+ 	if (cflag & CBAUD)
+		info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
+
+	/* byte size and parity */
+
+	switch (cflag & CSIZE) {
+	case CS5: info->params.data_bits = 5; break;
+	case CS6: info->params.data_bits = 6; break;
+	case CS7: info->params.data_bits = 7; break;
+	case CS8: info->params.data_bits = 8; break;
+	default:  info->params.data_bits = 7; break;
+	}
+
+	info->params.stop_bits = (cflag & CSTOPB) ? 2 : 1;
+
+	if (cflag & PARENB)
+		info->params.parity = (cflag & PARODD) ? ASYNC_PARITY_ODD : ASYNC_PARITY_EVEN;
+	else
+		info->params.parity = ASYNC_PARITY_NONE;
+
+	/* calculate number of jiffies to transmit a full
+	 * FIFO (32 bytes) at specified data rate
+	 */
+	bits_per_char = info->params.data_bits +
+			info->params.stop_bits + 1;
+
+	info->params.data_rate = tty_get_baud_rate(info->tty);
+
+	if (info->params.data_rate) {
+		info->timeout = (32*HZ*bits_per_char) /
+				info->params.data_rate;
+	}
+	info->timeout += HZ/50;		/* Add .02 seconds of slop */
+
+	if (cflag & CRTSCTS)
+		info->flags |= ASYNC_CTS_FLOW;
+	else
+		info->flags &= ~ASYNC_CTS_FLOW;
+
+	if (cflag & CLOCAL)
+		info->flags &= ~ASYNC_CHECK_CD;
+	else
+		info->flags |= ASYNC_CHECK_CD;
+
+	/* process tty input control flags */
+
+	info->read_status_mask = IRQ_RXOVER;
+	if (I_INPCK(info->tty))
+		info->read_status_mask |= MASK_PARITY | MASK_FRAMING;
+ 	if (I_BRKINT(info->tty) || I_PARMRK(info->tty))
+ 		info->read_status_mask |= MASK_BREAK;
+	if (I_IGNPAR(info->tty))
+		info->ignore_status_mask |= MASK_PARITY | MASK_FRAMING;
+	if (I_IGNBRK(info->tty)) {
+		info->ignore_status_mask |= MASK_BREAK;
+		/* If ignoring parity and break indicators, ignore
+		 * overruns too.  (For real raw support).
+		 */
+		if (I_IGNPAR(info->tty))
+			info->ignore_status_mask |= MASK_OVERRUN;
+	}
+
+	program_hw(info);
+}
+
+static int get_stats(struct slgt_info *info, struct mgsl_icount __user *user_icount)
+{
+	DBGINFO(("%s get_stats\n",  info->device_name));
+	if (!user_icount) {
+		memset(&info->icount, 0, sizeof(info->icount));
+	} else {
+		if (copy_to_user(user_icount, &info->icount, sizeof(struct mgsl_icount)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static int get_params(struct slgt_info *info, MGSL_PARAMS __user *user_params)
+{
+	DBGINFO(("%s get_params\n", info->device_name));
+	if (copy_to_user(user_params, &info->params, sizeof(MGSL_PARAMS)))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_params(struct slgt_info *info, MGSL_PARAMS __user *new_params)
+{
+ 	unsigned long flags;
+	MGSL_PARAMS tmp_params;
+
+	DBGINFO(("%s set_params\n", info->device_name));
+	if (copy_from_user(&tmp_params, new_params, sizeof(MGSL_PARAMS)))
+		return -EFAULT;
+
+	spin_lock_irqsave(&info->lock, flags);
+	memcpy(&info->params, &tmp_params, sizeof(MGSL_PARAMS));
+	spin_unlock_irqrestore(&info->lock, flags);
+
+ 	change_params(info);
+
+	return 0;
+}
+
+static int get_txidle(struct slgt_info *info, int __user *idle_mode)
+{
+	DBGINFO(("%s get_txidle=%d\n", info->device_name, info->idle_mode));
+	if (put_user(info->idle_mode, idle_mode))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_txidle(struct slgt_info *info, int idle_mode)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s set_txidle(%d)\n", info->device_name, idle_mode));
+	spin_lock_irqsave(&info->lock,flags);
+	info->idle_mode = idle_mode;
+	tx_set_idle(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int tx_enable(struct slgt_info *info, int enable)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s tx_enable(%d)\n", info->device_name, enable));
+	spin_lock_irqsave(&info->lock,flags);
+	if (enable) {
+		if (!info->tx_enabled)
+			tx_start(info);
+	} else {
+		if (info->tx_enabled)
+			tx_stop(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ * abort transmit HDLC frame
+ */
+static int tx_abort(struct slgt_info *info)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s tx_abort\n", info->device_name));
+	spin_lock_irqsave(&info->lock,flags);
+	tdma_reset(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int rx_enable(struct slgt_info *info, int enable)
+{
+ 	unsigned long flags;
+	DBGINFO(("%s rx_enable(%d)\n", info->device_name, enable));
+	spin_lock_irqsave(&info->lock,flags);
+	if (enable) {
+		if (!info->rx_enabled)
+			rx_start(info);
+	} else {
+		if (info->rx_enabled)
+			rx_stop(info);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ *  wait for specified event to occur
+ */
+static int wait_mgsl_event(struct slgt_info *info, int __user *mask_ptr)
+{
+ 	unsigned long flags;
+	int s;
+	int rc=0;
+	struct mgsl_icount cprev, cnow;
+	int events;
+	int mask;
+	struct	_input_signal_events oldsigs, newsigs;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (get_user(mask, mask_ptr))
+		return -EFAULT;
+
+	DBGINFO(("%s wait_mgsl_event(%d)\n", info->device_name, mask));
+
+	spin_lock_irqsave(&info->lock,flags);
+
+	/* return immediately if state matches requested events */
+	get_signals(info);
+	s = info->signals;
+
+	events = mask &
+		( ((s & SerialSignal_DSR) ? MgslEvent_DsrActive:MgslEvent_DsrInactive) +
+ 		  ((s & SerialSignal_DCD) ? MgslEvent_DcdActive:MgslEvent_DcdInactive) +
+		  ((s & SerialSignal_CTS) ? MgslEvent_CtsActive:MgslEvent_CtsInactive) +
+		  ((s & SerialSignal_RI)  ? MgslEvent_RiActive :MgslEvent_RiInactive) );
+	if (events) {
+		spin_unlock_irqrestore(&info->lock,flags);
+		goto exit;
+	}
+
+	/* save current irq counts */
+	cprev = info->icount;
+	oldsigs = info->input_signal_events;
+
+	/* enable hunt and idle irqs if needed */
+	if (mask & (MgslEvent_ExitHuntMode+MgslEvent_IdleReceived)) {
+		unsigned short val = rd_reg16(info, SCR);
+		if (!(val & IRQ_RXIDLE))
+			wr_reg16(info, SCR, (unsigned short)(val | IRQ_RXIDLE));
+	}
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&info->event_wait_q, &wait);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	for(;;) {
+		schedule();
+		if (signal_pending(current)) {
+			rc = -ERESTARTSYS;
+			break;
+		}
+
+		/* get current irq counts */
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		newsigs = info->input_signal_events;
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+		/* if no change, wait aborted for some reason */
+		if (newsigs.dsr_up   == oldsigs.dsr_up   &&
+		    newsigs.dsr_down == oldsigs.dsr_down &&
+		    newsigs.dcd_up   == oldsigs.dcd_up   &&
+		    newsigs.dcd_down == oldsigs.dcd_down &&
+		    newsigs.cts_up   == oldsigs.cts_up   &&
+		    newsigs.cts_down == oldsigs.cts_down &&
+		    newsigs.ri_up    == oldsigs.ri_up    &&
+		    newsigs.ri_down  == oldsigs.ri_down  &&
+		    cnow.exithunt    == cprev.exithunt   &&
+		    cnow.rxidle      == cprev.rxidle) {
+			rc = -EIO;
+			break;
+		}
+
+		events = mask &
+			( (newsigs.dsr_up   != oldsigs.dsr_up   ? MgslEvent_DsrActive:0)   +
+			  (newsigs.dsr_down != oldsigs.dsr_down ? MgslEvent_DsrInactive:0) +
+			  (newsigs.dcd_up   != oldsigs.dcd_up   ? MgslEvent_DcdActive:0)   +
+			  (newsigs.dcd_down != oldsigs.dcd_down ? MgslEvent_DcdInactive:0) +
+			  (newsigs.cts_up   != oldsigs.cts_up   ? MgslEvent_CtsActive:0)   +
+			  (newsigs.cts_down != oldsigs.cts_down ? MgslEvent_CtsInactive:0) +
+			  (newsigs.ri_up    != oldsigs.ri_up    ? MgslEvent_RiActive:0)    +
+			  (newsigs.ri_down  != oldsigs.ri_down  ? MgslEvent_RiInactive:0)  +
+			  (cnow.exithunt    != cprev.exithunt   ? MgslEvent_ExitHuntMode:0) +
+			  (cnow.rxidle      != cprev.rxidle     ? MgslEvent_IdleReceived:0) );
+		if (events)
+			break;
+
+		cprev = cnow;
+		oldsigs = newsigs;
+	}
+
+	remove_wait_queue(&info->event_wait_q, &wait);
+	set_current_state(TASK_RUNNING);
+
+
+	if (mask & (MgslEvent_ExitHuntMode + MgslEvent_IdleReceived)) {
+		spin_lock_irqsave(&info->lock,flags);
+		if (!waitqueue_active(&info->event_wait_q)) {
+			/* disable enable exit hunt mode/idle rcvd IRQs */
+			wr_reg16(info, SCR,
+				(unsigned short)(rd_reg16(info, SCR) & ~IRQ_RXIDLE));
+		}
+		spin_unlock_irqrestore(&info->lock,flags);
+	}
+exit:
+	if (rc == 0)
+		rc = put_user(events, mask_ptr);
+	return rc;
+}
+
+static int get_interface(struct slgt_info *info, int __user *if_mode)
+{
+	DBGINFO(("%s get_interface=%x\n", info->device_name, info->if_mode));
+	if (put_user(info->if_mode, if_mode))
+		return -EFAULT;
+	return 0;
+}
+
+static int set_interface(struct slgt_info *info, int if_mode)
+{
+ 	unsigned long flags;
+	unsigned char val;
+
+	DBGINFO(("%s set_interface=%x)\n", info->device_name, if_mode));
+	spin_lock_irqsave(&info->lock,flags);
+	info->if_mode = if_mode;
+
+	msc_set_vcr(info);
+
+	/* TCR (tx control) 07  1=RTS driver control */
+	val = rd_reg16(info, TCR);
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+	else
+		val &= ~BIT7;
+	wr_reg16(info, TCR, val);
+
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+static int modem_input_wait(struct slgt_info *info,int arg)
+{
+ 	unsigned long flags;
+	int rc;
+	struct mgsl_icount cprev, cnow;
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* save current irq counts */
+	spin_lock_irqsave(&info->lock,flags);
+	cprev = info->icount;
+	add_wait_queue(&info->status_event_wait_q, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	for(;;) {
+		schedule();
+		if (signal_pending(current)) {
+			rc = -ERESTARTSYS;
+			break;
+		}
+
+		/* get new irq counts */
+		spin_lock_irqsave(&info->lock,flags);
+		cnow = info->icount;
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+		/* if no change, wait aborted for some reason */
+		if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr &&
+		    cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) {
+			rc = -EIO;
+			break;
+		}
+
+		/* check for change in caller specified modem input */
+		if ((arg & TIOCM_RNG && cnow.rng != cprev.rng) ||
+		    (arg & TIOCM_DSR && cnow.dsr != cprev.dsr) ||
+		    (arg & TIOCM_CD  && cnow.dcd != cprev.dcd) ||
+		    (arg & TIOCM_CTS && cnow.cts != cprev.cts)) {
+			rc = 0;
+			break;
+		}
+
+		cprev = cnow;
+	}
+	remove_wait_queue(&info->status_event_wait_q, &wait);
+	set_current_state(TASK_RUNNING);
+	return rc;
+}
+
+/*
+ *  return state of serial control and status signals
+ */
+static int tiocmget(struct tty_struct *tty, struct file *file)
+{
+	struct slgt_info *info = tty->driver_data;
+	unsigned int result;
+ 	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock,flags);
+ 	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	result = ((info->signals & SerialSignal_RTS) ? TIOCM_RTS:0) +
+		((info->signals & SerialSignal_DTR) ? TIOCM_DTR:0) +
+		((info->signals & SerialSignal_DCD) ? TIOCM_CAR:0) +
+		((info->signals & SerialSignal_RI)  ? TIOCM_RNG:0) +
+		((info->signals & SerialSignal_DSR) ? TIOCM_DSR:0) +
+		((info->signals & SerialSignal_CTS) ? TIOCM_CTS:0);
+
+	DBGINFO(("%s tiocmget value=%08X\n", info->device_name, result));
+	return result;
+}
+
+/*
+ * set modem control signals (DTR/RTS)
+ *
+ * 	cmd	signal command: TIOCMBIS = set bit TIOCMBIC = clear bit
+ *		TIOCMSET = set/clear signal values
+ * 	value	bit mask for command
+ */
+static int tiocmset(struct tty_struct *tty, struct file *file,
+		    unsigned int set, unsigned int clear)
+{
+	struct slgt_info *info = tty->driver_data;
+ 	unsigned long flags;
+
+	DBGINFO(("%s tiocmset(%x,%x)\n", info->device_name, set, clear));
+
+	if (set & TIOCM_RTS)
+		info->signals |= SerialSignal_RTS;
+	if (set & TIOCM_DTR)
+		info->signals |= SerialSignal_DTR;
+	if (clear & TIOCM_RTS)
+		info->signals &= ~SerialSignal_RTS;
+	if (clear & TIOCM_DTR)
+		info->signals &= ~SerialSignal_DTR;
+
+	spin_lock_irqsave(&info->lock,flags);
+ 	set_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return 0;
+}
+
+/*
+ *  block current process until the device is ready to open
+ */
+static int block_til_ready(struct tty_struct *tty, struct file *filp,
+			   struct slgt_info *info)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int		retval;
+	int		do_clocal = 0, extra_count = 0;
+	unsigned long	flags;
+
+	DBGINFO(("%s block_til_ready\n", tty->driver->name));
+
+	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+		/* nonblock mode is set or port is not enabled */
+		info->flags |= ASYNC_NORMAL_ACTIVE;
+		return 0;
+	}
+
+	if (tty->termios->c_cflag & CLOCAL)
+		do_clocal = 1;
+
+	/* Wait for carrier detect and the line to become
+	 * free (i.e., not in use by the callout).  While we are in
+	 * this loop, info->count is dropped by one, so that
+	 * close() knows when to free things.  We restore it upon
+	 * exit, either normal or abnormal.
+	 */
+
+	retval = 0;
+	add_wait_queue(&info->open_wait, &wait);
+
+	spin_lock_irqsave(&info->lock, flags);
+	if (!tty_hung_up_p(filp)) {
+		extra_count = 1;
+		info->count--;
+	}
+	spin_unlock_irqrestore(&info->lock, flags);
+	info->blocked_open++;
+
+	while (1) {
+		if ((tty->termios->c_cflag & CBAUD)) {
+			spin_lock_irqsave(&info->lock,flags);
+			info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+		 	set_signals(info);
+			spin_unlock_irqrestore(&info->lock,flags);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (tty_hung_up_p(filp) || !(info->flags & ASYNC_INITIALIZED)){
+			retval = (info->flags & ASYNC_HUP_NOTIFY) ?
+					-EAGAIN : -ERESTARTSYS;
+			break;
+		}
+
+		spin_lock_irqsave(&info->lock,flags);
+	 	get_signals(info);
+		spin_unlock_irqrestore(&info->lock,flags);
+
+ 		if (!(info->flags & ASYNC_CLOSING) &&
+ 		    (do_clocal || (info->signals & SerialSignal_DCD)) ) {
+ 			break;
+		}
+
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+
+		DBGINFO(("%s block_til_ready wait\n", tty->driver->name));
+		schedule();
+	}
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&info->open_wait, &wait);
+
+	if (extra_count)
+		info->count++;
+	info->blocked_open--;
+
+	if (!retval)
+		info->flags |= ASYNC_NORMAL_ACTIVE;
+
+	DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval));
+	return retval;
+}
+
+static int alloc_tmp_rbuf(struct slgt_info *info)
+{
+	info->tmp_rbuf = kmalloc(info->max_frame_size, GFP_KERNEL);
+	if (info->tmp_rbuf == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void free_tmp_rbuf(struct slgt_info *info)
+{
+	kfree(info->tmp_rbuf);
+	info->tmp_rbuf = NULL;
+}
+
+/*
+ * allocate DMA descriptor lists.
+ */
+static int alloc_desc(struct slgt_info *info)
+{
+	unsigned int i;
+	unsigned int pbufs;
+
+	/* allocate memory to hold descriptor lists */
+	info->bufs = pci_alloc_consistent(info->pdev, DESC_LIST_SIZE, &info->bufs_dma_addr);
+	if (info->bufs == NULL)
+		return -ENOMEM;
+
+	memset(info->bufs, 0, DESC_LIST_SIZE);
+
+	info->rbufs = (struct slgt_desc*)info->bufs;
+	info->tbufs = ((struct slgt_desc*)info->bufs) + info->rbuf_count;
+
+	pbufs = (unsigned int)info->bufs_dma_addr;
+
+	/*
+	 * Build circular lists of descriptors
+	 */
+
+	for (i=0; i < info->rbuf_count; i++) {
+		/* physical address of this descriptor */
+		info->rbufs[i].pdesc = pbufs + (i * sizeof(struct slgt_desc));
+
+		/* physical address of next descriptor */
+		if (i == info->rbuf_count - 1)
+			info->rbufs[i].next = cpu_to_le32(pbufs);
+		else
+			info->rbufs[i].next = cpu_to_le32(pbufs + ((i+1) * sizeof(struct slgt_desc)));
+		set_desc_count(info->rbufs[i], DMABUFSIZE);
+	}
+
+	for (i=0; i < info->tbuf_count; i++) {
+		/* physical address of this descriptor */
+		info->tbufs[i].pdesc = pbufs + ((info->rbuf_count + i) * sizeof(struct slgt_desc));
+
+		/* physical address of next descriptor */
+		if (i == info->tbuf_count - 1)
+			info->tbufs[i].next = cpu_to_le32(pbufs + info->rbuf_count * sizeof(struct slgt_desc));
+		else
+			info->tbufs[i].next = cpu_to_le32(pbufs + ((info->rbuf_count + i + 1) * sizeof(struct slgt_desc)));
+	}
+
+	return 0;
+}
+
+static void free_desc(struct slgt_info *info)
+{
+	if (info->bufs != NULL) {
+		pci_free_consistent(info->pdev, DESC_LIST_SIZE, info->bufs, info->bufs_dma_addr);
+		info->bufs  = NULL;
+		info->rbufs = NULL;
+		info->tbufs = NULL;
+	}
+}
+
+static int alloc_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
+{
+	int i;
+	for (i=0; i < count; i++) {
+		if ((bufs[i].buf = pci_alloc_consistent(info->pdev, DMABUFSIZE, &bufs[i].buf_dma_addr)) == NULL)
+			return -ENOMEM;
+		bufs[i].pbuf  = cpu_to_le32((unsigned int)bufs[i].buf_dma_addr);
+	}
+	return 0;
+}
+
+static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
+{
+	int i;
+	for (i=0; i < count; i++) {
+		if (bufs[i].buf == NULL)
+			continue;
+		pci_free_consistent(info->pdev, DMABUFSIZE, bufs[i].buf, bufs[i].buf_dma_addr);
+		bufs[i].buf = NULL;
+	}
+}
+
+static int alloc_dma_bufs(struct slgt_info *info)
+{
+	info->rbuf_count = 32;
+	info->tbuf_count = 32;
+
+	if (alloc_desc(info) < 0 ||
+	    alloc_bufs(info, info->rbufs, info->rbuf_count) < 0 ||
+	    alloc_bufs(info, info->tbufs, info->tbuf_count) < 0 ||
+	    alloc_tmp_rbuf(info) < 0) {
+		DBGERR(("%s DMA buffer alloc fail\n", info->device_name));
+		return -ENOMEM;
+	}
+	reset_rbufs(info);
+	return 0;
+}
+
+static void free_dma_bufs(struct slgt_info *info)
+{
+	if (info->bufs) {
+		free_bufs(info, info->rbufs, info->rbuf_count);
+		free_bufs(info, info->tbufs, info->tbuf_count);
+		free_desc(info);
+	}
+	free_tmp_rbuf(info);
+}
+
+static int claim_resources(struct slgt_info *info)
+{
+	if (request_mem_region(info->phys_reg_addr, SLGT_REG_SIZE, "synclink_gt") == NULL) {
+		DBGERR(("%s reg addr conflict, addr=%08X\n",
+			info->device_name, info->phys_reg_addr));
+		info->init_error = DiagStatus_AddressConflict;
+		goto errout;
+	}
+	else
+		info->reg_addr_requested = 1;
+
+	info->reg_addr = ioremap(info->phys_reg_addr, PAGE_SIZE);
+	if (!info->reg_addr) {
+		DBGERR(("%s cant map device registers, addr=%08X\n",
+			info->device_name, info->phys_reg_addr));
+		info->init_error = DiagStatus_CantAssignPciResources;
+		goto errout;
+	}
+	info->reg_addr += info->reg_offset;
+	return 0;
+
+errout:
+	release_resources(info);
+	return -ENODEV;
+}
+
+static void release_resources(struct slgt_info *info)
+{
+	if (info->irq_requested) {
+		free_irq(info->irq_level, info);
+		info->irq_requested = 0;
+	}
+
+	if (info->reg_addr_requested) {
+		release_mem_region(info->phys_reg_addr, SLGT_REG_SIZE);
+		info->reg_addr_requested = 0;
+	}
+
+	if (info->reg_addr) {
+		iounmap(info->reg_addr - info->reg_offset);
+		info->reg_addr = NULL;
+	}
+}
+
+/* Add the specified device instance data structure to the
+ * global linked list of devices and increment the device count.
+ */
+static void add_device(struct slgt_info *info)
+{
+	char *devstr;
+
+	info->next_device = NULL;
+	info->line = slgt_device_count;
+	sprintf(info->device_name, "%s%d", tty_dev_prefix, info->line);
+
+	if (info->line < MAX_DEVICES) {
+		if (maxframe[info->line])
+			info->max_frame_size = maxframe[info->line];
+		info->dosyncppp = dosyncppp[info->line];
+	}
+
+	slgt_device_count++;
+
+	if (!slgt_device_list)
+		slgt_device_list = info;
+	else {
+		struct slgt_info *current_dev = slgt_device_list;
+		while(current_dev->next_device)
+			current_dev = current_dev->next_device;
+		current_dev->next_device = info;
+	}
+
+	if (info->max_frame_size < 4096)
+		info->max_frame_size = 4096;
+	else if (info->max_frame_size > 65535)
+		info->max_frame_size = 65535;
+
+	switch(info->pdev->device) {
+	case SYNCLINK_GT_DEVICE_ID:
+		devstr = "GT";
+		break;
+	case SYNCLINK_GT4_DEVICE_ID:
+		devstr = "GT4";
+		break;
+	case SYNCLINK_AC_DEVICE_ID:
+		devstr = "AC";
+		info->params.mode = MGSL_MODE_ASYNC;
+		break;
+	default:
+		devstr = "(unknown model)";
+	}
+	printk("SyncLink %s %s IO=%08x IRQ=%d MaxFrameSize=%u\n",
+		devstr, info->device_name, info->phys_reg_addr,
+		info->irq_level, info->max_frame_size);
+
+#ifdef CONFIG_HDLC
+	hdlcdev_init(info);
+#endif
+}
+
+/*
+ *  allocate device instance structure, return NULL on failure
+ */
+static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev)
+{
+	struct slgt_info *info;
+
+	info = kmalloc(sizeof(struct slgt_info), GFP_KERNEL);
+
+	if (!info) {
+		DBGERR(("%s device alloc failed adapter=%d port=%d\n",
+			driver_name, adapter_num, port_num));
+	} else {
+		memset(info, 0, sizeof(struct slgt_info));
+		info->magic = MGSL_MAGIC;
+		INIT_WORK(&info->task, bh_handler, info);
+		info->max_frame_size = 4096;
+		info->raw_rx_size = DMABUFSIZE;
+		info->close_delay = 5*HZ/10;
+		info->closing_wait = 30*HZ;
+		init_waitqueue_head(&info->open_wait);
+		init_waitqueue_head(&info->close_wait);
+		init_waitqueue_head(&info->status_event_wait_q);
+		init_waitqueue_head(&info->event_wait_q);
+		spin_lock_init(&info->netlock);
+		memcpy(&info->params,&default_params,sizeof(MGSL_PARAMS));
+		info->idle_mode = HDLC_TXIDLE_FLAGS;
+		info->adapter_num = adapter_num;
+		info->port_num = port_num;
+
+		init_timer(&info->tx_timer);
+		info->tx_timer.data = (unsigned long)info;
+		info->tx_timer.function = tx_timeout;
+
+		init_timer(&info->rx_timer);
+		info->rx_timer.data = (unsigned long)info;
+		info->rx_timer.function = rx_timeout;
+
+		/* Copy configuration info to device instance data */
+		info->pdev = pdev;
+		info->irq_level = pdev->irq;
+		info->phys_reg_addr = pci_resource_start(pdev,0);
+
+		/* veremap works on page boundaries
+		 * map full page starting at the page boundary
+		 */
+		info->reg_offset    = info->phys_reg_addr & (PAGE_SIZE-1);
+		info->phys_reg_addr &= ~(PAGE_SIZE-1);
+
+		info->bus_type = MGSL_BUS_TYPE_PCI;
+		info->irq_flags = SA_SHIRQ;
+
+		info->init_error = -1; /* assume error, set to 0 on successful init */
+	}
+
+	return info;
+}
+
+static void device_init(int adapter_num, struct pci_dev *pdev)
+{
+	struct slgt_info *port_array[SLGT_MAX_PORTS];
+	int i;
+	int port_count = 1;
+
+	if (pdev->device == SYNCLINK_GT4_DEVICE_ID)
+		port_count = 4;
+
+	/* allocate device instances for all ports */
+	for (i=0; i < port_count; ++i) {
+		port_array[i] = alloc_dev(adapter_num, i, pdev);
+		if (port_array[i] == NULL) {
+			for (--i; i >= 0; --i)
+				kfree(port_array[i]);
+			return;
+		}
+	}
+
+	/* give copy of port_array to all ports and add to device list  */
+	for (i=0; i < port_count; ++i) {
+		memcpy(port_array[i]->port_array, port_array, sizeof(port_array));
+		add_device(port_array[i]);
+		port_array[i]->port_count = port_count;
+		spin_lock_init(&port_array[i]->lock);
+	}
+
+	/* Allocate and claim adapter resources */
+	if (!claim_resources(port_array[0])) {
+
+		alloc_dma_bufs(port_array[0]);
+
+		/* copy resource information from first port to others */
+		for (i = 1; i < port_count; ++i) {
+			port_array[i]->lock      = port_array[0]->lock;
+			port_array[i]->irq_level = port_array[0]->irq_level;
+			port_array[i]->reg_addr  = port_array[0]->reg_addr;
+			alloc_dma_bufs(port_array[i]);
+		}
+
+		if (request_irq(port_array[0]->irq_level,
+					slgt_interrupt,
+					port_array[0]->irq_flags,
+					port_array[0]->device_name,
+					port_array[0]) < 0) {
+			DBGERR(("%s request_irq failed IRQ=%d\n",
+				port_array[0]->device_name,
+				port_array[0]->irq_level));
+		} else {
+			port_array[0]->irq_requested = 1;
+			adapter_test(port_array[0]);
+			for (i=1 ; i < port_count ; i++)
+				port_array[i]->init_error = port_array[0]->init_error;
+		}
+	}
+}
+
+static int __devinit init_one(struct pci_dev *dev,
+			      const struct pci_device_id *ent)
+{
+	if (pci_enable_device(dev)) {
+		printk("error enabling pci device %p\n", dev);
+		return -EIO;
+	}
+	pci_set_master(dev);
+	device_init(slgt_device_count, dev);
+	return 0;
+}
+
+static void __devexit remove_one(struct pci_dev *dev)
+{
+}
+
+static struct tty_operations ops = {
+	.open = open,
+	.close = close,
+	.write = write,
+	.put_char = put_char,
+	.flush_chars = flush_chars,
+	.write_room = write_room,
+	.chars_in_buffer = chars_in_buffer,
+	.flush_buffer = flush_buffer,
+	.ioctl = ioctl,
+	.throttle = throttle,
+	.unthrottle = unthrottle,
+	.send_xchar = send_xchar,
+	.break_ctl = set_break,
+	.wait_until_sent = wait_until_sent,
+ 	.read_proc = read_proc,
+	.set_termios = set_termios,
+	.stop = tx_hold,
+	.start = tx_release,
+	.hangup = hangup,
+	.tiocmget = tiocmget,
+	.tiocmset = tiocmset,
+};
+
+static void slgt_cleanup(void)
+{
+	int rc;
+	struct slgt_info *info;
+	struct slgt_info *tmp;
+
+	printk("unload %s %s\n", driver_name, driver_version);
+
+	if (serial_driver) {
+		if ((rc = tty_unregister_driver(serial_driver)))
+			DBGERR(("tty_unregister_driver error=%d\n", rc));
+		put_tty_driver(serial_driver);
+	}
+
+	/* reset devices */
+	info = slgt_device_list;
+	while(info) {
+		reset_port(info);
+		info = info->next_device;
+	}
+
+	/* release devices */
+	info = slgt_device_list;
+	while(info) {
+#ifdef CONFIG_HDLC
+		hdlcdev_exit(info);
+#endif
+		free_dma_bufs(info);
+		free_tmp_rbuf(info);
+		if (info->port_num == 0)
+			release_resources(info);
+		tmp = info;
+		info = info->next_device;
+		kfree(tmp);
+	}
+
+	if (pci_registered)
+		pci_unregister_driver(&pci_driver);
+}
+
+/*
+ *  Driver initialization entry point.
+ */
+static int __init slgt_init(void)
+{
+	int rc;
+
+ 	printk("%s %s\n", driver_name, driver_version);
+
+	slgt_device_count = 0;
+	if ((rc = pci_register_driver(&pci_driver)) < 0) {
+		printk("%s pci_register_driver error=%d\n", driver_name, rc);
+		return rc;
+	}
+	pci_registered = 1;
+
+	if (!slgt_device_list) {
+		printk("%s no devices found\n",driver_name);
+		return -ENODEV;
+	}
+
+	serial_driver = alloc_tty_driver(MAX_DEVICES);
+	if (!serial_driver) {
+		rc = -ENOMEM;
+		goto error;
+	}
+
+	/* Initialize the tty_driver structure */
+
+	serial_driver->owner = THIS_MODULE;
+	serial_driver->driver_name = tty_driver_name;
+	serial_driver->name = tty_dev_prefix;
+	serial_driver->major = ttymajor;
+	serial_driver->minor_start = 64;
+	serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
+	serial_driver->subtype = SERIAL_TYPE_NORMAL;
+	serial_driver->init_termios = tty_std_termios;
+	serial_driver->init_termios.c_cflag =
+		B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	serial_driver->flags = TTY_DRIVER_REAL_RAW;
+	tty_set_operations(serial_driver, &ops);
+	if ((rc = tty_register_driver(serial_driver)) < 0) {
+		DBGERR(("%s can't register serial driver\n", driver_name));
+		put_tty_driver(serial_driver);
+		serial_driver = NULL;
+		goto error;
+	}
+
+ 	printk("%s %s, tty major#%d\n",
+		driver_name, driver_version,
+		serial_driver->major);
+
+	return 0;
+
+error:
+	slgt_cleanup();
+	return rc;
+}
+
+static void __exit slgt_exit(void)
+{
+	slgt_cleanup();
+}
+
+module_init(slgt_init);
+module_exit(slgt_exit);
+
+/*
+ * register access routines
+ */
+
+#define CALC_REGADDR() \
+	unsigned long reg_addr = ((unsigned long)info->reg_addr) + addr; \
+	if (addr >= 0x80) \
+		reg_addr += (info->port_num) * 32;
+
+static __u8 rd_reg8(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readb((void __iomem *)reg_addr);
+}
+
+static void wr_reg8(struct slgt_info *info, unsigned int addr, __u8 value)
+{
+	CALC_REGADDR();
+	writeb(value, (void __iomem *)reg_addr);
+}
+
+static __u16 rd_reg16(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readw((void __iomem *)reg_addr);
+}
+
+static void wr_reg16(struct slgt_info *info, unsigned int addr, __u16 value)
+{
+	CALC_REGADDR();
+	writew(value, (void __iomem *)reg_addr);
+}
+
+static __u32 rd_reg32(struct slgt_info *info, unsigned int addr)
+{
+	CALC_REGADDR();
+	return readl((void __iomem *)reg_addr);
+}
+
+static void wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value)
+{
+	CALC_REGADDR();
+	writel(value, (void __iomem *)reg_addr);
+}
+
+static void rdma_reset(struct slgt_info *info)
+{
+	unsigned int i;
+
+	/* set reset bit */
+	wr_reg32(info, RDCSR, BIT1);
+
+	/* wait for enable bit cleared */
+	for(i=0 ; i < 1000 ; i++)
+		if (!(rd_reg32(info, RDCSR) & BIT0))
+			break;
+}
+
+static void tdma_reset(struct slgt_info *info)
+{
+	unsigned int i;
+
+	/* set reset bit */
+	wr_reg32(info, TDCSR, BIT1);
+
+	/* wait for enable bit cleared */
+	for(i=0 ; i < 1000 ; i++)
+		if (!(rd_reg32(info, TDCSR) & BIT0))
+			break;
+}
+
+/*
+ * enable internal loopback
+ * TxCLK and RxCLK are generated from BRG
+ * and TxD is looped back to RxD internally.
+ */
+static void enable_loopback(struct slgt_info *info)
+{
+	/* SCR (serial control) BIT2=looopback enable */
+	wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT2));
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		/* CCR (clock control)
+		 * 07..05  tx clock source (010 = BRG)
+		 * 04..02  rx clock source (010 = BRG)
+		 * 01      auxclk enable   (0 = disable)
+		 * 00      BRG enable      (1 = enable)
+		 *
+		 * 0100 1001
+		 */
+		wr_reg8(info, CCR, 0x49);
+
+		/* set speed if available, otherwise use default */
+		if (info->params.clock_speed)
+			set_rate(info, info->params.clock_speed);
+		else
+			set_rate(info, 3686400);
+	}
+}
+
+/*
+ *  set baud rate generator to specified rate
+ */
+static void set_rate(struct slgt_info *info, u32 rate)
+{
+	unsigned int div;
+	static unsigned int osc = 14745600;
+
+	/* div = osc/rate - 1
+	 *
+	 * Round div up if osc/rate is not integer to
+	 * force to next slowest rate.
+	 */
+
+	if (rate) {
+		div = osc/rate;
+		if (!(osc % rate) && div)
+			div--;
+		wr_reg16(info, BDR, (unsigned short)div);
+	}
+}
+
+static void rx_stop(struct slgt_info *info)
+{
+	unsigned short val;
+
+	/* disable and reset receiver */
+	val = rd_reg16(info, RCR) & ~BIT1;          /* clear enable bit */
+	wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, RCR, val);                  /* clear reset bit */
+
+	slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA + IRQ_RXIDLE);
+
+	/* clear pending rx interrupts */
+	wr_reg16(info, SSR, IRQ_RXIDLE + IRQ_RXOVER);
+
+	rdma_reset(info);
+
+	info->rx_enabled = 0;
+	info->rx_restart = 0;
+}
+
+static void rx_start(struct slgt_info *info)
+{
+	unsigned short val;
+
+	slgt_irq_off(info, IRQ_RXOVER + IRQ_RXDATA);
+
+	/* clear pending rx overrun IRQ */
+	wr_reg16(info, SSR, IRQ_RXOVER);
+
+	/* reset and disable receiver */
+	val = rd_reg16(info, RCR) & ~BIT1; /* clear enable bit */
+	wr_reg16(info, RCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, RCR, val);                  /* clear reset bit */
+
+	rdma_reset(info);
+	reset_rbufs(info);
+
+	/* set 1st descriptor address */
+	wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
+
+	if (info->params.mode != MGSL_MODE_ASYNC) {
+		/* enable rx DMA and DMA interrupt */
+		wr_reg32(info, RDCSR, (BIT2 + BIT0));
+	} else {
+		/* enable saving of rx status, rx DMA and DMA interrupt */
+		wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+	}
+
+	slgt_irq_on(info, IRQ_RXOVER);
+
+	/* enable receiver */
+	wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | BIT1));
+
+	info->rx_restart = 0;
+	info->rx_enabled = 1;
+}
+
+static void tx_start(struct slgt_info *info)
+{
+	if (!info->tx_enabled) {
+		wr_reg16(info, TCR,
+			(unsigned short)(rd_reg16(info, TCR) | BIT1));
+		info->tx_enabled = TRUE;
+	}
+
+	if (info->tx_count) {
+		info->drop_rts_on_tx_done = 0;
+
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			if (info->params.flags & HDLC_FLAG_AUTO_RTS) {
+				get_signals(info);
+				if (!(info->signals & SerialSignal_RTS)) {
+					info->signals |= SerialSignal_RTS;
+					set_signals(info);
+					info->drop_rts_on_tx_done = 1;
+				}
+			}
+
+			slgt_irq_off(info, IRQ_TXDATA);
+			slgt_irq_on(info, IRQ_TXUNDER + IRQ_TXIDLE);
+			/* clear tx idle and underrun status bits */
+			wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
+
+			if (!(rd_reg32(info, TDCSR) & BIT0)) {
+				/* tx DMA stopped, restart tx DMA */
+				tdma_reset(info);
+				/* set 1st descriptor address */
+				wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
+				if (info->params.mode == MGSL_MODE_RAW)
+					wr_reg32(info, TDCSR, BIT2 + BIT0); /* IRQ + DMA enable */
+				else
+					wr_reg32(info, TDCSR, BIT0); /* DMA enable */
+			}
+
+			if (info->params.mode != MGSL_MODE_RAW) {
+				info->tx_timer.expires = jiffies + msecs_to_jiffies(5000);
+				add_timer(&info->tx_timer);
+			}
+		} else {
+			tdma_reset(info);
+			/* set 1st descriptor address */
+			wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
+
+			slgt_irq_off(info, IRQ_TXDATA);
+			slgt_irq_on(info, IRQ_TXIDLE);
+			/* clear tx idle status bit */
+			wr_reg16(info, SSR, IRQ_TXIDLE);
+
+			/* enable tx DMA */
+			wr_reg32(info, TDCSR, BIT0);
+		}
+
+		info->tx_active = 1;
+	}
+}
+
+static void tx_stop(struct slgt_info *info)
+{
+	unsigned short val;
+
+	del_timer(&info->tx_timer);
+
+	tdma_reset(info);
+
+	/* reset and disable transmitter */
+	val = rd_reg16(info, TCR) & ~BIT1;          /* clear enable bit */
+	wr_reg16(info, TCR, (unsigned short)(val | BIT2)); /* set reset bit */
+	wr_reg16(info, TCR, val);                  /* clear reset */
+
+	slgt_irq_off(info, IRQ_TXDATA + IRQ_TXIDLE + IRQ_TXUNDER);
+
+	/* clear tx idle and underrun status bit */
+	wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
+
+	reset_tbufs(info);
+
+	info->tx_enabled = 0;
+	info->tx_active  = 0;
+}
+
+static void reset_port(struct slgt_info *info)
+{
+	if (!info->reg_addr)
+		return;
+
+	tx_stop(info);
+	rx_stop(info);
+
+	info->signals &= ~(SerialSignal_DTR + SerialSignal_RTS);
+	set_signals(info);
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+}
+
+static void reset_adapter(struct slgt_info *info)
+{
+	int i;
+	for (i=0; i < info->port_count; ++i) {
+		if (info->port_array[i])
+			reset_port(info->port_array[i]);
+	}
+}
+
+static void async_mode(struct slgt_info *info)
+{
+  	unsigned short val;
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+	tx_stop(info);
+	rx_stop(info);
+
+	/* TCR (tx control)
+	 *
+	 * 15..13  mode, 010=async
+	 * 12..10  encoding, 000=NRZ
+	 * 09      parity enable
+	 * 08      1=odd parity, 0=even parity
+	 * 07      1=RTS driver control
+	 * 06      1=break enable
+	 * 05..04  character length
+	 *         00=5 bits
+	 *         01=6 bits
+	 *         10=7 bits
+	 *         11=8 bits
+	 * 03      0=1 stop bit, 1=2 stop bits
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-CTS enable
+	 */
+	val = 0x4000;
+
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+
+	if (info->params.parity != ASYNC_PARITY_NONE) {
+		val |= BIT9;
+		if (info->params.parity == ASYNC_PARITY_ODD)
+			val |= BIT8;
+	}
+
+	switch (info->params.data_bits)
+	{
+	case 6: val |= BIT4; break;
+	case 7: val |= BIT5; break;
+	case 8: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.stop_bits != 1)
+		val |= BIT3;
+
+	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
+		val |= BIT0;
+
+	wr_reg16(info, TCR, val);
+
+	/* RCR (rx control)
+	 *
+	 * 15..13  mode, 010=async
+	 * 12..10  encoding, 000=NRZ
+	 * 09      parity enable
+	 * 08      1=odd parity, 0=even parity
+	 * 07..06  reserved, must be 0
+	 * 05..04  character length
+	 *         00=5 bits
+	 *         01=6 bits
+	 *         10=7 bits
+	 *         11=8 bits
+	 * 03      reserved, must be zero
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-DCD enable
+	 */
+	val = 0x4000;
+
+	if (info->params.parity != ASYNC_PARITY_NONE) {
+		val |= BIT9;
+		if (info->params.parity == ASYNC_PARITY_ODD)
+			val |= BIT8;
+	}
+
+	switch (info->params.data_bits)
+	{
+	case 6: val |= BIT4; break;
+	case 7: val |= BIT5; break;
+	case 8: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
+		val |= BIT0;
+
+	wr_reg16(info, RCR, val);
+
+	/* CCR (clock control)
+	 *
+	 * 07..05  011 = tx clock source is BRG/16
+	 * 04..02  010 = rx clock source is BRG
+	 * 01      0 = auxclk disabled
+	 * 00      1 = BRG enabled
+	 *
+	 * 0110 1001
+	 */
+	wr_reg8(info, CCR, 0x69);
+
+	msc_set_vcr(info);
+
+	tx_set_idle(info);
+
+	/* SCR (serial control)
+	 *
+	 * 15  1=tx req on FIFO half empty
+	 * 14  1=rx req on FIFO half full
+	 * 13  tx data  IRQ enable
+	 * 12  tx idle  IRQ enable
+	 * 11  rx break on IRQ enable
+	 * 10  rx data  IRQ enable
+	 * 09  rx break off IRQ enable
+	 * 08  overrun  IRQ enable
+	 * 07  DSR      IRQ enable
+	 * 06  CTS      IRQ enable
+	 * 05  DCD      IRQ enable
+	 * 04  RI       IRQ enable
+	 * 03  reserved, must be zero
+	 * 02  1=txd->rxd internal loopback enable
+	 * 01  reserved, must be zero
+	 * 00  1=master IRQ enable
+	 */
+	val = BIT15 + BIT14 + BIT0;
+	wr_reg16(info, SCR, val);
+
+	slgt_irq_on(info, IRQ_RXBREAK | IRQ_RXOVER);
+
+	set_rate(info, info->params.data_rate * 16);
+
+	if (info->params.loopback)
+		enable_loopback(info);
+}
+
+static void hdlc_mode(struct slgt_info *info)
+{
+	unsigned short val;
+
+	slgt_irq_off(info, IRQ_ALL | IRQ_MASTER);
+	tx_stop(info);
+	rx_stop(info);
+
+	/* TCR (tx control)
+	 *
+	 * 15..13  mode, 000=HDLC 001=raw sync
+	 * 12..10  encoding
+	 * 09      CRC enable
+	 * 08      CRC32
+	 * 07      1=RTS driver control
+	 * 06      preamble enable
+	 * 05..04  preamble length
+	 * 03      share open/close flag
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-CTS enable
+	 */
+	val = 0;
+
+	if (info->params.mode == MGSL_MODE_RAW)
+		val |= BIT13;
+	if (info->if_mode & MGSL_INTERFACE_RTS_EN)
+		val |= BIT7;
+
+	switch(info->params.encoding)
+	{
+	case HDLC_ENCODING_NRZB:          val |= BIT10; break;
+	case HDLC_ENCODING_NRZI_MARK:     val |= BIT11; break;
+	case HDLC_ENCODING_NRZI:          val |= BIT11 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_MARK:  val |= BIT12; break;
+	case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break;
+	case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break;
+	}
+
+	switch (info->params.crc_type)
+	{
+	case HDLC_CRC_16_CCITT: val |= BIT9; break;
+	case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; break;
+	}
+
+	if (info->params.preamble != HDLC_PREAMBLE_PATTERN_NONE)
+		val |= BIT6;
+
+	switch (info->params.preamble_length)
+	{
+	case HDLC_PREAMBLE_LENGTH_16BITS: val |= BIT5; break;
+	case HDLC_PREAMBLE_LENGTH_32BITS: val |= BIT4; break;
+	case HDLC_PREAMBLE_LENGTH_64BITS: val |= BIT5 + BIT4; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_CTS)
+		val |= BIT0;
+
+	wr_reg16(info, TCR, val);
+
+	/* TPR (transmit preamble) */
+
+	switch (info->params.preamble)
+	{
+	case HDLC_PREAMBLE_PATTERN_FLAGS: val = 0x7e; break;
+	case HDLC_PREAMBLE_PATTERN_ONES:  val = 0xff; break;
+	case HDLC_PREAMBLE_PATTERN_ZEROS: val = 0x00; break;
+	case HDLC_PREAMBLE_PATTERN_10:    val = 0x55; break;
+	case HDLC_PREAMBLE_PATTERN_01:    val = 0xaa; break;
+	default:                          val = 0x7e; break;
+	}
+	wr_reg8(info, TPR, (unsigned char)val);
+
+	/* RCR (rx control)
+	 *
+	 * 15..13  mode, 000=HDLC 001=raw sync
+	 * 12..10  encoding
+	 * 09      CRC enable
+	 * 08      CRC32
+	 * 07..03  reserved, must be 0
+	 * 02      reset
+	 * 01      enable
+	 * 00      auto-DCD enable
+	 */
+	val = 0;
+
+	if (info->params.mode == MGSL_MODE_RAW)
+		val |= BIT13;
+
+	switch(info->params.encoding)
+	{
+	case HDLC_ENCODING_NRZB:          val |= BIT10; break;
+	case HDLC_ENCODING_NRZI_MARK:     val |= BIT11; break;
+	case HDLC_ENCODING_NRZI:          val |= BIT11 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_MARK:  val |= BIT12; break;
+	case HDLC_ENCODING_BIPHASE_SPACE: val |= BIT12 + BIT10; break;
+	case HDLC_ENCODING_BIPHASE_LEVEL: val |= BIT12 + BIT11; break;
+	case HDLC_ENCODING_DIFF_BIPHASE_LEVEL: val |= BIT12 + BIT11 + BIT10; break;
+	}
+
+	switch (info->params.crc_type)
+	{
+	case HDLC_CRC_16_CCITT: val |= BIT9; break;
+	case HDLC_CRC_32_CCITT: val |= BIT9 + BIT8; break;
+	}
+
+	if (info->params.flags & HDLC_FLAG_AUTO_DCD)
+		val |= BIT0;
+
+	wr_reg16(info, RCR, val);
+
+	/* CCR (clock control)
+	 *
+	 * 07..05  tx clock source
+	 * 04..02  rx clock source
+	 * 01      auxclk enable
+	 * 00      BRG enable
+	 */
+	val = 0;
+
+	if (info->params.flags & HDLC_FLAG_TXC_BRG)
+	{
+		// when RxC source is DPLL, BRG generates 16X DPLL
+		// reference clock, so take TxC from BRG/16 to get
+		// transmit clock at actual data rate
+		if (info->params.flags & HDLC_FLAG_RXC_DPLL)
+			val |= BIT6 + BIT5;	/* 011, txclk = BRG/16 */
+		else
+			val |= BIT6;	/* 010, txclk = BRG */
+	}
+	else if (info->params.flags & HDLC_FLAG_TXC_DPLL)
+		val |= BIT7;	/* 100, txclk = DPLL Input */
+	else if (info->params.flags & HDLC_FLAG_TXC_RXCPIN)
+		val |= BIT5;	/* 001, txclk = RXC Input */
+
+	if (info->params.flags & HDLC_FLAG_RXC_BRG)
+		val |= BIT3;	/* 010, rxclk = BRG */
+	else if (info->params.flags & HDLC_FLAG_RXC_DPLL)
+		val |= BIT4;	/* 100, rxclk = DPLL */
+	else if (info->params.flags & HDLC_FLAG_RXC_TXCPIN)
+		val |= BIT2;	/* 001, rxclk = TXC Input */
+
+	if (info->params.clock_speed)
+		val |= BIT1 + BIT0;
+
+	wr_reg8(info, CCR, (unsigned char)val);
+
+	if (info->params.flags & (HDLC_FLAG_TXC_DPLL + HDLC_FLAG_RXC_DPLL))
+	{
+		// program DPLL mode
+		switch(info->params.encoding)
+		{
+		case HDLC_ENCODING_BIPHASE_MARK:
+		case HDLC_ENCODING_BIPHASE_SPACE:
+			val = BIT7; break;
+		case HDLC_ENCODING_BIPHASE_LEVEL:
+		case HDLC_ENCODING_DIFF_BIPHASE_LEVEL:
+			val = BIT7 + BIT6; break;
+		default: val = BIT6;	// NRZ encodings
+		}
+		wr_reg16(info, RCR, (unsigned short)(rd_reg16(info, RCR) | val));
+
+		// DPLL requires a 16X reference clock from BRG
+		set_rate(info, info->params.clock_speed * 16);
+	}
+	else
+		set_rate(info, info->params.clock_speed);
+
+	tx_set_idle(info);
+
+	msc_set_vcr(info);
+
+	/* SCR (serial control)
+	 *
+	 * 15  1=tx req on FIFO half empty
+	 * 14  1=rx req on FIFO half full
+	 * 13  tx data  IRQ enable
+	 * 12  tx idle  IRQ enable
+	 * 11  underrun IRQ enable
+	 * 10  rx data  IRQ enable
+	 * 09  rx idle  IRQ enable
+	 * 08  overrun  IRQ enable
+	 * 07  DSR      IRQ enable
+	 * 06  CTS      IRQ enable
+	 * 05  DCD      IRQ enable
+	 * 04  RI       IRQ enable
+	 * 03  reserved, must be zero
+	 * 02  1=txd->rxd internal loopback enable
+	 * 01  reserved, must be zero
+	 * 00  1=master IRQ enable
+	 */
+	wr_reg16(info, SCR, BIT15 + BIT14 + BIT0);
+
+	if (info->params.loopback)
+		enable_loopback(info);
+}
+
+/*
+ *  set transmit idle mode
+ */
+static void tx_set_idle(struct slgt_info *info)
+{
+	unsigned char val = 0xff;
+
+	switch(info->idle_mode)
+	{
+	case HDLC_TXIDLE_FLAGS:          val = 0x7e; break;
+	case HDLC_TXIDLE_ALT_ZEROS_ONES: val = 0xaa; break;
+	case HDLC_TXIDLE_ZEROS:          val = 0x00; break;
+	case HDLC_TXIDLE_ONES:           val = 0xff; break;
+	case HDLC_TXIDLE_ALT_MARK_SPACE: val = 0xaa; break;
+	case HDLC_TXIDLE_SPACE:          val = 0x00; break;
+	case HDLC_TXIDLE_MARK:           val = 0xff; break;
+	}
+
+	wr_reg8(info, TIR, val);
+}
+
+/*
+ * get state of V24 status (input) signals
+ */
+static void get_signals(struct slgt_info *info)
+{
+	unsigned short status = rd_reg16(info, SSR);
+
+	/* clear all serial signals except DTR and RTS */
+	info->signals &= SerialSignal_DTR + SerialSignal_RTS;
+
+	if (status & BIT3)
+		info->signals |= SerialSignal_DSR;
+	if (status & BIT2)
+		info->signals |= SerialSignal_CTS;
+	if (status & BIT1)
+		info->signals |= SerialSignal_DCD;
+	if (status & BIT0)
+		info->signals |= SerialSignal_RI;
+}
+
+/*
+ * set V.24 Control Register based on current configuration
+ */
+static void msc_set_vcr(struct slgt_info *info)
+{
+	unsigned char val = 0;
+
+	/* VCR (V.24 control)
+	 *
+	 * 07..04  serial IF select
+	 * 03      DTR
+	 * 02      RTS
+	 * 01      LL
+	 * 00      RL
+	 */
+
+	switch(info->if_mode & MGSL_INTERFACE_MASK)
+	{
+	case MGSL_INTERFACE_RS232:
+		val |= BIT5; /* 0010 */
+		break;
+	case MGSL_INTERFACE_V35:
+		val |= BIT7 + BIT6 + BIT5; /* 1110 */
+		break;
+	case MGSL_INTERFACE_RS422:
+		val |= BIT6; /* 0100 */
+		break;
+	}
+
+	if (info->signals & SerialSignal_DTR)
+		val |= BIT3;
+	if (info->signals & SerialSignal_RTS)
+		val |= BIT2;
+	if (info->if_mode & MGSL_INTERFACE_LL)
+		val |= BIT1;
+	if (info->if_mode & MGSL_INTERFACE_RL)
+		val |= BIT0;
+	wr_reg8(info, VCR, val);
+}
+
+/*
+ * set state of V24 control (output) signals
+ */
+static void set_signals(struct slgt_info *info)
+{
+	unsigned char val = rd_reg8(info, VCR);
+	if (info->signals & SerialSignal_DTR)
+		val |= BIT3;
+	else
+		val &= ~BIT3;
+	if (info->signals & SerialSignal_RTS)
+		val |= BIT2;
+	else
+		val &= ~BIT2;
+	wr_reg8(info, VCR, val);
+}
+
+/*
+ * free range of receive DMA buffers (i to last)
+ */
+static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last)
+{
+	int done = 0;
+
+	while(!done) {
+		/* reset current buffer for reuse */
+		info->rbufs[i].status = 0;
+		if (info->params.mode == MGSL_MODE_RAW)
+			set_desc_count(info->rbufs[i], info->raw_rx_size);
+		else
+			set_desc_count(info->rbufs[i], DMABUFSIZE);
+
+		if (i == last)
+			done = 1;
+		if (++i == info->rbuf_count)
+			i = 0;
+	}
+	info->rbuf_current = i;
+}
+
+/*
+ * mark all receive DMA buffers as free
+ */
+static void reset_rbufs(struct slgt_info *info)
+{
+	free_rbufs(info, 0, info->rbuf_count - 1);
+}
+
+/*
+ * pass receive HDLC frame to upper layer
+ *
+ * return 1 if frame available, otherwise 0
+ */
+static int rx_get_frame(struct slgt_info *info)
+{
+	unsigned int start, end;
+	unsigned short status;
+	unsigned int framesize = 0;
+	int rc = 0;
+	unsigned long flags;
+	struct tty_struct *tty = info->tty;
+	unsigned char addr_field = 0xff;
+
+check_again:
+
+	framesize = 0;
+	addr_field = 0xff;
+	start = end = info->rbuf_current;
+
+	for (;;) {
+		if (!desc_complete(info->rbufs[end]))
+			goto cleanup;
+
+		if (framesize == 0 && info->params.addr_filter != 0xff)
+			addr_field = info->rbufs[end].buf[0];
+
+		framesize += desc_count(info->rbufs[end]);
+
+		if (desc_eof(info->rbufs[end]))
+			break;
+
+		if (++end == info->rbuf_count)
+			end = 0;
+
+		if (end == info->rbuf_current) {
+			if (info->rx_enabled){
+				spin_lock_irqsave(&info->lock,flags);
+				rx_start(info);
+				spin_unlock_irqrestore(&info->lock,flags);
+			}
+			goto cleanup;
+		}
+	}
+
+	/* status
+	 *
+	 * 15      buffer complete
+	 * 14..06  reserved
+	 * 05..04  residue
+	 * 02      eof (end of frame)
+	 * 01      CRC error
+	 * 00      abort
+	 */
+	status = desc_status(info->rbufs[end]);
+
+	/* ignore CRC bit if not using CRC (bit is undefined) */
+	if (info->params.crc_type == HDLC_CRC_NONE)
+		status &= ~BIT1;
+
+	if (framesize == 0 ||
+		 (addr_field != 0xff && addr_field != info->params.addr_filter)) {
+		free_rbufs(info, start, end);
+		goto check_again;
+	}
+
+	if (framesize < 2 || status & (BIT1+BIT0)) {
+		if (framesize < 2 || (status & BIT0))
+			info->icount.rxshort++;
+		else
+			info->icount.rxcrc++;
+		framesize = 0;
+
+#ifdef CONFIG_HDLC
+		{
+			struct net_device_stats *stats = hdlc_stats(info->netdev);
+			stats->rx_errors++;
+			stats->rx_frame_errors++;
+		}
+#endif
+	} else {
+		/* adjust frame size for CRC, if any */
+		if (info->params.crc_type == HDLC_CRC_16_CCITT)
+			framesize -= 2;
+		else if (info->params.crc_type == HDLC_CRC_32_CCITT)
+			framesize -= 4;
+	}
+
+	DBGBH(("%s rx frame status=%04X size=%d\n",
+		info->device_name, status, framesize));
+	DBGDATA(info, info->rbufs[start].buf, min_t(int, framesize, DMABUFSIZE), "rx");
+
+	if (framesize) {
+		if (framesize > info->max_frame_size)
+			info->icount.rxlong++;
+		else {
+			/* copy dma buffer(s) to contiguous temp buffer */
+			int copy_count = framesize;
+			int i = start;
+			unsigned char *p = info->tmp_rbuf;
+			info->tmp_rbuf_count = framesize;
+
+			info->icount.rxok++;
+
+			while(copy_count) {
+				int partial_count = min(copy_count, DMABUFSIZE);
+				memcpy(p, info->rbufs[i].buf, partial_count);
+				p += partial_count;
+				copy_count -= partial_count;
+				if (++i == info->rbuf_count)
+					i = 0;
+			}
+
+#ifdef CONFIG_HDLC
+			if (info->netcount)
+				hdlcdev_rx(info,info->tmp_rbuf, framesize);
+			else
+#endif
+				ldisc_receive_buf(tty, info->tmp_rbuf, info->flag_buf, framesize);
+		}
+	}
+	free_rbufs(info, start, end);
+	rc = 1;
+
+cleanup:
+	return rc;
+}
+
+/*
+ * pass receive buffer (RAW synchronous mode) to tty layer
+ * return 1 if buffer available, otherwise 0
+ */
+static int rx_get_buf(struct slgt_info *info)
+{
+	unsigned int i = info->rbuf_current;
+
+	if (!desc_complete(info->rbufs[i]))
+		return 0;
+	DBGDATA(info, info->rbufs[i].buf, desc_count(info->rbufs[i]), "rx");
+	DBGINFO(("rx_get_buf size=%d\n", desc_count(info->rbufs[i])));
+	ldisc_receive_buf(info->tty, info->rbufs[i].buf,
+			  info->flag_buf, desc_count(info->rbufs[i]));
+	free_rbufs(info, i, i);
+	return 1;
+}
+
+static void reset_tbufs(struct slgt_info *info)
+{
+	unsigned int i;
+	info->tbuf_current = 0;
+	for (i=0 ; i < info->tbuf_count ; i++) {
+		info->tbufs[i].status = 0;
+		info->tbufs[i].count  = 0;
+	}
+}
+
+/*
+ * return number of free transmit DMA buffers
+ */
+static unsigned int free_tbuf_count(struct slgt_info *info)
+{
+	unsigned int count = 0;
+	unsigned int i = info->tbuf_current;
+
+	do
+	{
+		if (desc_count(info->tbufs[i]))
+			break; /* buffer in use */
+		++count;
+		if (++i == info->tbuf_count)
+			i=0;
+	} while (i != info->tbuf_current);
+
+	/* last buffer with zero count may be in use, assume it is */
+	if (count)
+		--count;
+
+	return count;
+}
+
+/*
+ * load transmit DMA buffer(s) with data
+ */
+static void tx_load(struct slgt_info *info, const char *buf, unsigned int size)
+{
+	unsigned short count;
+	unsigned int i;
+	struct slgt_desc *d;
+
+	if (size == 0)
+		return;
+
+	DBGDATA(info, buf, size, "tx");
+
+	info->tbuf_start = i = info->tbuf_current;
+
+	while (size) {
+		d = &info->tbufs[i];
+		if (++i == info->tbuf_count)
+			i = 0;
+
+		count = (unsigned short)((size > DMABUFSIZE) ? DMABUFSIZE : size);
+		memcpy(d->buf, buf, count);
+
+		size -= count;
+		buf  += count;
+
+		if (!size && info->params.mode != MGSL_MODE_RAW)
+			set_desc_eof(*d, 1); /* HDLC: set EOF of last desc */
+		else
+			set_desc_eof(*d, 0);
+
+		set_desc_count(*d, count);
+	}
+
+	info->tbuf_current = i;
+}
+
+static int register_test(struct slgt_info *info)
+{
+	static unsigned short patterns[] =
+		{0x0000, 0xffff, 0xaaaa, 0x5555, 0x6969, 0x9696};
+	static unsigned int count = sizeof(patterns)/sizeof(patterns[0]);
+	unsigned int i;
+	int rc = 0;
+
+	for (i=0 ; i < count ; i++) {
+		wr_reg16(info, TIR, patterns[i]);
+		wr_reg16(info, BDR, patterns[(i+1)%count]);
+		if ((rd_reg16(info, TIR) != patterns[i]) ||
+		    (rd_reg16(info, BDR) != patterns[(i+1)%count])) {
+			rc = -ENODEV;
+			break;
+		}
+	}
+
+	info->init_error = rc ? 0 : DiagStatus_AddressFailure;
+	return rc;
+}
+
+static int irq_test(struct slgt_info *info)
+{
+	unsigned long timeout;
+	unsigned long flags;
+	struct tty_struct *oldtty = info->tty;
+	u32 speed = info->params.data_rate;
+
+	info->params.data_rate = 921600;
+	info->tty = NULL;
+
+	spin_lock_irqsave(&info->lock, flags);
+	async_mode(info);
+	slgt_irq_on(info, IRQ_TXIDLE);
+
+	/* enable transmitter */
+	wr_reg16(info, TCR,
+		(unsigned short)(rd_reg16(info, TCR) | BIT1));
+
+	/* write one byte and wait for tx idle */
+	wr_reg16(info, TDR, 0);
+
+	/* assume failure */
+	info->init_error = DiagStatus_IrqFailure;
+	info->irq_occurred = FALSE;
+
+	spin_unlock_irqrestore(&info->lock, flags);
+
+	timeout=100;
+	while(timeout-- && !info->irq_occurred)
+		msleep_interruptible(10);
+
+	spin_lock_irqsave(&info->lock,flags);
+	reset_port(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	info->params.data_rate = speed;
+	info->tty = oldtty;
+
+	info->init_error = info->irq_occurred ? 0 : DiagStatus_IrqFailure;
+	return info->irq_occurred ? 0 : -ENODEV;
+}
+
+static int loopback_test_rx(struct slgt_info *info)
+{
+	unsigned char *src, *dest;
+	int count;
+
+	if (desc_complete(info->rbufs[0])) {
+		count = desc_count(info->rbufs[0]);
+		src   = info->rbufs[0].buf;
+		dest  = info->tmp_rbuf;
+
+		for( ; count ; count-=2, src+=2) {
+			/* src=data byte (src+1)=status byte */
+			if (!(*(src+1) & (BIT9 + BIT8))) {
+				*dest = *src;
+				dest++;
+				info->tmp_rbuf_count++;
+			}
+		}
+		DBGDATA(info, info->tmp_rbuf, info->tmp_rbuf_count, "rx");
+		return 1;
+	}
+	return 0;
+}
+
+static int loopback_test(struct slgt_info *info)
+{
+#define TESTFRAMESIZE 20
+
+	unsigned long timeout;
+	u16 count = TESTFRAMESIZE;
+	unsigned char buf[TESTFRAMESIZE];
+	int rc = -ENODEV;
+	unsigned long flags;
+
+	struct tty_struct *oldtty = info->tty;
+	MGSL_PARAMS params;
+
+	memcpy(&params, &info->params, sizeof(params));
+
+	info->params.mode = MGSL_MODE_ASYNC;
+	info->params.data_rate = 921600;
+	info->params.loopback = 1;
+	info->tty = NULL;
+
+	/* build and send transmit frame */
+	for (count = 0; count < TESTFRAMESIZE; ++count)
+		buf[count] = (unsigned char)count;
+
+	info->tmp_rbuf_count = 0;
+	memset(info->tmp_rbuf, 0, TESTFRAMESIZE);
+
+	/* program hardware for HDLC and enabled receiver */
+	spin_lock_irqsave(&info->lock,flags);
+	async_mode(info);
+	rx_start(info);
+	info->tx_count = count;
+	tx_load(info, buf, count);
+	tx_start(info);
+	spin_unlock_irqrestore(&info->lock, flags);
+
+	/* wait for receive complete */
+	for (timeout = 100; timeout; --timeout) {
+		msleep_interruptible(10);
+		if (loopback_test_rx(info)) {
+			rc = 0;
+			break;
+		}
+	}
+
+	/* verify received frame length and contents */
+	if (!rc && (info->tmp_rbuf_count != count ||
+		  memcmp(buf, info->tmp_rbuf, count))) {
+		rc = -ENODEV;
+	}
+
+	spin_lock_irqsave(&info->lock,flags);
+	reset_adapter(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	memcpy(&info->params, &params, sizeof(info->params));
+	info->tty = oldtty;
+
+	info->init_error = rc ? DiagStatus_DmaFailure : 0;
+	return rc;
+}
+
+static int adapter_test(struct slgt_info *info)
+{
+	DBGINFO(("testing %s\n", info->device_name));
+	if ((info->init_error = register_test(info)) < 0) {
+		printk("register test failure %s addr=%08X\n",
+			info->device_name, info->phys_reg_addr);
+	} else if ((info->init_error = irq_test(info)) < 0) {
+		printk("IRQ test failure %s IRQ=%d\n",
+			info->device_name, info->irq_level);
+	} else if ((info->init_error = loopback_test(info)) < 0) {
+		printk("loopback test failure %s\n", info->device_name);
+	}
+	return info->init_error;
+}
+
+/*
+ * transmit timeout handler
+ */
+static void tx_timeout(unsigned long context)
+{
+	struct slgt_info *info = (struct slgt_info*)context;
+	unsigned long flags;
+
+	DBGINFO(("%s tx_timeout\n", info->device_name));
+	if(info->tx_active && info->params.mode == MGSL_MODE_HDLC) {
+		info->icount.txtimeout++;
+	}
+	spin_lock_irqsave(&info->lock,flags);
+	info->tx_active = 0;
+	info->tx_count = 0;
+	spin_unlock_irqrestore(&info->lock,flags);
+
+#ifdef CONFIG_HDLC
+	if (info->netcount)
+		hdlcdev_tx_done(info);
+	else
+#endif
+		bh_transmit(info);
+}
+
+/*
+ * receive buffer polling timer
+ */
+static void rx_timeout(unsigned long context)
+{
+	struct slgt_info *info = (struct slgt_info*)context;
+	unsigned long flags;
+
+	DBGINFO(("%s rx_timeout\n", info->device_name));
+	spin_lock_irqsave(&info->lock, flags);
+	info->pending_bh |= BH_RECEIVE;
+	spin_unlock_irqrestore(&info->lock, flags);
+	bh_handler(info);
+}
+
diff --git a/include/linux/synclink.h b/include/linux/synclink.h
index 763bd290f28d..1b7cd8d1a71b 100644
--- a/include/linux/synclink.h
+++ b/include/linux/synclink.h
@@ -1,7 +1,7 @@
 /*
  * SyncLink Multiprotocol Serial Adapter Driver
  *
- * $Id: synclink.h,v 3.6 2002/02/20 21:58:20 paulkf Exp $
+ * $Id: synclink.h,v 3.10 2005/11/08 19:50:54 paulkf Exp $
  *
  * Copyright (C) 1998-2000 by Microgate Corporation
  *
@@ -128,10 +128,14 @@
 #define MGSL_BUS_TYPE_EISA	2
 #define MGSL_BUS_TYPE_PCI	5
 
+#define MGSL_INTERFACE_MASK     0xf
 #define MGSL_INTERFACE_DISABLE  0
 #define MGSL_INTERFACE_RS232    1
 #define MGSL_INTERFACE_V35      2
 #define MGSL_INTERFACE_RS422    3
+#define MGSL_INTERFACE_RTS_EN   0x10
+#define MGSL_INTERFACE_LL       0x20
+#define MGSL_INTERFACE_RL       0x40
 
 typedef struct _MGSL_PARAMS
 {
@@ -163,6 +167,9 @@ typedef struct _MGSL_PARAMS
 #define SYNCLINK_DEVICE_ID 0x0010
 #define MGSCC_DEVICE_ID 0x0020
 #define SYNCLINK_SCA_DEVICE_ID 0x0030
+#define SYNCLINK_GT_DEVICE_ID 0x0070
+#define SYNCLINK_GT4_DEVICE_ID 0x0080
+#define SYNCLINK_AC_DEVICE_ID  0x0090
 #define MGSL_MAX_SERIAL_NUMBER 30
 
 /*
-- 
cgit v1.2.3


From 9ded96f24c3a5fcbef954e88c443385a1af37eb9 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+lkml@arm.linux.org.uk>
Date: Sun, 8 Jan 2006 01:02:07 -0800
Subject: [PATCH] IRQ type flags

Some ARM platforms have the ability to program the interrupt controller to
detect various interrupt edges and/or levels.  For some platforms, this is
critical to setup correctly, particularly those which the setting is dependent
on the device.

Currently, ARM drivers do (eg) the following:

	err = request_irq(irq, ...);

	set_irq_type(irq, IRQT_RISING);

However, if the interrupt has previously been programmed to be level sensitive
(for whatever reason) then this will cause an interrupt storm.

Hence, if we combine set_irq_type() with request_irq(), we can then safely set
the type prior to unmasking the interrupt.  The unfortunate problem is that in
order to support this, these flags need to be visible outside of the ARM
architecture - drivers such as smc91x need these flags and they're
cross-architecture.

Finally, the SA_TRIGGER_* flag passed to request_irq() should reflect the
property that the device would like.  The IRQ controller code should do its
best to select the most appropriate supported mode.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/irq.c              | 14 ++++++++++++--
 arch/arm/mach-omap1/serial.c       |  3 +--
 arch/arm/mach-pxa/corgi.c          |  7 +++----
 arch/arm/mach-pxa/poodle.c         |  7 +++----
 arch/arm/mach-pxa/spitz.c          |  7 +++----
 arch/arm/mach-s3c2410/usb-simtec.c |  6 +++---
 drivers/i2c/chips/tps65010.c       | 11 ++++++-----
 drivers/input/keyboard/corgikbd.c  |  6 ++----
 drivers/input/keyboard/spitzkbd.c  | 27 ++++++++++++++-------------
 drivers/mfd/ucb1x00-core.c         |  5 ++---
 drivers/net/smc91x.c               |  5 +----
 drivers/net/smc91x.h               | 18 +++++++++---------
 include/asm-arm/irq.h              | 12 ++++++++----
 include/linux/signal.h             | 13 +++++++++++++
 14 files changed, 80 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 869c466e6258..b5645c4462cf 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -684,8 +684,12 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	spin_lock_irqsave(&irq_controller_lock, flags);
 	p = &desc->action;
 	if ((old = *p) != NULL) {
-		/* Can't share interrupts unless both agree to */
-		if (!(old->flags & new->flags & SA_SHIRQ)) {
+		/*
+		 * Can't share interrupts unless both agree to and are
+		 * the same type.
+		 */
+		if (!(old->flags & new->flags & SA_SHIRQ) ||
+		    (~old->flags & new->flags) & SA_TRIGGER_MASK) {
 			spin_unlock_irqrestore(&irq_controller_lock, flags);
 			return -EBUSY;
 		}
@@ -705,6 +709,12 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		desc->running = 0;
 		desc->pending = 0;
 		desc->disable_depth = 1;
+
+		if (new->flags & SA_TRIGGER_MASK) {
+			unsigned int type = new->flags & SA_TRIGGER_MASK;
+			desc->chip->set_type(irq, type);
+		}
+
 		if (!desc->noautoenable) {
 			desc->disable_depth = 0;
 			desc->chip->unmask(irq);
diff --git a/arch/arm/mach-omap1/serial.c b/arch/arm/mach-omap1/serial.c
index fcfb81d13cfe..7a68f098a025 100644
--- a/arch/arm/mach-omap1/serial.c
+++ b/arch/arm/mach-omap1/serial.c
@@ -252,9 +252,8 @@ static void __init omap_serial_set_port_wakeup(int gpio_nr)
 		return;
 	}
 	omap_set_gpio_direction(gpio_nr, 1);
-	set_irq_type(OMAP_GPIO_IRQ(gpio_nr), IRQT_RISING);
 	ret = request_irq(OMAP_GPIO_IRQ(gpio_nr), &omap_serial_wake_interrupt,
-			  0, "serial wakeup", NULL);
+			  SA_TRIGGER_RISING, "serial wakeup", NULL);
 	if (ret) {
 		omap_free_gpio(gpio_nr);
 		printk(KERN_ERR "No interrupt for UART wake GPIO: %i\n",
diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index 100fb31b5156..5a7b873f29b3 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -213,15 +213,14 @@ static int corgi_mci_init(struct device *dev, irqreturn_t (*corgi_detect_int)(in
 
 	corgi_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(CORGI_IRQ_GPIO_nSD_DETECT, corgi_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "corgi_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(CORGI_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c
index eef3de26ad37..663c95005985 100644
--- a/arch/arm/mach-pxa/poodle.c
+++ b/arch/arm/mach-pxa/poodle.c
@@ -146,15 +146,14 @@ static int poodle_mci_init(struct device *dev, irqreturn_t (*poodle_detect_int)(
 
 	poodle_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(POODLE_IRQ_GPIO_nSD_DETECT, poodle_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "poodle_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(POODLE_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index f2007db0cda5..a9eacc06555f 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -296,15 +296,14 @@ static int spitz_mci_init(struct device *dev, irqreturn_t (*spitz_detect_int)(in
 
 	spitz_mci_platform_data.detect_delay = msecs_to_jiffies(250);
 
-	err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int, SA_INTERRUPT,
-			     "MMC card detect", data);
+	err = request_irq(SPITZ_IRQ_GPIO_nSD_DETECT, spitz_detect_int,
+			  SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+			  "MMC card detect", data);
 	if (err) {
 		printk(KERN_ERR "spitz_mci_init: MMC/SD: can't request MMC card detect IRQ\n");
 		return -1;
 	}
 
-	set_irq_type(SPITZ_IRQ_GPIO_nSD_DETECT, IRQT_BOTHEDGE);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-s3c2410/usb-simtec.c b/arch/arm/mach-s3c2410/usb-simtec.c
index 5098b50158a3..495f8c6ffcb6 100644
--- a/arch/arm/mach-s3c2410/usb-simtec.c
+++ b/arch/arm/mach-s3c2410/usb-simtec.c
@@ -84,13 +84,13 @@ static void usb_simtec_enableoc(struct s3c2410_hcd_info *info, int on)
 	int ret;
 
 	if (on) {
-		ret = request_irq(IRQ_USBOC, usb_simtec_ocirq, SA_INTERRUPT,
+		ret = request_irq(IRQ_USBOC, usb_simtec_ocirq,
+				  SA_INTERRUPT | SA_TRIGGER_RISING |
+				   SA_TRIGGER_FALLING,
 				  "USB Over-current", info);
 		if (ret != 0) {
 			printk(KERN_ERR "failed to request usb oc irq\n");
 		}
-
-		set_irq_type(IRQ_USBOC, IRQT_BOTHEDGE);
 	} else {
 		free_irq(IRQ_USBOC, info);
 	}
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index e70b3db69edd..1af3dfbb8086 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -494,6 +494,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 {
 	struct tps65010		*tps;
 	int			status;
+	unsigned long		irqflags;
 
 	if (the_tps) {
 		dev_dbg(&bus->dev, "only one %s for now\n", DRIVER_NAME);
@@ -520,13 +521,14 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 	}
 
 #ifdef	CONFIG_ARM
+	irqflags = SA_SAMPLE_RANDOM | SA_TRIGGER_LOW;
 	if (machine_is_omap_h2()) {
 		tps->model = TPS65010;
 		omap_cfg_reg(W4_GPIO58);
 		tps->irq = OMAP_GPIO_IRQ(58);
 		omap_request_gpio(58);
 		omap_set_gpio_direction(58, 1);
-		set_irq_type(tps->irq, IRQT_FALLING);
+		irqflags |= SA_TRIGGER_FALLING;
 	}
 	if (machine_is_omap_osk()) {
 		tps->model = TPS65010;
@@ -534,7 +536,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 		tps->irq = OMAP_GPIO_IRQ(OMAP_MPUIO(1));
 		omap_request_gpio(OMAP_MPUIO(1));
 		omap_set_gpio_direction(OMAP_MPUIO(1), 1);
-		set_irq_type(tps->irq, IRQT_FALLING);
+		irqflags |= SA_TRIGGER_FALLING;
 	}
 	if (machine_is_omap_h3()) {
 		tps->model = TPS65013;
@@ -542,13 +544,12 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 		// FIXME set up this board's IRQ ...
 	}
 #else
-#define set_irq_type(num,trigger)	do{}while(0)
+	irqflags = SA_SAMPLE_RANDOM;
 #endif
 
 	if (tps->irq > 0) {
-		set_irq_type(tps->irq, IRQT_LOW);
 		status = request_irq(tps->irq, tps65010_irq,
-			SA_SAMPLE_RANDOM, DRIVER_NAME, tps);
+			irqflags, DRIVER_NAME, tps);
 		if (status < 0) {
 			dev_dbg(&tps->client.dev, "can't get IRQ %d, err %d\n",
 					tps->irq, status);
diff --git a/drivers/input/keyboard/corgikbd.c b/drivers/input/keyboard/corgikbd.c
index 64672d491222..e301ee4ca264 100644
--- a/drivers/input/keyboard/corgikbd.c
+++ b/drivers/input/keyboard/corgikbd.c
@@ -19,7 +19,6 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <asm/irq.h>
 
 #include <asm/arch/corgi.h>
 #include <asm/arch/hardware.h>
@@ -343,10 +342,9 @@ static int __init corgikbd_probe(struct platform_device *pdev)
 	for (i = 0; i < CORGI_KEY_SENSE_NUM; i++) {
 		pxa_gpio_mode(CORGI_GPIO_KEY_SENSE(i) | GPIO_IN);
 		if (request_irq(CORGI_IRQ_GPIO_KEY_SENSE(i), corgikbd_interrupt,
-						SA_INTERRUPT, "corgikbd", corgikbd))
+				SA_INTERRUPT | SA_TRIGGER_RISING,
+				"corgikbd", corgikbd))
 			printk(KERN_WARNING "corgikbd: Can't get IRQ: %d!\n", i);
-		else
-			set_irq_type(CORGI_IRQ_GPIO_KEY_SENSE(i),IRQT_RISING);
 	}
 
 	/* Set Strobe lines as outputs - set high */
diff --git a/drivers/input/keyboard/spitzkbd.c b/drivers/input/keyboard/spitzkbd.c
index 6a15fe3bc527..83999d583122 100644
--- a/drivers/input/keyboard/spitzkbd.c
+++ b/drivers/input/keyboard/spitzkbd.c
@@ -19,7 +19,6 @@
 #include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <asm/irq.h>
 
 #include <asm/arch/spitz.h>
 #include <asm/arch/hardware.h>
@@ -407,10 +406,9 @@ static int __init spitzkbd_probe(struct platform_device *dev)
 	for (i = 0; i < SPITZ_KEY_SENSE_NUM; i++) {
 		pxa_gpio_mode(spitz_senses[i] | GPIO_IN);
 		if (request_irq(IRQ_GPIO(spitz_senses[i]), spitzkbd_interrupt,
-						SA_INTERRUPT, "Spitzkbd Sense", spitzkbd))
+				SA_INTERRUPT|SA_TRIGGER_RISING,
+				"Spitzkbd Sense", spitzkbd))
 			printk(KERN_WARNING "spitzkbd: Can't get Sense IRQ: %d!\n", i);
-		else
-			set_irq_type(IRQ_GPIO(spitz_senses[i]),IRQT_RISING);
 	}
 
 	/* Set Strobe lines as outputs - set high */
@@ -422,15 +420,18 @@ static int __init spitzkbd_probe(struct platform_device *dev)
 	pxa_gpio_mode(SPITZ_GPIO_SWA | GPIO_IN);
 	pxa_gpio_mode(SPITZ_GPIO_SWB | GPIO_IN);
 
-	request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd Sync", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt, SA_INTERRUPT, "Spitzkbd PwrOn", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWA", spitzkbd);
-	request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr, SA_INTERRUPT, "Spitzkbd SWB", spitzkbd);
-
-	set_irq_type(SPITZ_IRQ_GPIO_SYNC, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_ON_KEY, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_SWA, IRQT_BOTHEDGE);
-	set_irq_type(SPITZ_IRQ_GPIO_SWB, IRQT_BOTHEDGE);
+	request_irq(SPITZ_IRQ_GPIO_SYNC, spitzkbd_interrupt,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd Sync", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_ON_KEY, spitzkbd_interrupt,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd PwrOn", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_SWA, spitzkbd_hinge_isr,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd SWA", spitzkbd);
+	request_irq(SPITZ_IRQ_GPIO_SWB, spitzkbd_hinge_isr,
+		    SA_INTERRUPT | SA_TRIGGER_RISING | SA_TRIGGER_FALLING,
+		    "Spitzkbd SWB", spitzkbd);
 
 	printk(KERN_INFO "input: Spitz Keyboard Registered\n");
 
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index e335d54c4659..b42e0fbab59b 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -27,7 +27,6 @@
 
 #include <asm/dma.h>
 #include <asm/hardware.h>
-#include <asm/irq.h>
 
 #include "ucb1x00.h"
 
@@ -507,14 +506,14 @@ static int ucb1x00_probe(struct mcp *mcp)
 		goto err_free;
 	}
 
-	ret = request_irq(ucb->irq, ucb1x00_irq, 0, "UCB1x00", ucb);
+	ret = request_irq(ucb->irq, ucb1x00_irq, SA_TRIGGER_RISING,
+			  "UCB1x00", ucb);
 	if (ret) {
 		printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n",
 			ucb->irq, ret);
 		goto err_free;
 	}
 
-	set_irq_type(ucb->irq, IRQT_RISING);
 	mcp_set_drvdata(mcp, ucb);
 
 	ret = class_device_register(&ucb->cdev);
diff --git a/drivers/net/smc91x.c b/drivers/net/smc91x.c
index 28bf2e69eb5e..7ec08127c9d6 100644
--- a/drivers/net/smc91x.c
+++ b/drivers/net/smc91x.c
@@ -88,7 +88,6 @@ static const char version[] =
 #include <linux/skbuff.h>
 
 #include <asm/io.h>
-#include <asm/irq.h>
 
 #include "smc91x.h"
 
@@ -2007,12 +2006,10 @@ static int __init smc_probe(struct net_device *dev, void __iomem *ioaddr)
 	}
 
 	/* Grab the IRQ */
-      	retval = request_irq(dev->irq, &smc_interrupt, 0, dev->name, dev);
+      	retval = request_irq(dev->irq, &smc_interrupt, SMC_IRQ_FLAGS, dev->name, dev);
       	if (retval)
       		goto err_out;
 
-	set_irq_type(dev->irq, SMC_IRQ_TRIGGER_TYPE);
-
 #ifdef SMC_USE_PXA_DMA
 	{
 		int dma = pxa_request_dma(dev->name, DMA_PRIO_LOW,
diff --git a/drivers/net/smc91x.h b/drivers/net/smc91x.h
index 5c2824be4ee6..e0efd1964e72 100644
--- a/drivers/net/smc91x.h
+++ b/drivers/net/smc91x.h
@@ -90,7 +90,7 @@
 			__l--;						\
 		}							\
 	} while (0)
-#define set_irq_type(irq, type)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif defined(CONFIG_SA1100_PLEB)
 /* We can only do 16-bit reads and writes in the static memory space. */
@@ -109,7 +109,7 @@
 #define SMC_outw(v, a, r)	writew(v, (a) + (r))
 #define SMC_outsw(a, r, p, l)	writesw((a) + (r), p, l)
 
-#define set_irq_type(irq, type) do {} while (0)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif defined(CONFIG_SA1100_ASSABET)
 
@@ -185,11 +185,11 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #include <asm/mach-types.h>
 #include <asm/arch/cpu.h>
 
-#define	SMC_IRQ_TRIGGER_TYPE (( \
+#define	SMC_IRQ_FLAGS (( \
 		   machine_is_omap_h2() \
 		|| machine_is_omap_h3() \
 		|| (machine_is_omap_innovator() && !cpu_is_omap1510()) \
-	) ? IRQT_FALLING : IRQT_RISING)
+	) ? SA_TRIGGER_FALLING : SA_TRIGGER_RISING)
 
 
 #elif	defined(CONFIG_SH_SH4202_MICRODEV)
@@ -209,7 +209,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #define SMC_insw(a, r, p, l)	insw((a) + (r) - 0xa0000000, p, l)
 #define SMC_outsw(a, r, p, l)	outsw((a) + (r) - 0xa0000000, p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_IRQ_FLAGS		(0)
 
 #elif	defined(CONFIG_ISA)
 
@@ -237,7 +237,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, int reg)
 #define SMC_insw(a, r, p, l)	insw(((u32)a) + (r), p, l)
 #define SMC_outsw(a, r, p, l)	outsw(((u32)a) + (r), p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_IRQ_FLAGS		(0)
 
 #define RPC_LSA_DEFAULT		RPC_LED_TX_RX
 #define RPC_LSB_DEFAULT		RPC_LED_100_10
@@ -319,7 +319,7 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l)
 			au_writew(*_p++ , _a); \
 	} while(0)
 
-#define set_irq_type(irq, type) do {} while (0)
+#define SMC_IRQ_FLAGS		(0)
 
 #else
 
@@ -342,8 +342,8 @@ static inline void SMC_outsw (unsigned long a, int r, unsigned char* p, int l)
 
 #endif
 
-#ifndef	SMC_IRQ_TRIGGER_TYPE
-#define	SMC_IRQ_TRIGGER_TYPE	IRQT_RISING
+#ifndef	SMC_IRQ_FLAGS
+#define	SMC_IRQ_FLAGS		SA_TRIGGER_RISING
 #endif
 
 #ifdef SMC_USE_PXA_DMA
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index 59975ee43cf1..7772432d3fd7 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -25,10 +25,14 @@ extern void disable_irq_nosync(unsigned int);
 extern void disable_irq(unsigned int);
 extern void enable_irq(unsigned int);
 
-#define __IRQT_FALEDGE	(1 << 0)
-#define __IRQT_RISEDGE	(1 << 1)
-#define __IRQT_LOWLVL	(1 << 2)
-#define __IRQT_HIGHLVL	(1 << 3)
+/*
+ * These correspond with the SA_TRIGGER_* defines, and therefore the
+ * IRQRESOURCE_IRQ_* defines.
+ */
+#define __IRQT_RISEDGE	(1 << 0)
+#define __IRQT_FALEDGE	(1 << 1)
+#define __IRQT_HIGHLVL	(1 << 2)
+#define __IRQT_LOWLVL	(1 << 3)
 
 #define IRQT_NOEDGE	(0)
 #define IRQT_RISING	(__IRQT_RISEDGE)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 5dd5f02c5c5f..ea9eff16c4b7 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -18,6 +18,19 @@
 #define SA_PROBE		SA_ONESHOT
 #define SA_SAMPLE_RANDOM	SA_RESTART
 #define SA_SHIRQ		0x04000000
+/*
+ * As above, these correspond to the IORESOURCE_IRQ_* defines in
+ * linux/ioport.h to select the interrupt line behaviour.  When
+ * requesting an interrupt without specifying a SA_TRIGGER, the
+ * setting should be assumed to be "as already configured", which
+ * may be as per machine or firmware initialisation.
+ */
+#define SA_TRIGGER_LOW		0x00000008
+#define SA_TRIGGER_HIGH		0x00000004
+#define SA_TRIGGER_FALLING	0x00000002
+#define SA_TRIGGER_RISING	0x00000001
+#define SA_TRIGGER_MASK	(SA_TRIGGER_HIGH|SA_TRIGGER_LOW|\
+				 SA_TRIGGER_RISING|SA_TRIGGER_FALLING)
 
 /*
  * Real Time signals may be queued.
-- 
cgit v1.2.3


From e5174baaea7585760f02eef23b225847d209a8db Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:11 -0800
Subject: [PATCH] fat: support ->direct_IO()

This patch add to support of ->direct_IO() for mostly read.

The user of this seems to want to use for streaming read.  So, current direct
I/O has limitation, it can only overwrite.  (For write operation, mainly we
need to handle the hole etc..)

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/cache.c           | 14 +++++++--
 fs/fat/dir.c             |  6 ++--
 fs/fat/inode.c           | 82 ++++++++++++++++++++++++++++++++++++++++++------
 include/linux/msdos_fs.h |  3 +-
 4 files changed, 89 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 77c24fcf712a..1acc941245fb 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 	return dclus;
 }
 
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+	     unsigned long *mapped_blocks)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
 	int cluster, offset;
 
 	*phys = 0;
+	*mapped_blocks = 0;
 	if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) {
-		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits))
+		if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
 			*phys = sector + sbi->dir_start;
+			*mapped_blocks = 1;
+		}
 		return 0;
 	}
 	last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
@@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys)
 	cluster = fat_bmap_cluster(inode, cluster);
 	if (cluster < 0)
 		return cluster;
-	else if (cluster)
+	else if (cluster) {
 		*phys = fat_clus_to_blknr(sbi, cluster) + offset;
+		*mapped_blocks = sbi->sec_per_clus - offset;
+		if (*mapped_blocks > last_block - sector)
+			*mapped_blocks = last_block - sector;
+	}
 	return 0;
 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4ce77475aed3..eef1b81aa294 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos,
 {
 	struct super_block *sb = dir->i_sb;
 	sector_t phys, iblock;
-	int offset;
-	int err;
+	unsigned long mapped_blocks;
+	int err, offset;
 
 next:
 	if (*bh)
@@ -77,7 +77,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 932c8d6d1f54..e7f4aa7fc686 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -23,6 +23,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
+#include <linux/uio.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -49,43 +50,77 @@ static int fat_add_cluster(struct inode *inode)
 	return err;
 }
 
-static int fat_get_block(struct inode *inode, sector_t iblock,
-			 struct buffer_head *bh_result, int create)
+static int __fat_get_blocks(struct inode *inode, sector_t iblock,
+			    unsigned long *max_blocks,
+			    struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	sector_t phys;
-	int err;
+	unsigned long mapped_blocks;
+	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
 	if (err)
 		return err;
 	if (phys) {
 		map_bh(bh_result, sb, phys);
+		*max_blocks = min(mapped_blocks, *max_blocks);
 		return 0;
 	}
 	if (!create)
 		return 0;
+
 	if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
 		fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
 			     MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
 		return -EIO;
 	}
-	if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) {
+
+	offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
+	if (!offset) {
+		/* TODO: multiple cluster allocation would be desirable. */
 		err = fat_add_cluster(inode);
 		if (err)
 			return err;
 	}
-	MSDOS_I(inode)->mmu_private += sb->s_blocksize;
-	err = fat_bmap(inode, iblock, &phys);
+	/* available blocks on this cluster */
+	mapped_blocks = sbi->sec_per_clus - offset;
+
+	*max_blocks = min(mapped_blocks, *max_blocks);
+	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
+
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
 	if (err)
 		return err;
-	if (!phys)
-		BUG();
+	BUG_ON(!phys);
+	BUG_ON(*max_blocks != mapped_blocks);
 	set_buffer_new(bh_result);
 	map_bh(bh_result, sb, phys);
 	return 0;
 }
 
+static int fat_get_blocks(struct inode *inode, sector_t iblock,
+			  unsigned long max_blocks,
+			  struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	int err;
+
+	err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
+	if (err)
+		return err;
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+	return 0;
+}
+
+static int fat_get_block(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create)
+{
+	unsigned long max_blocks = 1;
+	return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
+}
+
 static int fat_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, fat_get_block, wbc);
@@ -128,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page,
 	return err;
 }
 
+static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
+			     const struct iovec *iov,
+			     loff_t offset, unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	if (rw == WRITE) {
+		/*
+		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * so we need to update the ->mmu_private to block boundary.
+		 *
+		 * But we must fill the remaining area or hole by nul for
+		 * updating ->mmu_private.
+		 */
+		loff_t size = offset + iov_length(iov, nr_segs);
+		if (MSDOS_I(inode)->mmu_private < size)
+			return -EINVAL;
+	}
+
+	/*
+	 * FAT need to use the DIO_LOCKING for avoiding the race
+	 * condition of fat_get_block() and ->truncate().
+	 */
+	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs, fat_get_blocks, NULL);
+}
+
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping, block, fat_get_block);
@@ -141,6 +204,7 @@ static struct address_space_operations fat_aops = {
 	.sync_page	= block_sync_page,
 	.prepare_write	= fat_prepare_write,
 	.commit_write	= fat_commit_write,
+	.direct_IO	= fat_direct_IO,
 	.bmap		= _fat_bmap
 };
 
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 941da5c016a0..e933e2a355ad 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -329,7 +329,8 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
 extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
-extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
 
 /* fat/dir.c */
 extern struct file_operations fat_dir_operations;
-- 
cgit v1.2.3


From 268fc16e343b4f8e249468747db2e658da46a814 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:12 -0800
Subject: [PATCH] export/change sync_page_range/_nolock()

This exports/changes the sync_page_range/_nolock().  The fatfs needs
sync_page_range/_nolock() for expanding truncate, and changes "size_t count"
to "loff_t count".

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h | 4 +++-
 mm/filemap.c              | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b096159086e8..beaef5c7a0ea 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -103,7 +103,9 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count);
+			loff_t pos, loff_t count);
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ef24a397684..8fdf36508023 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
  * it is otherwise livelockable.
  */
 int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count)
+			loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range);
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
  */
-static int sync_page_range_nolock(struct inode *inode,
-				  struct address_space *mapping,
-				  loff_t pos, size_t count)
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, loff_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode,
 		ret = wait_on_page_writeback_range(mapping, start, end);
 	return ret;
 }
+EXPORT_SYMBOL(sync_page_range_nolock);
 
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
-- 
cgit v1.2.3


From 05eb0b51fb46430050d5873458612f53e0234f2e Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 8 Jan 2006 01:02:13 -0800
Subject: [PATCH] fat: support a truncate() for expanding size
 (generic_cont_expand)

This patch changes generic_cont_expand(), in order to share the code
with fatfs.

  - Use vmtruncate() if ->prepare_write() returns a error.

Even if ->prepare_write() returns an error, it may already have added some
blocks.  So, this truncates blocks outside of ->i_size by vmtruncate().

  - Add generic_cont_expand_simple().

The generic_cont_expand_simple() assumes that ->prepare_write() can handle
the block boundary.  With this, we don't need to care the extra byte.

And for expanding a file size by truncate(), fatfs uses the
added generic_cont_expand_simple().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 60 ++++++++++++++++++++++++++++++++++-----------
 fs/fat/file.c               | 31 ++++++++++++++++++++---
 include/linux/buffer_head.h |  3 ++-
 3 files changed, 76 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 5287be18633b..55023231e460 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2160,11 +2160,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
  * truncates.  Uses prepare/commit_write to allow the filesystem to
  * deal with the hole.  
  */
-int generic_cont_expand(struct inode *inode, loff_t size)
+static int __generic_cont_expand(struct inode *inode, loff_t size,
+				 pgoff_t index, unsigned int offset)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long index, offset, limit;
+	unsigned long limit;
 	int err;
 
 	err = -EFBIG;
@@ -2176,24 +2177,24 @@ int generic_cont_expand(struct inode *inode, loff_t size)
 	if (size > inode->i_sb->s_maxbytes)
 		goto out;
 
-	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
-
-	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
-	** skip the prepare.  make sure we never send an offset for the start
-	** of a block
-	*/
-	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-		offset++;
-	}
-	index = size >> PAGE_CACHE_SHIFT;
 	err = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
-	if (!err) {
-		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+	if (err) {
+		/*
+		 * ->prepare_write() may have instantiated a few blocks
+		 * outside i_size.  Trim these off again.
+		 */
+		unlock_page(page);
+		page_cache_release(page);
+		vmtruncate(inode, inode->i_size);
+		goto out;
 	}
+
+	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
+
 	unlock_page(page);
 	page_cache_release(page);
 	if (err > 0)
@@ -2202,6 +2203,36 @@ out:
 	return err;
 }
 
+int generic_cont_expand(struct inode *inode, loff_t size)
+{
+	pgoff_t index;
+	unsigned int offset;
+
+	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
+
+	/* ugh.  in prepare/commit_write, if from==to==start of block, we
+	** skip the prepare.  make sure we never send an offset for the start
+	** of a block
+	*/
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		/* caller must handle this extra byte. */
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	return __generic_cont_expand(inode, size, index, offset);
+}
+
+int generic_cont_expand_simple(struct inode *inode, loff_t size)
+{
+	loff_t pos = size - 1;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
+
+	/* prepare/commit_write can handle even if from==to==start of block. */
+	return __generic_cont_expand(inode, size, index, offset);
+}
+
 /*
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
@@ -3145,6 +3176,7 @@ EXPORT_SYMBOL(fsync_bdev);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_commit_write);
 EXPORT_SYMBOL(generic_cont_expand);
+EXPORT_SYMBOL(generic_cont_expand_simple);
 EXPORT_SYMBOL(init_buffer);
 EXPORT_SYMBOL(invalidate_bdev);
 EXPORT_SYMBOL(ll_rw_block);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 15229fe569c3..9b07c328a6fc 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
 #include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -124,6 +125,24 @@ struct file_operations fat_file_operations = {
 	.sendfile	= generic_file_sendfile,
 };
 
+static int fat_cont_expand(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t start = inode->i_size, count = size - inode->i_size;
+	int err;
+
+	err = generic_cont_expand_simple(inode, size);
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+	if (IS_SYNC(inode))
+		err = sync_page_range_nolock(inode, mapping, start, count);
+out:
+	return err;
+}
+
 int fat_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
@@ -132,11 +151,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	lock_kernel();
 
-	/* FAT cannot truncate to a longer file */
+	/*
+	 * Expand the file. Since inode_setattr() updates ->i_size
+	 * before calling the ->truncate(), but FAT needs to fill the
+	 * hole before it.
+	 */
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (attr->ia_size > inode->i_size) {
-			error = -EPERM;
-			goto out;
+			error = fat_cont_expand(inode, attr->ia_size);
+			if (error || attr->ia_valid == ATTR_SIZE)
+				goto out;
+			attr->ia_valid &= ~ATTR_SIZE;
 		}
 	}
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1db061bb6b08..9f159baf153f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -197,7 +197,8 @@ int block_read_full_page(struct page*, get_block_t*);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
-int generic_cont_expand(struct inode *inode, loff_t size) ;
+int generic_cont_expand(struct inode *inode, loff_t size);
+int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
 int block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
-- 
cgit v1.2.3


From 2a10e0b28b196051ae71829e5b989cba00513289 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 8 Jan 2006 01:02:15 -0800
Subject: [PATCH] move rtc_interrupt() prototype to rtc.h

This patch moves the rtc_interrupt() prototype to rtc.h and removes the
prototypes from C files.

It also renames static rtc_interrupt() functions in
arch/arm/mach-integrator/time.c and arch/sh64/kernel/time.c to avoid compile
problems.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Paul Gortmaker <p_gortmaker@yahoo.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mach-integrator/time.c | 5 +++--
 arch/i386/kernel/time_hpet.c    | 2 --
 arch/sh64/kernel/time.c         | 7 ++++---
 arch/x86_64/kernel/time.c       | 2 --
 include/linux/rtc.h             | 3 +++
 5 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-integrator/time.c b/arch/arm/mach-integrator/time.c
index 9f46aaef8968..3c22c16b38bf 100644
--- a/arch/arm/mach-integrator/time.c
+++ b/arch/arm/mach-integrator/time.c
@@ -96,7 +96,8 @@ static struct rtc_ops rtc_ops = {
 	.set_alarm	= rtc_set_alarm,
 };
 
-static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t arm_rtc_interrupt(int irq, void *dev_id,
+				     struct pt_regs *regs)
 {
 	writel(0, rtc_base + RTC_EOI);
 	return IRQ_HANDLED;
@@ -124,7 +125,7 @@ static int rtc_probe(struct amba_device *dev, void *id)
 
 	xtime.tv_sec = __raw_readl(rtc_base + RTC_DR);
 
-	ret = request_irq(dev->irq[0], rtc_interrupt, SA_INTERRUPT,
+	ret = request_irq(dev->irq[0], arm_rtc_interrupt, SA_INTERRUPT,
 			  "rtc-pl030", dev);
 	if (ret)
 		goto map_out;
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 9caeaa315cd7..a529f0cdce17 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -259,8 +259,6 @@ __setup("hpet=", hpet_setup);
 #include <linux/mc146818rtc.h>
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ 	64
 #define RTC_NUM_INTS 		1
 
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 870fe5327e09..1195af37ee5a 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -417,7 +417,7 @@ static __init unsigned int get_cpu_hz(void)
 	/*
 	** Regardless the toolchain, force the compiler to use the
 	** arbitrary register r3 as a clock tick counter.
-	** NOTE: r3 must be in accordance with rtc_interrupt()
+	** NOTE: r3 must be in accordance with sh64_rtc_interrupt()
 	*/
 	register unsigned long long  __rtc_irq_flag __asm__ ("r3");
 
@@ -482,7 +482,8 @@ static __init unsigned int get_cpu_hz(void)
 #endif
 }
 
-static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static irqreturn_t sh64_rtc_interrupt(int irq, void *dev_id,
+				      struct pt_regs *regs)
 {
 	ctrl_outb(0, RCR1);	/* Disable Carry Interrupts */
 	regs->regs[3] = 1;	/* Using r3 */
@@ -491,7 +492,7 @@ static irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 
 static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL};
-static struct irqaction irq1  = { rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL};
+static struct irqaction irq1  = { sh64_rtc_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "rtc", NULL, NULL};
 
 void __init time_init(void)
 {
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 74102796e5c0..43c9fa0f8d5f 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1075,8 +1075,6 @@ device_initcall(time_init_device);
  */
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ 	64
 #define RTC_NUM_INTS 		1
 
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index e1aaf1fac8e0..0b2ba67ff13c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -11,6 +11,8 @@
 #ifndef _LINUX_RTC_H_
 #define _LINUX_RTC_H_
 
+#include <linux/interrupt.h>
+
 /*
  * The struct used to pass data via the following ioctl. Similar to the
  * struct tm in <time.h>, but it needs to be here so that the kernel 
@@ -102,6 +104,7 @@ int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
 void rtc_get_rtc_time(struct rtc_time *rtc_tm);
+irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From 095975da26dba21698582e91e96be10f7417333f Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sun, 8 Jan 2006 01:02:19 -0800
Subject: [PATCH] rcu file: use atomic primitives

Use atomic_inc_not_zero for rcu files instead of special case rcuref.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/RCU/rcuref.txt |  87 ++++++++---------
 fs/aio.c                     |   3 +-
 fs/file_table.c              |   8 +-
 include/linux/fs.h           |   3 +-
 include/linux/radix-tree.h   |   1 +
 include/linux/rcuref.h       | 220 -------------------------------------------
 kernel/rcupdate.c            |  14 ---
 kernel/rcutorture.c          |   1 -
 security/selinux/hooks.c     |   2 +-
 9 files changed, 48 insertions(+), 291 deletions(-)
 delete mode 100644 include/linux/rcuref.h

(limited to 'include/linux')

diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index a23fee66064d..3f60db41b2f0 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -1,74 +1,67 @@
-Refcounter framework for elements of lists/arrays protected by
-RCU.
+Refcounter design for elements of lists/arrays protected by RCU.
 
 Refcounting on elements of  lists which are protected by traditional
 reader/writer spinlocks or semaphores are straight forward as in:
 
-1.					2.
-add()					search_and_reference()
-{					{
-	alloc_object				read_lock(&list_lock);
-	...					search_for_element
-	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
-	write_lock(&list_lock);			...
-	add_element				read_unlock(&list_lock);
-	...					...
-	write_unlock(&list_lock);	}
+1.				2.
+add()				search_and_reference()
+{				{
+    alloc_object		    read_lock(&list_lock);
+    ...				    search_for_element
+    atomic_set(&el->rc, 1);	    atomic_inc(&el->rc);
+    write_lock(&list_lock);	     ...
+    add_element			    read_unlock(&list_lock);
+    ...				    ...
+    write_unlock(&list_lock);	}
 }
 
 3.					4.
 release_referenced()			delete()
 {					{
-	...				write_lock(&list_lock);
-	atomic_dec(&el->rc, relfunc)	...
-	...				delete_element
-}					write_unlock(&list_lock);
- 					...
-					if (atomic_dec_and_test(&el->rc))
-						kfree(el);
-					...
+    ...					    write_lock(&list_lock);
+    atomic_dec(&el->rc, relfunc)	    ...
+    ...					    delete_element
+}					    write_unlock(&list_lock);
+ 					    ...
+					    if (atomic_dec_and_test(&el->rc))
+					        kfree(el);
+					    ...
 					}
 
 If this list/array is made lock free using rcu as in changing the
 write_lock in add() and delete() to spin_lock and changing read_lock
-in search_and_reference to rcu_read_lock(), the rcuref_get in
+in search_and_reference to rcu_read_lock(), the atomic_get in
 search_and_reference could potentially hold reference to an element which
-has already been deleted from the list/array.  rcuref_lf_get_rcu takes
+has already been deleted from the list/array.  atomic_inc_not_zero takes
 care of this scenario. search_and_reference should look as;
 
 1.					2.
 add()					search_and_reference()
 {					{
- 	alloc_object				rcu_read_lock();
-	...					search_for_element
-	atomic_set(&el->rc, 1);			if (rcuref_inc_lf(&el->rc)) {
-	write_lock(&list_lock);				rcu_read_unlock();
-							return FAIL;
-	add_element				}
-	...					...
-	write_unlock(&list_lock);		rcu_read_unlock();
+    alloc_object			    rcu_read_lock();
+    ...					    search_for_element
+    atomic_set(&el->rc, 1);		    if (atomic_inc_not_zero(&el->rc)) {
+    write_lock(&list_lock);		        rcu_read_unlock();
+					        return FAIL;
+    add_element				    }
+    ...					    ...
+    write_unlock(&list_lock);		    rcu_read_unlock();
 }					}
 3.					4.
 release_referenced()			delete()
 {					{
-	...				write_lock(&list_lock);
-	rcuref_dec(&el->rc, relfunc)	...
-	...				delete_element
-}					write_unlock(&list_lock);
- 					...
-					if (rcuref_dec_and_test(&el->rc))
-						call_rcu(&el->head, el_free);
-					...
+    ...					    write_lock(&list_lock);
+    atomic_dec(&el->rc, relfunc)	    ...
+    ...					    delete_element
+}					    write_unlock(&list_lock);
+ 					    ...
+					    if (atomic_dec_and_test(&el->rc))
+					        call_rcu(&el->head, el_free);
+					    ...
 					}
 
 Sometimes, reference to the element need to be obtained in the
-update (write) stream.  In such cases, rcuref_inc_lf might be an overkill
-since the spinlock serialising list updates are held. rcuref_inc
+update (write) stream.  In such cases, atomic_inc_not_zero might be an
+overkill since the spinlock serialising list updates are held. atomic_inc
 is to be used in such cases.
-For arches which do not have cmpxchg rcuref_inc_lf
-api uses a hashed spinlock implementation and the same hashed spinlock
-is acquired in all rcuref_xxx primitives to preserve atomicity.
-Note: Use rcuref_inc api only if you need to use rcuref_inc_lf on the
-refcounter atleast at one place.  Mixing rcuref_inc and atomic_xxx api
-might lead to races. rcuref_inc_lf() must be used in lockfree
-RCU critical sections only.
+
diff --git a/fs/aio.c b/fs/aio.c
index 5a28b69ad223..aec2b1916d1b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,7 +29,6 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
-#include <linux/rcuref.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -514,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	/* Must be done under the lock to serialise against cancellation.
 	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
diff --git a/fs/file_table.c b/fs/file_table.c
index c3a5e2fd663b..6142250104a6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(get_empty_filp);
 
 void fastcall fput(struct file *file)
 {
-	if (rcuref_dec_and_test(&file->f_count))
+	if (atomic_dec_and_test(&file->f_count))
 		__fput(file);
 }
 
@@ -166,7 +166,7 @@ struct file fastcall *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!rcuref_inc_lf(&file->f_count)) {
+		if (!atomic_inc_not_zero(&file->f_count)) {
 			/* File object ref couldn't be taken */
 			rcu_read_unlock();
 			return NULL;
@@ -198,7 +198,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
-			if (rcuref_inc_lf(&file->f_count))
+			if (atomic_inc_not_zero(&file->f_count))
 				*fput_needed = 1;
 			else
 				/* Didn't get the reference, someone's freed */
@@ -213,7 +213,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed)
 
 void put_filp(struct file *file)
 {
-	if (rcuref_dec_and_test(&file->f_count)) {
+	if (atomic_dec_and_test(&file->f_count)) {
 		security_file_free(file);
 		file_kill(file);
 		file_free(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c9c48d65630..ef29500b5df8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -9,7 +9,6 @@
 #include <linux/config.h>
 #include <linux/limits.h>
 #include <linux/ioctl.h>
-#include <linux/rcuref.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -653,7 +652,7 @@ extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
 #define file_list_unlock() spin_unlock(&files_lock);
 
-#define get_file(x)	rcuref_inc(&(x)->f_count)
+#define get_file(x)	atomic_inc(&(x)->f_count)
 #define file_count(x)	atomic_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 36e5d269612f..c57ff2fcb30a 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -19,6 +19,7 @@
 #ifndef _LINUX_RADIX_TREE_H
 #define _LINUX_RADIX_TREE_H
 
+#include <linux/sched.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
deleted file mode 100644
index e1adbba14b67..000000000000
--- a/include/linux/rcuref.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * rcuref.h
- *
- * Reference counting for elements of lists/arrays protected by
- * RCU.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2005
- *
- * Author: Dipankar Sarma <dipankar@in.ibm.com>
- *	   Ravikiran Thirumalai <kiran_th@gmail.com>
- *
- * See Documentation/RCU/rcuref.txt for detailed user guide.
- *
- */
-
-#ifndef _RCUREF_H_
-#define _RCUREF_H_
-
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-/*
- * These APIs work on traditional atomic_t counters used in the
- * kernel for reference counting. Under special circumstances
- * where a lock-free get() operation races with a put() operation
- * these APIs can be used. See Documentation/RCU/rcuref.txt.
- */
-
-#ifdef __HAVE_ARCH_CMPXCHG
-
-/**
- * rcuref_inc - increment refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline void rcuref_inc(atomic_t *rcuref)
-{
-	atomic_inc(rcuref);
-}
-
-/**
- * rcuref_dec - decrement refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline void rcuref_dec(atomic_t *rcuref)
-{
-	atomic_dec(rcuref);
-}
-
-/**
- * rcuref_dec_and_test - decrement refcount for object and test
- * @rcuref: reference counter in the object.
- * @release: pointer to the function that will clean up the object
- *	     when the last reference to the object is released.
- *	     This pointer is required.
- *
- * Decrement the refcount, and if 0, return 1. Else return 0.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference
- * in a lock-free reader-side critical section.
- */
-static inline int rcuref_dec_and_test(atomic_t *rcuref)
-{
-	return atomic_dec_and_test(rcuref);
-}
-
-/*
- * cmpxchg is needed on UP too, if deletions to the list/array can happen
- * in interrupt context.
- */
-
-/**
- * rcuref_inc_lf - Take reference to an object in a read-side
- * critical section protected by RCU.
- * @rcuref: reference counter in the object in question.
- *
- * Try and increment the refcount by 1.  The increment might fail if
- * the reference counter has been through a 1 to 0 transition and
- * is no longer part of the lock-free list.
- * Returns non-zero on successful increment and zero otherwise.
- */
-static inline int rcuref_inc_lf(atomic_t *rcuref)
-{
-	int c, old;
-	c = atomic_read(rcuref);
-	while (c && (old = cmpxchg(&rcuref->counter, c, c + 1)) != c)
-		c = old;
-	return c;
-}
-
-#else				/* !__HAVE_ARCH_CMPXCHG */
-
-extern spinlock_t __rcuref_hash[];
-
-/*
- * Use a hash table of locks to protect the reference count
- * since cmpxchg is not available in this arch.
- */
-#ifdef	CONFIG_SMP
-#define RCUREF_HASH_SIZE	4
-#define RCUREF_HASH(k) \
-	(&__rcuref_hash[(((unsigned long)k)>>8) & (RCUREF_HASH_SIZE-1)])
-#else
-#define	RCUREF_HASH_SIZE	1
-#define RCUREF_HASH(k) 	&__rcuref_hash[0]
-#endif				/* CONFIG_SMP */
-
-/**
- * rcuref_inc - increment refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline void rcuref_inc(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter += 1;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-}
-
-/**
- * rcuref_dec - decrement refcount for object.
- * @rcuref: reference counter in the object in question.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline void rcuref_dec(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter -= 1;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-}
-
-/**
- * rcuref_dec_and_test - decrement refcount for object and test
- * @rcuref: reference counter in the object.
- * @release: pointer to the function that will clean up the object
- *	     when the last reference to the object is released.
- *	     This pointer is required.
- *
- * Decrement the refcount, and if 0, return 1. Else return 0.
- *
- * This should be used only for objects where we use RCU and
- * use the rcuref_inc_lf() api to acquire a reference in a lock-free
- * reader-side critical section.
- */
-static inline int rcuref_dec_and_test(atomic_t *rcuref)
-{
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	rcuref->counter--;
-	if (!rcuref->counter) {
-		spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-		return 1;
-	} else {
-		spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-		return 0;
-	}
-}
-
-/**
- * rcuref_inc_lf - Take reference to an object of a lock-free collection
- * by traversing a lock-free list/array.
- * @rcuref: reference counter in the object in question.
- *
- * Try and increment the refcount by 1.  The increment might fail if
- * the reference counter has been through a 1 to 0 transition and
- * object is no longer part of the lock-free list.
- * Returns non-zero on successful increment and zero otherwise.
- */
-static inline int rcuref_inc_lf(atomic_t *rcuref)
-{
-	int ret;
-	unsigned long flags;
-	spin_lock_irqsave(RCUREF_HASH(rcuref), flags);
-	if (rcuref->counter)
-		ret = rcuref->counter++;
-	else
-		ret = 0;
-	spin_unlock_irqrestore(RCUREF_HASH(rcuref), flags);
-	return ret;
-}
-
-
-#endif /* !__HAVE_ARCH_CMPXCHG */
-
-#endif /* __KERNEL__ */
-#endif /* _RCUREF_H_ */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0a669bd2f6d1..30b0bba03859 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,7 +46,6 @@
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
-#include <linux/rcuref.h>
 #include <linux/cpu.h>
 
 /* Definition for rcupdate control block. */
@@ -74,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
 static int maxbatch = 10000;
 
-#ifndef __HAVE_ARCH_CMPXCHG
-/*
- * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
- * 32 bit atomic_t implementations, and a hash function similar to that
- * for our refcounting needs.
- * Can't help multiprocessors which donot have cmpxchg :(
- */
-
-spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
-	[0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
-};
-#endif
-
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 36efe088ad81..75174c81529a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,7 +39,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcuref.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3d496eae1b47..6647204e4636 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1663,7 +1663,7 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 						continue;
 					}
 					if (devnull) {
-						rcuref_inc(&devnull->f_count);
+						get_file(devnull);
 					} else {
 						devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR);
 						if (!devnull) {
-- 
cgit v1.2.3


From b3f3d6141f8636f627bf19fd44eaf59a52637ac8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sun, 8 Jan 2006 01:02:20 -0800
Subject: [PATCH] ELF: symbol table type additions

Needed for the Novell kernel debugger and perhaps some per-cpu data on x86_64
in the future.

Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/elf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/elf.h b/include/linux/elf.h
index ff955dbf510d..d3bfacb24496 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -151,6 +151,8 @@ typedef __s64	Elf64_Sxword;
 #define STT_FUNC    2
 #define STT_SECTION 3
 #define STT_FILE    4
+#define STT_COMMON  5
+#define STT_TLS     6
 
 #define ELF_ST_BIND(x)		((x) >> 4)
 #define ELF_ST_TYPE(x)		(((unsigned int) x) & 0xf)
-- 
cgit v1.2.3


From 907f2c77d1653ce235e8e1fd6ce5c46005814e78 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:24 -0800
Subject: [PATCH] relayfs: export relayfs_create_file() with fileops param

This patch adds a mandatory fileops param to relayfs_create_file() and exports
that function so that clients can use it to create files defined by their own
set of file operations, in relayfs.  The purpose is to allow relayfs
applications to create their own set of 'control' files alongside their relay
files in relayfs rather than having to create them in /proc or debugfs for
instance.  relayfs_create_file() is also used by relay_open_buf() to create
the relay files for a channel.  In this case, a pointer to
relayfs_file_operations is passed in, along with a pointer to the buffer
associated with the file.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 41 ++++++++++++++++++++++++++---------------
 fs/relayfs/relay.c         |  3 ++-
 fs/relayfs/relay.h         |  4 ----
 include/linux/relayfs_fs.h |  7 ++++++-
 4 files changed, 34 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 379e07cd2b34..a5e6d4f2efb9 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -33,7 +33,9 @@ static struct backing_dev_info		relayfs_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 };
 
-static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
+static struct inode *relayfs_get_inode(struct super_block *sb,
+				       int mode,
+ 				       struct file_operations *fops,
 				       void *data)
 {
 	struct inode *inode;
@@ -51,8 +53,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	switch (mode & S_IFMT) {
 	case S_IFREG:
-		inode->i_fop = &relayfs_file_operations;
-		RELAYFS_I(inode)->buf = data;
+		inode->i_fop = fops;
+		RELAYFS_I(inode)->data = data;
 		break;
 	case S_IFDIR:
 		inode->i_op = &simple_dir_inode_operations;
@@ -73,6 +75,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
  *	@name: the name of the file to create
  *	@parent: parent directory
  *	@mode: mode
+ *	@fops: file operations to use for the file
  *	@data: user-associated data for this file
  *
  *	Returns the new dentry, NULL on failure
@@ -82,6 +85,7 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
 static struct dentry *relayfs_create_entry(const char *name,
 					   struct dentry *parent,
 					   int mode,
+					   struct file_operations *fops,
 					   void *data)
 {
 	struct dentry *d;
@@ -117,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name,
 		goto release_mount;
 	}
 
-	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, data);
+	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
 	if (!inode) {
 		d = NULL;
 		goto release_mount;
@@ -145,20 +149,26 @@ exit:
  *	@name: the name of the file to create
  *	@parent: parent directory
  *	@mode: mode, if not specied the default perms are used
+ *	@fops: file operations to use for the file
  *	@data: user-associated data for this file
  *
  *	Returns file dentry if successful, NULL otherwise.
  *
  *	The file will be created user r on behalf of current user.
  */
-struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
-				   int mode, void *data)
+struct dentry *relayfs_create_file(const char *name,
+				   struct dentry *parent,
+				   int mode,
+				   struct file_operations *fops,
+				   void *data)
 {
+	BUG_ON(!fops);
+
 	if (!mode)
 		mode = S_IRUSR;
 	mode = (mode & S_IALLUGO) | S_IFREG;
 
-	return relayfs_create_entry(name, parent, mode, data);
+	return relayfs_create_entry(name, parent, mode, fops, data);
 }
 
 /**
@@ -173,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
 struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
 {
 	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	return relayfs_create_entry(name, parent, mode, NULL);
+	return relayfs_create_entry(name, parent, mode, NULL, NULL);
 }
 
 /**
@@ -234,7 +244,7 @@ int relayfs_remove_dir(struct dentry *dentry)
  */
 static int relayfs_open(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	kref_get(&buf->kref);
 
 	return 0;
@@ -250,7 +260,7 @@ static int relayfs_open(struct inode *inode, struct file *filp)
 static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	return relay_mmap_buf(RELAYFS_I(inode)->buf, vma);
+	return relay_mmap_buf(RELAYFS_I(inode)->data, vma);
 }
 
 /**
@@ -264,7 +274,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
 {
 	unsigned int mask = 0;
 	struct inode *inode = filp->f_dentry->d_inode;
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 
 	if (buf->finalized)
 		return POLLERR;
@@ -288,7 +298,7 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
  */
 static int relayfs_release(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	kref_put(&buf->kref, relay_remove_buf);
 
 	return 0;
@@ -450,7 +460,7 @@ static ssize_t relayfs_read(struct file *filp,
 			    loff_t *ppos)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	struct rchan_buf *buf = RELAYFS_I(inode)->data;
 	size_t read_start, avail;
 	ssize_t ret = 0;
 	void *from;
@@ -485,7 +495,7 @@ static struct inode *relayfs_alloc_inode(struct super_block *sb)
 	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
 	if (!p)
 		return NULL;
-	p->buf = NULL;
+	p->data = NULL;
 
 	return &p->vfs_inode;
 }
@@ -531,7 +541,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = RELAYFS_MAGIC;
 	sb->s_op = &relayfs_ops;
-	inode = relayfs_get_inode(sb, mode, NULL);
+	inode = relayfs_get_inode(sb, mode, NULL, NULL);
 
 	if (!inode)
 		return -ENOMEM;
@@ -589,6 +599,7 @@ module_exit(exit_relayfs_fs)
 EXPORT_SYMBOL_GPL(relayfs_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
+EXPORT_SYMBOL_GPL(relayfs_create_file);
 
 MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
 MODULE_DESCRIPTION("Relay Filesystem");
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index 7fbda177ad8f..a9cd5585c45c 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -176,7 +176,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan,
  		return NULL;
 
 	/* Create file in fs */
-	dentry = relayfs_create_file(filename, parent, S_IRUSR, buf);
+	dentry = relayfs_create_file(filename, parent, S_IRUSR,
+				     &relayfs_file_operations, buf);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
index c325bb243549..0993d3e5753b 100644
--- a/fs/relayfs/relay.h
+++ b/fs/relayfs/relay.h
@@ -1,10 +1,6 @@
 #ifndef _RELAY_H
 #define _RELAY_H
 
-struct dentry *relayfs_create_file(const char *name,
-				   struct dentry *parent,
-				   int mode,
-				   void *data);
 extern int relayfs_remove(struct dentry *dentry);
 extern int relay_buf_empty(struct rchan_buf *buf);
 extern void relay_destroy_channel(struct kref *kref);
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index fb7e80737325..a122df2d9880 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -70,7 +70,7 @@ struct rchan
 struct relayfs_inode_info
 {
 	struct inode vfs_inode;
-	struct rchan_buf *buf;
+	void *data;
 };
 
 static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode)
@@ -148,6 +148,11 @@ extern size_t relay_switch_subbuf(struct rchan_buf *buf,
 extern struct dentry *relayfs_create_dir(const char *name,
 					 struct dentry *parent);
 extern int relayfs_remove_dir(struct dentry *dentry);
+extern struct dentry *relayfs_create_file(const char *name,
+					  struct dentry *parent,
+					  int mode,
+					  struct file_operations *fops,
+					  void *data);
 
 /**
  *	relay_write - write data into the channel
-- 
cgit v1.2.3


From 7431733791feb0b19453d8047b0723c744667040 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:25 -0800
Subject: [PATCH] relayfs: add relayfs_remove_file()

This patch adds and exports relayfs_remove_file(), for API symmetry (with
relayfs_create_file()).

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 12 ++++++++++++
 include/linux/relayfs_fs.h |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index a5e6d4f2efb9..b2f50655736b 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -224,6 +224,17 @@ int relayfs_remove(struct dentry *dentry)
 	return error;
 }
 
+/**
+ *	relayfs_remove_file - remove a file from relay filesystem
+ *	@dentry: directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove_file(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
 /**
  *	relayfs_remove_dir - remove a directory in the relay filesystem
  *	@dentry: directory dentry
@@ -600,6 +611,7 @@ EXPORT_SYMBOL_GPL(relayfs_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
 EXPORT_SYMBOL_GPL(relayfs_create_file);
+EXPORT_SYMBOL_GPL(relayfs_remove_file);
 
 MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
 MODULE_DESCRIPTION("Relay Filesystem");
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index a122df2d9880..921540b1cdf8 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -153,6 +153,7 @@ extern struct dentry *relayfs_create_file(const char *name,
 					  int mode,
 					  struct file_operations *fops,
 					  void *data);
+extern int relayfs_remove_file(struct dentry *dentry);
 
 /**
  *	relay_write - write data into the channel
-- 
cgit v1.2.3


From aaea25d7a68a7f72e167dc1698b66a15edc71883 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:26 -0800
Subject: [PATCH] relayfs: remove unused alloc/destroy_inode()

Since we're no longer using relayfs_inode_info, remove relayfs_alloc_inode()
and relayfs_destroy_inode() along with the relayfs inode cache.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 46 +---------------------------------------------
 include/linux/relayfs_fs.h | 14 --------------
 2 files changed, 1 insertion(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index 7f6d2c8e91c2..b4c3e0466e98 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -26,7 +26,6 @@
 
 static struct vfsmount *		relayfs_mount;
 static int				relayfs_mount_count;
-static kmem_cache_t *			relayfs_inode_cachep;
 
 static struct backing_dev_info		relayfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
@@ -499,34 +498,6 @@ out:
 	return ret;
 }
 
-/**
- *	relayfs alloc_inode() implementation
- */
-static struct inode *relayfs_alloc_inode(struct super_block *sb)
-{
-	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
-	if (!p)
-		return NULL;
-	p->data = NULL;
-
-	return &p->vfs_inode;
-}
-
-/**
- *	relayfs destroy_inode() implementation
- */
-static void relayfs_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode));
-}
-
-static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct relayfs_inode_info *i = p;
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&i->vfs_inode);
-}
-
 struct file_operations relayfs_file_operations = {
 	.open		= relayfs_open,
 	.poll		= relayfs_poll,
@@ -539,8 +510,6 @@ struct file_operations relayfs_file_operations = {
 static struct super_operations relayfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
-	.alloc_inode	= relayfs_alloc_inode,
-	.destroy_inode	= relayfs_destroy_inode,
 };
 
 static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
@@ -584,25 +553,12 @@ static struct file_system_type relayfs_fs_type = {
 
 static int __init init_relayfs_fs(void)
 {
-	int err;
-
-	relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
-				sizeof(struct relayfs_inode_info), 0,
-				0, init_once, NULL);
-	if (!relayfs_inode_cachep)
-		return -ENOMEM;
-
-	err = register_filesystem(&relayfs_fs_type);
-	if (err)
-		kmem_cache_destroy(relayfs_inode_cachep);
-
-	return err;
+	return register_filesystem(&relayfs_fs_type);
 }
 
 static void __exit exit_relayfs_fs(void)
 {
 	unregister_filesystem(&relayfs_fs_type);
-	kmem_cache_destroy(relayfs_inode_cachep);
 }
 
 module_init(init_relayfs_fs)
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 921540b1cdf8..8200ecbe6e0f 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -64,20 +64,6 @@ struct rchan
 	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
 };
 
-/*
- * Relayfs inode
- */
-struct relayfs_inode_info
-{
-	struct inode vfs_inode;
-	void *data;
-};
-
-static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode)
-{
-	return container_of(inode, struct relayfs_inode_info, vfs_inode);
-}
-
 /*
  * Relay channel client callbacks
  */
-- 
cgit v1.2.3


From 08c541a7ade230883c48225f4ea406a0117e7c2f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:28 -0800
Subject: [PATCH] relayfs: add support for relay files in other filesystems

This patch adds a couple of callback functions that allow a client to hook
into relay_open()/close() and supply the files that will be used to represent
the channel buffers; the default implementation if no callbacks are defined is
to create the files in relayfs.  This is to support the creation and use of
relay files in other filesystems such as debugfs, as implied by the fact that
relayfs_file_operations are exported.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/buffers.c       |  2 +-
 fs/relayfs/relay.c         | 30 ++++++++++++++++++++++++++++--
 include/linux/relayfs_fs.h | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
index 667b529944c5..10187812771e 100644
--- a/fs/relayfs/buffers.c
+++ b/fs/relayfs/buffers.c
@@ -185,6 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf)
 void relay_remove_buf(struct kref *kref)
 {
 	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	relayfs_remove(buf->dentry);
+	buf->chan->cb->remove_buf_file(buf->dentry);
 	relay_destroy_buf(buf);
 }
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index a9cd5585c45c..b9bb56903272 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -80,11 +80,33 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
 {
 }
 
+/*
+ * create_buf_file_create() default callback.  Creates file to represent buf.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf)
+{
+	return relayfs_create_file(filename, parent, mode,
+				   &relayfs_file_operations, buf);
+}
+
+/*
+ * remove_buf_file() default callback.  Removes file representing relay buffer.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
 /* relay channel default callbacks */
 static struct rchan_callbacks default_channel_callbacks = {
 	.subbuf_start = subbuf_start_default_callback,
 	.buf_mapped = buf_mapped_default_callback,
 	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
 };
 
 /**
@@ -176,8 +198,8 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan,
  		return NULL;
 
 	/* Create file in fs */
-	dentry = relayfs_create_file(filename, parent, S_IRUSR,
-				     &relayfs_file_operations, buf);
+ 	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+ 					   buf);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
@@ -220,6 +242,10 @@ static inline void setup_callbacks(struct rchan *chan,
 		cb->buf_mapped = buf_mapped_default_callback;
 	if (!cb->buf_unmapped)
 		cb->buf_unmapped = buf_unmapped_default_callback;
+	if (!cb->create_buf_file)
+		cb->create_buf_file = create_buf_file_default_callback;
+	if (!cb->remove_buf_file)
+		cb->remove_buf_file = remove_buf_file_default_callback;
 	chan->cb = cb;
 }
 
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 8200ecbe6e0f..8c2177105857 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -110,6 +110,40 @@ struct rchan_callbacks
 	 */
         void (*buf_unmapped)(struct rchan_buf *buf,
 			     struct file *filp);
+	/*
+	 * create_buf_file - create file to represent a relayfs channel buffer
+	 * @filename: the name of the file to create
+	 * @parent: the parent of the file to create
+	 * @mode: the mode of the file to create
+	 * @buf: the channel buffer
+	 *
+	 * Called during relay_open(), once for each per-cpu buffer,
+	 * to allow the client to create a file to be used to
+	 * represent the corresponding channel buffer.  If the file is
+	 * created outside of relayfs, the parent must also exist in
+	 * that filesystem.
+	 *
+	 * The callback should return the dentry of the file created
+	 * to represent the relay buffer.
+	 *
+	 * See Documentation/filesystems/relayfs.txt for more info.
+	 */
+	struct dentry *(*create_buf_file)(const char *filename,
+					  struct dentry *parent,
+					  int mode,
+					  struct rchan_buf *buf);
+
+	/*
+	 * remove_buf_file - remove file representing a relayfs channel buffer
+	 * @dentry: the dentry of the file to remove
+	 *
+	 * Called during relay_close(), once for each per-cpu buffer,
+	 * to allow the client to remove a file used to represent a
+	 * channel buffer.
+	 *
+	 * The callback should return 0 if successful, negative if not.
+	 */
+	int (*remove_buf_file)(struct dentry *dentry);
 };
 
 /*
-- 
cgit v1.2.3


From e6c08367b8fc6dce6dfd1106f53f6ef28215b313 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:29 -0800
Subject: [PATCH] relayfs: add support for global relay buffers

This patch adds the optional is_global outparam to the create_buf_file()
callback.  This can be used by clients to create a single global relayfs
buffer instead of the default per-cpu buffers.  This was suggested as being
useful for certain debugging applications where it's more convenient to be
able to get all the data from a single channel without having to go to the
bother of dealing with per-cpu files.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/relay.c         | 35 +++++++++++++++++++++++++----------
 include/linux/relayfs_fs.h |  8 +++++++-
 2 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index b9bb56903272..2935a6ab8ffa 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -86,7 +86,8 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
 static struct dentry *create_buf_file_default_callback(const char *filename,
 						       struct dentry *parent,
 						       int mode,
-						       struct rchan_buf *buf)
+						       struct rchan_buf *buf,
+						       int *is_global)
 {
 	return relayfs_create_file(filename, parent, mode,
 				   &relayfs_file_operations, buf);
@@ -170,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
 void relay_reset(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		__relay_reset(chan->buf[i], 0);
+		prev = chan->buf[i];
 	}
 }
 
@@ -188,18 +191,22 @@ void relay_reset(struct rchan *chan)
  */
 static struct rchan_buf *relay_open_buf(struct rchan *chan,
 					const char *filename,
-					struct dentry *parent)
+					struct dentry *parent,
+					int *is_global)
 {
 	struct rchan_buf *buf;
 	struct dentry *dentry;
 
+	if (*is_global)
+		return chan->buf[0];
+
  	buf = relay_create_buf(chan);
  	if (!buf)
  		return NULL;
 
 	/* Create file in fs */
  	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
- 					   buf);
+ 					   buf, is_global);
  	if (!dentry) {
  		relay_destroy_buf(buf);
 		return NULL;
@@ -273,6 +280,7 @@ struct rchan *relay_open(const char *base_filename,
 	unsigned int i;
 	struct rchan *chan;
 	char *tmpname;
+	int is_global = 0;
 
 	if (!base_filename)
 		return NULL;
@@ -297,7 +305,8 @@ struct rchan *relay_open(const char *base_filename,
 
 	for_each_online_cpu(i) {
 		sprintf(tmpname, "%s%d", base_filename, i);
-		chan->buf[i] = relay_open_buf(chan, tmpname, parent);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+					      &is_global);
 		chan->buf[i]->cpu = i;
 		if (!chan->buf[i])
 			goto free_bufs;
@@ -311,6 +320,8 @@ free_bufs:
 		if (!chan->buf[i])
 			break;
 		relay_close_buf(chan->buf[i]);
+		if (is_global)
+			break;
 	}
 	kfree(tmpname);
 
@@ -420,14 +431,16 @@ void relay_destroy_channel(struct kref *kref)
 void relay_close(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		relay_close_buf(chan->buf[i]);
+		prev = chan->buf[i];
 	}
 
 	if (chan->last_toobig)
@@ -447,14 +460,16 @@ void relay_close(struct rchan *chan)
 void relay_flush(struct rchan *chan)
 {
 	unsigned int i;
+	struct rchan_buf *prev = NULL;
 
 	if (!chan)
 		return;
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			continue;
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
 		relay_switch_subbuf(chan->buf[i], 0);
+		prev = chan->buf[i];
 	}
 }
 
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 8c2177105857..30f45511b40d 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -116,6 +116,7 @@ struct rchan_callbacks
 	 * @parent: the parent of the file to create
 	 * @mode: the mode of the file to create
 	 * @buf: the channel buffer
+	 * @is_global: outparam - set non-zero if the buffer should be global
 	 *
 	 * Called during relay_open(), once for each per-cpu buffer,
 	 * to allow the client to create a file to be used to
@@ -126,12 +127,17 @@ struct rchan_callbacks
 	 * The callback should return the dentry of the file created
 	 * to represent the relay buffer.
 	 *
+	 * Setting the is_global outparam to a non-zero value will
+	 * cause relay_open() to create a single global buffer rather
+	 * than the default set of per-cpu buffers.
+	 *
 	 * See Documentation/filesystems/relayfs.txt for more info.
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
 					  int mode,
-					  struct rchan_buf *buf);
+					  struct rchan_buf *buf,
+					  int *is_global);
 
 	/*
 	 * remove_buf_file - remove file representing a relayfs channel buffer
-- 
cgit v1.2.3


From 761da5c88aca34586e5b7295bd8b9be2438906f2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Sun, 8 Jan 2006 01:02:31 -0800
Subject: [PATCH] relayfs: cleanup, change relayfs_file_* to relay_file_*

This patch renames relayfs_file_operations to relay_file_operations, and the
file operations themselves from relayfs_XXX to relay_file_XXX, to make it more
clear that they refer to relay files.

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/relayfs/inode.c         | 89 ++++++++++++++++++++++++----------------------
 fs/relayfs/relay.c         |  2 +-
 include/linux/relayfs_fs.h |  5 ++-
 3 files changed, 50 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
index b4c3e0466e98..7b7f2cb5f0e1 100644
--- a/fs/relayfs/inode.c
+++ b/fs/relayfs/inode.c
@@ -247,13 +247,13 @@ int relayfs_remove_dir(struct dentry *dentry)
 }
 
 /**
- *	relayfs_open - open file op for relayfs files
+ *	relay_file_open - open file op for relay files
  *	@inode: the inode
  *	@filp: the file
  *
  *	Increments the channel buffer refcount.
  */
-static int relayfs_open(struct inode *inode, struct file *filp)
+static int relay_file_open(struct inode *inode, struct file *filp)
 {
 	struct rchan_buf *buf = inode->u.generic_ip;
 	kref_get(&buf->kref);
@@ -263,26 +263,26 @@ static int relayfs_open(struct inode *inode, struct file *filp)
 }
 
 /**
- *	relayfs_mmap - mmap file op for relayfs files
+ *	relay_file_mmap - mmap file op for relay files
  *	@filp: the file
  *	@vma: the vma describing what to map
  *
  *	Calls upon relay_mmap_buf to map the file into user space.
  */
-static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct rchan_buf *buf = filp->private_data;
 	return relay_mmap_buf(buf, vma);
 }
 
 /**
- *	relayfs_poll - poll file op for relayfs files
+ *	relay_file_poll - poll file op for relay files
  *	@filp: the file
  *	@wait: poll table
  *
  *	Poll implemention.
  */
-static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
 {
 	unsigned int mask = 0;
 	struct rchan_buf *buf = filp->private_data;
@@ -300,14 +300,14 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
 }
 
 /**
- *	relayfs_release - release file op for relayfs files
+ *	relay_file_release - release file op for relay files
  *	@inode: the inode
  *	@filp: the file
  *
  *	Decrements the channel refcount, as the filesystem is
  *	no longer using it.
  */
-static int relayfs_release(struct inode *inode, struct file *filp)
+static int relay_file_release(struct inode *inode, struct file *filp)
 {
 	struct rchan_buf *buf = filp->private_data;
 	kref_put(&buf->kref, relay_remove_buf);
@@ -316,11 +316,11 @@ static int relayfs_release(struct inode *inode, struct file *filp)
 }
 
 /**
- *	relayfs_read_consume - update the consumed count for the buffer
+ *	relay_file_read_consume - update the consumed count for the buffer
  */
-static void relayfs_read_consume(struct rchan_buf *buf,
-				 size_t read_pos,
-				 size_t bytes_consumed)
+static void relay_file_read_consume(struct rchan_buf *buf,
+				    size_t read_pos,
+				    size_t bytes_consumed)
 {
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
@@ -343,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf,
 }
 
 /**
- *	relayfs_read_avail - boolean, are there unconsumed bytes available?
+ *	relay_file_read_avail - boolean, are there unconsumed bytes available?
  */
-static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 {
 	size_t bytes_produced, bytes_consumed, write_offset;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -376,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
 	if (bytes_produced == bytes_consumed)
 		return 0;
 
-	relayfs_read_consume(buf, read_pos, 0);
+	relay_file_read_consume(buf, read_pos, 0);
 
 	return 1;
 }
 
 /**
- *	relayfs_read_subbuf_avail - return bytes available in sub-buffer
+ *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
  */
-static size_t relayfs_read_subbuf_avail(size_t read_pos,
-					struct rchan_buf *buf)
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+					   struct rchan_buf *buf)
 {
 	size_t padding, avail = 0;
 	size_t read_subbuf, read_offset, write_subbuf, write_offset;
@@ -407,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos,
 }
 
 /**
- *	relayfs_read_start_pos - find the first available byte to read
+ *	relay_file_read_start_pos - find the first available byte to read
  *
  *	If the read_pos is in the middle of padding, return the
  *	position of the first actually available byte, otherwise
  *	return the original value.
  */
-static size_t relayfs_read_start_pos(size_t read_pos,
-				     struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(size_t read_pos,
+					struct rchan_buf *buf)
 {
 	size_t read_subbuf, padding, padding_start, padding_end;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -433,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos,
 }
 
 /**
- *	relayfs_read_end_pos - return the new read position
+ *	relay_file_read_end_pos - return the new read position
  */
-static size_t relayfs_read_end_pos(struct rchan_buf *buf,
-				   size_t read_pos,
-				   size_t count)
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+				      size_t read_pos,
+				      size_t count)
 {
 	size_t read_subbuf, padding, end_pos;
 	size_t subbuf_size = buf->chan->subbuf_size;
@@ -456,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
 }
 
 /**
- *	relayfs_read - read file op for relayfs files
+ *	relay_file_read - read file op for relay files
  *	@filp: the file
  *	@buffer: the userspace buffer
  *	@count: number of bytes to read
@@ -465,10 +465,10 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf,
  *	Reads count bytes or the number of bytes available in the
  *	current sub-buffer being read, whichever is smaller.
  */
-static ssize_t relayfs_read(struct file *filp,
-			    char __user *buffer,
-			    size_t count,
-			    loff_t *ppos)
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
 {
 	struct rchan_buf *buf = filp->private_data;
 	struct inode *inode = filp->f_dentry->d_inode;
@@ -477,11 +477,11 @@ static ssize_t relayfs_read(struct file *filp,
 	void *from;
 
 	down(&inode->i_sem);
-	if(!relayfs_read_avail(buf, *ppos))
+	if(!relay_file_read_avail(buf, *ppos))
 		goto out;
 
-	read_start = relayfs_read_start_pos(*ppos, buf);
-	avail = relayfs_read_subbuf_avail(read_start, buf);
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
 	if (!avail)
 		goto out;
 
@@ -491,20 +491,20 @@ static ssize_t relayfs_read(struct file *filp,
 		ret = -EFAULT;
 		goto out;
 	}
-	relayfs_read_consume(buf, read_start, count);
-	*ppos = relayfs_read_end_pos(buf, read_start, count);
+	relay_file_read_consume(buf, read_start, count);
+	*ppos = relay_file_read_end_pos(buf, read_start, count);
 out:
 	up(&inode->i_sem);
 	return ret;
 }
 
-struct file_operations relayfs_file_operations = {
-	.open		= relayfs_open,
-	.poll		= relayfs_poll,
-	.mmap		= relayfs_mmap,
-	.read		= relayfs_read,
+struct file_operations relay_file_operations = {
+	.open		= relay_file_open,
+	.poll		= relay_file_poll,
+	.mmap		= relay_file_mmap,
+	.read		= relay_file_read,
 	.llseek		= no_llseek,
-	.release	= relayfs_release,
+	.release	= relay_file_release,
 };
 
 static struct super_operations relayfs_ops = {
@@ -558,13 +558,18 @@ static int __init init_relayfs_fs(void)
 
 static void __exit exit_relayfs_fs(void)
 {
+
+
+
+
+
 	unregister_filesystem(&relayfs_fs_type);
 }
 
 module_init(init_relayfs_fs)
 module_exit(exit_relayfs_fs)
 
-EXPORT_SYMBOL_GPL(relayfs_file_operations);
+EXPORT_SYMBOL_GPL(relay_file_operations);
 EXPORT_SYMBOL_GPL(relayfs_create_dir);
 EXPORT_SYMBOL_GPL(relayfs_remove_dir);
 EXPORT_SYMBOL_GPL(relayfs_create_file);
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
index 2935a6ab8ffa..abf3ceaace49 100644
--- a/fs/relayfs/relay.c
+++ b/fs/relayfs/relay.c
@@ -90,7 +90,7 @@ static struct dentry *create_buf_file_default_callback(const char *filename,
 						       int *is_global)
 {
 	return relayfs_create_file(filename, parent, mode,
-				   &relayfs_file_operations, buf);
+				   &relay_file_operations, buf);
 }
 
 /*
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
index 30f45511b40d..7342e66247fb 100644
--- a/include/linux/relayfs_fs.h
+++ b/include/linux/relayfs_fs.h
@@ -279,10 +279,9 @@ static inline void subbuf_start_reserve(struct rchan_buf *buf,
 }
 
 /*
- * exported relayfs file operations, fs/relayfs/inode.c
+ * exported relay file operations, fs/relayfs/inode.c
  */
-
-extern struct file_operations relayfs_file_operations;
+extern struct file_operations relay_file_operations;
 
 #endif /* _LINUX_RELAYFS_FS_H */
 
-- 
cgit v1.2.3


From 6b9c7ed84837753a436415097063232422e29a35 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 8 Jan 2006 01:02:33 -0800
Subject: [PATCH] use ptrace_get_task_struct in various places

The ptrace_get_task_struct() helper that I added as part of the ptrace
consolidation is useful in variety of places that currently opencode it.
Switch them to the common helpers.

Add a ptrace_traceme() helper that needs to be explicitly called, and simplify
the ptrace_get_task_struct() interface.  We don't need the request argument
now, and we return the task_struct directly, using ERR_PTR() for error
returns.  It's a bit more code in the callers, but we have two sane routines
that do one thing well now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/ptrace.c     | 24 +++----------
 arch/ia64/ia32/sys_ia32.c      | 16 +++------
 arch/ia64/kernel/ptrace.c      |  9 +----
 arch/m32r/kernel/ptrace.c      | 22 +++---------
 arch/mips/kernel/ptrace32.c    | 26 ++++----------
 arch/powerpc/kernel/ptrace32.c | 28 ++++-----------
 arch/s390/kernel/ptrace.c      | 29 ++++------------
 arch/sparc/kernel/ptrace.c     | 35 ++++---------------
 arch/sparc64/kernel/ptrace.c   | 34 +++----------------
 arch/x86_64/ia32/ptrace32.c    | 44 ++++++------------------
 include/linux/ptrace.h         |  2 ++
 kernel/ptrace.c                | 77 +++++++++++++++++++++++++-----------------
 12 files changed, 105 insertions(+), 241 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index bbd37536d14e..9969d212e94d 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -265,30 +265,16 @@ do_sys_ptrace(long request, long pid, long addr, long data,
 	lock_kernel();
 	DBG(DBG_MEM, ("request=%ld pid=%ld addr=0x%lx data=0x%lx\n",
 		      request, pid, addr, data));
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out_notsk;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out_notsk;
-		/* set the ptrace bit in the process ptrace flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out_notsk;
 	}
-	if (pid == 1)		/* you may not mess with init */
-		goto out_notsk;
 
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out_notsk;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index dc282710421a..9f8e8d558873 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1761,21 +1761,15 @@ sys32_ptrace (int request, pid_t pid, unsigned int addr, unsigned int data)
 
 	lock_kernel();
 	if (request == PTRACE_TRACEME) {
-		ret = sys_ptrace(request, pid, addr, data);
+		ret = ptrace_traceme();
 		goto out;
 	}
 
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
-	ret = -EPERM;
-	if (pid == 1)		/* no messing around with init! */
-		goto out_tsk;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = sys_ptrace(request, pid, addr, data);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 4b19d0410632..8d88eeea02d1 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1422,14 +1422,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data)
 	lock_kernel();
 	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
 
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 078d2a0e71c2..9b75caaf5cec 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -762,28 +762,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	int ret;
 
 	lock_kernel();
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 9a9b04972132..7e55457a491f 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -57,30 +57,16 @@ asmlinkage int sys32_ptrace(int request, int pid, int addr, int data)
 	       (unsigned long) data);
 #endif
 	lock_kernel();
-	ret = -EPERM;
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		if ((ret = security_ptrace(current->parent, current)))
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index 61762640b877..826ee3d056de 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -45,33 +45,19 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr,
 		       unsigned long data)
 {
 	struct task_struct *child;
-	int ret = -EPERM;
+	int ret;
 
 	lock_kernel();
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
+		ret = ptrace_traceme();
 		goto out;
 	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 8ecda6d66de4..cc02232aa96e 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -712,35 +712,18 @@ sys_ptrace(long request, long pid, long addr, long data)
 	int ret;
 
 	lock_kernel();
-
 	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		ret = -EPERM;
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		goto out;
+		 ret = ptrace_traceme();
+		 goto out;
 	}
 
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out;
-
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
 		goto out;
+	}
 
 	ret = do_ptrace(child, request, addr, data);
-
 	put_task_struct(child);
 out:
 	unlock_kernel();
diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c
index 475c4c13462c..fc470c0e9dc6 100644
--- a/arch/sparc/kernel/ptrace.c
+++ b/arch/sparc/kernel/ptrace.c
@@ -286,40 +286,17 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
 			       s, (int) request, (int) pid, addr, data, addr2);
 	}
 #endif
-	if (request == PTRACE_TRACEME) {
-		int my_ret;
-
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			pt_error_return(regs, EPERM);
-			goto out;
-		}
-		my_ret = security_ptrace(current->parent, current);
-		if (my_ret) {
-			pt_error_return(regs, -my_ret);
-			goto out;
-		}
 
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
+	if (request == PTRACE_TRACEME) {
+		ret = ptrace_traceme();
 		pt_succ_return(regs, 0);
 		goto out;
 	}
-#ifndef ALLOW_INIT_TRACING
-	if (pid == 1) {
-		/* Can't dork with init. */
-		pt_error_return(regs, EPERM);
-		goto out;
-	}
-#endif
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
 
-	if (!child) {
-		pt_error_return(regs, ESRCH);
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		pt_error_return(regs, -ret);
 		goto out;
 	}
 
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index 774ecbb8a031..84d3df2264cb 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -198,39 +198,15 @@ asmlinkage void do_ptrace(struct pt_regs *regs)
 	}
 #endif
 	if (request == PTRACE_TRACEME) {
-		int ret;
-
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			pt_error_return(regs, EPERM);
-			goto out;
-		}
-		ret = security_ptrace(current->parent, current);
-		if (ret) {
-			pt_error_return(regs, -ret);
-			goto out;
-		}
-
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
+		ret = ptrace_traceme();
 		pt_succ_return(regs, 0);
 		goto out;
 	}
-#ifndef ALLOW_INIT_TRACING
-	if (pid == 1) {
-		/* Can't dork with init. */
-		pt_error_return(regs, EPERM);
-		goto out;
-	}
-#endif
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
 
-	if (!child) {
-		pt_error_return(regs, ESRCH);
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		pt_error_return(regs, -ret);
 		goto out;
 	}
 
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 2a925e2af390..5f4cdfa56901 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -196,36 +196,6 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
 
 #undef R32
 
-static struct task_struct *find_target(int request, int pid, int *err)
-{ 
-	struct task_struct *child;
-
-	*err = -EPERM; 
-	if (pid == 1)
-		return NULL; 
-
-	*err = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (child) { 
-		*err = -EPERM;
-		if (child->pid == 1) 
-			goto out;
-		*err = ptrace_check_attach(child, request == PTRACE_KILL); 
-		if (*err < 0) 
-			goto out;
-		return child; 
-	} 
- out:
-	if (child)
-	put_task_struct(child);
-	return NULL; 
-	
-} 
-
 asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 {
 	struct task_struct *child;
@@ -254,9 +224,16 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 		break;
 	} 
 
-	child = find_target(request, pid, &ret);
-	if (!child)
-		return ret;
+	if (request == PTRACE_TRACEME)
+		return ptrace_traceme();
+
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (ret < 0)
+		goto out;
 
 	childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); 
 
@@ -373,6 +350,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
 		break;
 	}
 
+ out:
 	put_task_struct(child);
 	return ret;
 }
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b2b3dba1298d..864791996b5f 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -80,6 +80,8 @@
 
 
 extern long arch_ptrace(struct task_struct *child, long request, long addr, long data);
+extern struct task_struct *ptrace_get_task_struct(pid_t pid);
+extern int ptrace_traceme(void);
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 656476eedb1b..cceaf09ac413 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -408,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request,
 	return ret;
 }
 
-#ifndef __ARCH_SYS_PTRACE
-static int ptrace_get_task_struct(long request, long pid,
-		struct task_struct **childp)
+/**
+ * ptrace_traceme  --  helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
 {
-	struct task_struct *child;
 	int ret;
 
 	/*
-	 * Callers use child == NULL as an indication to exit early even
-	 * when the return value is 0, so make sure it is non-NULL here.
+	 * Are we already being traced?
+	 */
+	if (current->ptrace & PT_PTRACED)
+		return -EPERM;
+	ret = security_ptrace(current->parent, current);
+	if (ret)
+		return -EPERM;
+	/*
+	 * Set the ptrace bit in the process ptrace flags.
 	 */
-	*childp = NULL;
+	current->ptrace |= PT_PTRACED;
+	return 0;
+}
 
-	if (request == PTRACE_TRACEME) {
-		/*
-		 * Are we already being traced?
-		 */
-		if (current->ptrace & PT_PTRACED)
-			return -EPERM;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			return -EPERM;
-		/*
-		 * Set the ptrace bit in the process ptrace flags.
-		 */
-		current->ptrace |= PT_PTRACED;
-		return 0;
-	}
+/**
+ * ptrace_get_task_struct  --  grab a task struct reference for ptrace
+ * @pid:       process id to grab a task_struct reference of
+ *
+ * This function is a helper for ptrace implementations.  It checks
+ * permissions and then grabs a task struct for use of the actual
+ * ptrace implementation.
+ *
+ * Returns the task_struct for @pid or an ERR_PTR() on failure.
+ */
+struct task_struct *ptrace_get_task_struct(pid_t pid)
+{
+	struct task_struct *child;
 
 	/*
-	 * You may not mess with init
+	 * Tracing init is not allowed.
 	 */
 	if (pid == 1)
-		return -EPERM;
+		return ERR_PTR(-EPERM);
 
-	ret = -ESRCH;
 	read_lock(&tasklist_lock);
 	child = find_task_by_pid(pid);
 	if (child)
 		get_task_struct(child);
 	read_unlock(&tasklist_lock);
 	if (!child)
-		return -ESRCH;
-
-	*childp = child;
-	return 0;
+		return ERR_PTR(-ESRCH);
+	return child;
 }
 
+#ifndef __ARCH_SYS_PTRACE
 asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 {
 	struct task_struct *child;
@@ -465,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	 * This lock_kernel fixes a subtle race with suid exec
 	 */
 	lock_kernel();
-	ret = ptrace_get_task_struct(request, pid, &child);
-	if (!child)
+	if (request == PTRACE_TRACEME) {
+		ret = ptrace_traceme();
 		goto out;
+	}
+
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
-- 
cgit v1.2.3


From 788540141f4549637e89aadca6e25cf25eb53383 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:37 -0800
Subject: [PATCH] Permit multiple inclusion of linux/pagevec.h

Make it possible to include linux/pagevec.h multiple times without
incurring errors due to duplicate definitions.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pagevec.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index def32c5715be..8eb7fa76c1d0 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -5,6 +5,9 @@
  * pages.  A pagevec is a multipage container which is used for that.
  */
 
+#ifndef _LINUX_PAGEVEC_H
+#define _LINUX_PAGEVEC_H
+
 /* 14 pointers + two long's align the pagevec structure to a power of two */
 #define PAGEVEC_SIZE	14
 
@@ -83,3 +86,5 @@ static inline void pagevec_lru_add(struct pagevec *pvec)
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
 }
+
+#endif /* _LINUX_PAGEVEC_H */
-- 
cgit v1.2.3


From 4a30131e7dbb17e5fec6958bfac9da9aff1fa29b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 8 Jan 2006 01:02:39 -0800
Subject: [PATCH] Fix some problems with truncate and mtime semantics.

SUS requires that when truncating a file to the size that it currently
is:
  truncate and ftruncate should NOT modify ctime or mtime
  O_TRUNC SHOULD modify ctime and mtime.

Currently mtime and ctime are always modified on most local
filesystems (side effect of ->truncate) or never modified (on NFS).

With this patch:
  ATTR_CTIME|ATTR_MTIME are sent with ATTR_SIZE precisely when
    an update of these times is required whether size changes or not
    (via a new argument to do_truncate).  This allows NFS to do
    the right thing for O_TRUNC.
  inode_setattr nolonger forces ATTR_MTIME|ATTR_CTIME when the ATTR_SIZE
    sets the size to it's current value.  This allows local filesystems
    to do the right thing for f?truncate.

Also, the logic in inode_setattr is changed a bit so there are two return
points.  One returns the error from vmtruncate if it failed, the other
returns 0 (there can be no other failure).

Finally, if vmtruncate succeeds, and ATTR_SIZE is the only change
requested, we now fall-through and mark_inode_dirty.  If a filesystem did
not have a ->truncate function, then vmtruncate will have changed i_size,
without marking the inode as 'dirty', and I think this is wrong.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/attr.c          | 24 ++++++++----------------
 fs/exec.c          |  2 +-
 fs/namei.c         |  2 +-
 fs/open.c          |  9 +++++----
 include/linux/fs.h |  3 ++-
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index 67bcd9b14ea5..b34732506f1d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok);
 int inode_setattr(struct inode * inode, struct iattr * attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
-	int error = 0;
-
-	if (ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(inode)) {
-			error = vmtruncate(inode, attr->ia_size);
-			if (error || (ia_valid == ATTR_SIZE))
-				goto out;
-		} else {
-			/*
-			 * We skipped the truncate but must still update
-			 * timestamps
-			 */
-			ia_valid |= ATTR_MTIME|ATTR_CTIME;
-		}
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
 	}
 
 	if (ia_valid & ATTR_UID)
@@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 		inode->i_mode = mode;
 	}
 	mark_inode_dirty(inode);
-out:
-	return error;
+
+	return 0;
 }
 EXPORT_SYMBOL(inode_setattr);
 
diff --git a/fs/exec.c b/fs/exec.c
index e9650cd22a3b..2075b674d85e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1505,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		goto close_fail;
 	if (!file->f_op->write)
 		goto close_fail;
-	if (do_truncate(file->f_dentry, 0, file) != 0)
+	if (do_truncate(file->f_dentry, 0, 0, file) != 0)
 		goto close_fail;
 
 	retval = binfmt->core_dump(signr, regs, file);
diff --git a/fs/namei.c b/fs/namei.c
index 6dbbd42d8b95..300eae088d5f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1491,7 +1491,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		if (!error) {
 			DQUOT_INIT(inode);
 			
-			error = do_truncate(dentry, 0, NULL);
+			error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL);
 		}
 		put_write_access(inode);
 		if (error)
diff --git a/fs/open.c b/fs/open.c
index f53a5b9ffb7d..94968cb3afca 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -194,7 +194,8 @@ out:
 	return error;
 }
 
-int do_truncate(struct dentry *dentry, loff_t length, struct file *filp)
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+	struct file *filp)
 {
 	int err;
 	struct iattr newattrs;
@@ -204,7 +205,7 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp)
 		return -EINVAL;
 
 	newattrs.ia_size = length;
-	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+	newattrs.ia_valid = ATTR_SIZE | time_attrs;
 	if (filp) {
 		newattrs.ia_file = filp;
 		newattrs.ia_valid |= ATTR_FILE;
@@ -266,7 +267,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length)
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error) {
 		DQUOT_INIT(inode);
-		error = do_truncate(nd.dentry, length, NULL);
+		error = do_truncate(nd.dentry, length, 0, NULL);
 	}
 	put_write_access(inode);
 
@@ -318,7 +319,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, file);
+		error = do_truncate(dentry, length, 0, file);
 out_putf:
 	fput(file);
 out:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef29500b5df8..74c01aabd4ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1344,7 +1344,8 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 /* fs/open.c */
 
-extern int do_truncate(struct dentry *, loff_t start, struct file *filp);
+extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
+		       struct file *filp);
 extern long do_sys_open(const char __user *filename, int flags, int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-- 
cgit v1.2.3


From 017679c4d45783158dba1dd6f79e712c22bb3d9a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:43 -0800
Subject: [PATCH] keys: Permit key expiry time to be set

Add a new keyctl function that allows the expiry time to be set on a key or
removed from a key, provided the caller has attribute modification access.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Alexander Zangerl <az@bond.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt   | 15 ++++++++++++++-
 include/linux/keyctl.h   |  1 +
 security/keys/compat.c   |  3 +++
 security/keys/internal.h |  1 +
 security/keys/keyctl.c   | 44 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 6304db59bfe4..c17c4ca74302 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -498,7 +498,7 @@ The keyctl syscall functions are:
      keyring is full, error ENFILE will result.
 
      The link procedure checks the nesting of the keyrings, returning ELOOP if
-     it appears to deep or EDEADLK if the link would introduce a cycle.
+     it appears too deep or EDEADLK if the link would introduce a cycle.
 
 
  (*) Unlink a key or keyring from another keyring:
@@ -628,6 +628,19 @@ The keyctl syscall functions are:
      there is one, otherwise the user default session keyring.
 
 
+ (*) Set the timeout on a key.
+
+	long keyctl(KEYCTL_SET_TIMEOUT, key_serial_t key, unsigned timeout);
+
+     This sets or clears the timeout on a key. The timeout can be 0 to clear
+     the timeout or a number of seconds to set the expiry time that far into
+     the future.
+
+     The process must have attribute modification access on a key to set its
+     timeout. Timeouts may not be set with this function on negative, revoked
+     or expired keys.
+
+
 ===============
 KERNEL SERVICES
 ===============
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index 8d7c59a29e09..ec8f3d622a8d 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -46,5 +46,6 @@
 #define KEYCTL_INSTANTIATE		12	/* instantiate a partially constructed key */
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
 #define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
+#define KEYCTL_SET_TIMEOUT		15	/* set key timeout */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/security/keys/compat.c b/security/keys/compat.c
index 3303673c636e..e8e7ef4a290c 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -74,6 +74,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_SET_REQKEY_KEYRING:
 		return keyctl_set_reqkey_keyring(arg2);
 
+	case KEYCTL_SET_TIMEOUT:
+		return keyctl_set_timeout(arg2, arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 39cba97c5eb9..51f37c0bdb32 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -136,6 +136,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
 extern long keyctl_set_reqkey_keyring(int);
+extern long keyctl_set_timeout(key_serial_t, unsigned);
 
 
 /*
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index b7a468fabdf9..299f0ae11cf0 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -965,6 +965,46 @@ long keyctl_set_reqkey_keyring(int reqkey_defl)
 
 } /* end keyctl_set_reqkey_keyring() */
 
+/*****************************************************************************/
+/*
+ * set or clear the timeout for a key
+ */
+long keyctl_set_timeout(key_serial_t id, unsigned timeout)
+{
+	struct timespec now;
+	struct key *key;
+	key_ref_t key_ref;
+	time_t expiry;
+	long ret;
+
+	key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR);
+	if (IS_ERR(key_ref)) {
+		ret = PTR_ERR(key_ref);
+		goto error;
+	}
+
+	key = key_ref_to_ptr(key_ref);
+
+	/* make the changes with the locks held to prevent races */
+	down_write(&key->sem);
+
+	expiry = 0;
+	if (timeout > 0) {
+		now = current_kernel_time();
+		expiry = now.tv_sec + timeout;
+	}
+
+	key->expiry = expiry;
+
+	up_write(&key->sem);
+	key_put(key);
+
+	ret = 0;
+error:
+	return ret;
+
+} /* end keyctl_set_timeout() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -1038,6 +1078,10 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 	case KEYCTL_SET_REQKEY_KEYRING:
 		return keyctl_set_reqkey_keyring(arg2);
 
+	case KEYCTL_SET_TIMEOUT:
+		return keyctl_set_timeout((key_serial_t) arg2,
+					  (unsigned) arg3);
+
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From b5f545c880a2a47947ba2118b2509644ab7a2969 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 8 Jan 2006 01:02:47 -0800
Subject: [PATCH] keys: Permit running process to instantiate keys

Make it possible for a running process (such as gssapid) to be able to
instantiate a key, as was requested by Trond Myklebust for NFS4.

The patch makes the following changes:

 (1) A new, optional key type method has been added. This permits a key type
     to intercept requests at the point /sbin/request-key is about to be
     spawned and do something else with them - passing them over the
     rpc_pipefs files or netlink sockets for instance.

     The uninstantiated key, the authorisation key and the intended operation
     name are passed to the method.

 (2) The callout_info is no longer passed as an argument to /sbin/request-key
     to prevent unauthorised viewing of this data using ps or by looking in
     /proc/pid/cmdline.

     This means that the old /sbin/request-key program will not work with the
     patched kernel as it will expect to see an extra argument that is no
     longer there.

     A revised keyutils package will be made available tomorrow.

 (3) The callout_info is now attached to the authorisation key. Reading this
     key will retrieve the information.

 (4) A new field has been added to the task_struct. This holds the
     authorisation key currently active for a thread. Searches now look here
     for the caller's set of keys rather than looking for an auth key in the
     lowest level of the session keyring.

     This permits a thread to be servicing multiple requests at once and to
     switch between them. Note that this is per-thread, not per-process, and
     so is usable in multithreaded programs.

     The setting of this field is inherited across fork and exec.

 (5) A new keyctl function (KEYCTL_ASSUME_AUTHORITY) has been added that
     permits a thread to assume the authority to deal with an uninstantiated
     key. Assumption is only permitted if the authorisation key associated
     with the uninstantiated key is somewhere in the thread's keyrings.

     This function can also clear the assumption.

 (6) A new magic key specifier has been added to refer to the currently
     assumed authorisation key (KEY_SPEC_REQKEY_AUTH_KEY).

 (7) Instantiation will only proceed if the appropriate authorisation key is
     assumed first. The assumed authorisation key is discarded if
     instantiation is successful.

 (8) key_validate() is moved from the file of request_key functions to the
     file of permissions functions.

 (9) The documentation is updated.

From: <Valdis.Kletnieks@vt.edu>

    Build fix.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Alexander Zangerl <az@bond.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys-request-key.txt |  22 +++--
 Documentation/keys.txt             |  24 +++++
 include/linux/key.h                |  12 +++
 include/linux/keyctl.h             |   2 +
 include/linux/sched.h              |   1 +
 security/keys/compat.c             |   3 +
 security/keys/internal.h           |   4 +-
 security/keys/keyctl.c             | 107 ++++++++++++++++-----
 security/keys/keyring.c            |  45 ---------
 security/keys/permission.c         |  32 +++++++
 security/keys/process_keys.c       |  71 +++++++-------
 security/keys/request_key.c        | 108 ++++++++++-----------
 security/keys/request_key_auth.c   | 192 ++++++++++++++++++++++---------------
 13 files changed, 378 insertions(+), 245 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index 5f2b9c5edbb5..22488d791168 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -56,10 +56,12 @@ A request proceeds in the following manner:
  (4) request_key() then forks and executes /sbin/request-key with a new session
      keyring that contains a link to auth key V.
 
- (5) /sbin/request-key execs an appropriate program to perform the actual
+ (5) /sbin/request-key assumes the authority associated with key U.
+
+ (6) /sbin/request-key execs an appropriate program to perform the actual
      instantiation.
 
- (6) The program may want to access another key from A's context (say a
+ (7) The program may want to access another key from A's context (say a
      Kerberos TGT key). It just requests the appropriate key, and the keyring
      search notes that the session keyring has auth key V in its bottom level.
 
@@ -67,19 +69,19 @@ A request proceeds in the following manner:
      UID, GID, groups and security info of process A as if it was process A,
      and come up with key W.
 
- (7) The program then does what it must to get the data with which to
+ (8) The program then does what it must to get the data with which to
      instantiate key U, using key W as a reference (perhaps it contacts a
      Kerberos server using the TGT) and then instantiates key U.
 
- (8) Upon instantiating key U, auth key V is automatically revoked so that it
+ (9) Upon instantiating key U, auth key V is automatically revoked so that it
      may not be used again.
 
- (9) The program then exits 0 and request_key() deletes key V and returns key
+(10) The program then exits 0 and request_key() deletes key V and returns key
      U to the caller.
 
-This also extends further. If key W (step 5 above) didn't exist, key W would be
-created uninstantiated, another auth key (X) would be created [as per step 3]
-and another copy of /sbin/request-key spawned [as per step 4]; but the context
+This also extends further. If key W (step 7 above) didn't exist, key W would be
+created uninstantiated, another auth key (X) would be created (as per step 3)
+and another copy of /sbin/request-key spawned (as per step 4); but the context
 specified by auth key X will still be process A, as it was in auth key V.
 
 This is because process A's keyrings can't simply be attached to
@@ -138,8 +140,8 @@ until one succeeds:
 
  (3) The process's session keyring is searched.
 
- (4) If the process has a request_key() authorisation key in its session
-     keyring then:
+ (4) If the process has assumed the authority associated with a request_key()
+     authorisation key then:
 
      (a) If extant, the calling process's thread keyring is searched.
 
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index eeda00f82d2c..aaa01b0e3ee9 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -308,6 +308,8 @@ process making the call:
 	KEY_SPEC_USER_KEYRING		-4	UID-specific keyring
 	KEY_SPEC_USER_SESSION_KEYRING	-5	UID-session keyring
 	KEY_SPEC_GROUP_KEYRING		-6	GID-specific keyring
+	KEY_SPEC_REQKEY_AUTH_KEY	-7	assumed request_key()
+						  authorisation key
 
 
 The main syscalls are:
@@ -645,6 +647,28 @@ The keyctl syscall functions are:
      or expired keys.
 
 
+ (*) Assume the authority granted to instantiate a key
+
+	long keyctl(KEYCTL_ASSUME_AUTHORITY, key_serial_t key);
+
+     This assumes or divests the authority required to instantiate the
+     specified key. Authority can only be assumed if the thread has the
+     authorisation key associated with the specified key in its keyrings
+     somewhere.
+
+     Once authority is assumed, searches for keys will also search the
+     requester's keyrings using the requester's security label, UID, GID and
+     groups.
+
+     If the requested authority is unavailable, error EPERM will be returned,
+     likewise if the authority has been revoked because the target key is
+     already instantiated.
+
+     If the specified key is 0, then any assumed authority will be divested.
+
+     The assumed authorititive key is inherited across fork and exec.
+
+
 ===============
 KERNEL SERVICES
 ===============
diff --git a/include/linux/key.h b/include/linux/key.h
index 4d189e51bc6c..cbf464ad9589 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -177,6 +177,8 @@ struct key {
 /*
  * kernel managed key type definition
  */
+typedef int (*request_key_actor_t)(struct key *key, struct key *authkey, const char *op);
+
 struct key_type {
 	/* name of the type */
 	const char *name;
@@ -218,6 +220,16 @@ struct key_type {
 	 */
 	long (*read)(const struct key *key, char __user *buffer, size_t buflen);
 
+	/* handle request_key() for this type instead of invoking
+	 * /sbin/request-key (optional)
+	 * - key is the key to instantiate
+	 * - authkey is the authority to assume when instantiating this key
+	 * - op is the operation to be done, usually "create"
+	 * - the call must not return until the instantiation process has run
+	 *   its course
+	 */
+	request_key_actor_t request_key;
+
 	/* internal fields */
 	struct list_head	link;		/* link in types list */
 };
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index ec8f3d622a8d..3365945640c9 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -19,6 +19,7 @@
 #define KEY_SPEC_USER_KEYRING		-4	/* - key ID for UID-specific keyring */
 #define KEY_SPEC_USER_SESSION_KEYRING	-5	/* - key ID for UID-session keyring */
 #define KEY_SPEC_GROUP_KEYRING		-6	/* - key ID for GID-specific keyring */
+#define KEY_SPEC_REQKEY_AUTH_KEY	-7	/* - key ID for assumed request_key auth key */
 
 /* request-key default keyrings */
 #define KEY_REQKEY_DEFL_NO_CHANGE		-1
@@ -47,5 +48,6 @@
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
 #define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
 #define KEYCTL_SET_TIMEOUT		15	/* set key timeout */
+#define KEYCTL_ASSUME_AUTHORITY		16	/* assume request_key() authorisation */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20bd70749104..78eb92ae4d94 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -771,6 +771,7 @@ struct task_struct {
 	unsigned keep_capabilities:1;
 	struct user_struct *user;
 #ifdef CONFIG_KEYS
+	struct key *request_key_auth;	/* assumed request_key authority */
 	struct key *thread_keyring;	/* keyring private to this thread */
 	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
 #endif
diff --git a/security/keys/compat.c b/security/keys/compat.c
index e8e7ef4a290c..bcdb28533733 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -77,6 +77,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_SET_TIMEOUT:
 		return keyctl_set_timeout(arg2, arg3);
 
+	case KEYCTL_ASSUME_AUTHORITY:
+		return keyctl_assume_authority(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 51f37c0bdb32..e066e6057955 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -107,12 +107,13 @@ extern struct key *request_key_and_link(struct key_type *type,
 struct request_key_auth {
 	struct key		*target_key;
 	struct task_struct	*context;
+	const char		*callout_info;
 	pid_t			pid;
 };
 
 extern struct key_type key_type_request_key_auth;
 extern struct key *request_key_auth_new(struct key *target,
-					struct key **_rkakey);
+					const char *callout_info);
 
 extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 
@@ -137,6 +138,7 @@ extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
 extern long keyctl_set_reqkey_keyring(int);
 extern long keyctl_set_timeout(key_serial_t, unsigned);
+extern long keyctl_assume_authority(key_serial_t);
 
 
 /*
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 299f0ae11cf0..3d2ebae029c1 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -834,6 +834,17 @@ long keyctl_instantiate_key(key_serial_t id,
 	if (plen > 32767)
 		goto error;
 
+	/* the appropriate instantiation authorisation key must have been
+	 * assumed before calling this */
+	ret = -EPERM;
+	instkey = current->request_key_auth;
+	if (!instkey)
+		goto error;
+
+	rka = instkey->payload.data;
+	if (rka->target_key->serial != id)
+		goto error;
+
 	/* pull the payload in if one was supplied */
 	payload = NULL;
 
@@ -848,15 +859,6 @@ long keyctl_instantiate_key(key_serial_t id,
 			goto error2;
 	}
 
-	/* find the instantiation authorisation key */
-	instkey = key_get_instantiation_authkey(id);
-	if (IS_ERR(instkey)) {
-		ret = PTR_ERR(instkey);
-		goto error2;
-	}
-
-	rka = instkey->payload.data;
-
 	/* find the destination keyring amongst those belonging to the
 	 * requesting task */
 	keyring_ref = NULL;
@@ -865,7 +867,7 @@ long keyctl_instantiate_key(key_serial_t id,
 					      KEY_WRITE);
 		if (IS_ERR(keyring_ref)) {
 			ret = PTR_ERR(keyring_ref);
-			goto error3;
+			goto error2;
 		}
 	}
 
@@ -874,11 +876,17 @@ long keyctl_instantiate_key(key_serial_t id,
 				       key_ref_to_ptr(keyring_ref), instkey);
 
 	key_ref_put(keyring_ref);
- error3:
-	key_put(instkey);
- error2:
+
+	/* discard the assumed authority if it's just been disabled by
+	 * instantiation of the key */
+	if (ret == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+	}
+
+error2:
 	kfree(payload);
- error:
+error:
 	return ret;
 
 } /* end keyctl_instantiate_key() */
@@ -895,14 +903,16 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	key_ref_t keyring_ref;
 	long ret;
 
-	/* find the instantiation authorisation key */
-	instkey = key_get_instantiation_authkey(id);
-	if (IS_ERR(instkey)) {
-		ret = PTR_ERR(instkey);
+	/* the appropriate instantiation authorisation key must have been
+	 * assumed before calling this */
+	ret = -EPERM;
+	instkey = current->request_key_auth;
+	if (!instkey)
 		goto error;
-	}
 
 	rka = instkey->payload.data;
+	if (rka->target_key->serial != id)
+		goto error;
 
 	/* find the destination keyring if present (which must also be
 	 * writable) */
@@ -911,7 +921,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 		keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(keyring_ref)) {
 			ret = PTR_ERR(keyring_ref);
-			goto error2;
+			goto error;
 		}
 	}
 
@@ -920,9 +930,15 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 				  key_ref_to_ptr(keyring_ref), instkey);
 
 	key_ref_put(keyring_ref);
- error2:
-	key_put(instkey);
- error:
+
+	/* discard the assumed authority if it's just been disabled by
+	 * instantiation of the key */
+	if (ret == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+	}
+
+error:
 	return ret;
 
 } /* end keyctl_negate_key() */
@@ -1005,6 +1021,48 @@ error:
 
 } /* end keyctl_set_timeout() */
 
+/*****************************************************************************/
+/*
+ * assume the authority to instantiate the specified key
+ */
+long keyctl_assume_authority(key_serial_t id)
+{
+	struct key *authkey;
+	long ret;
+
+	/* special key IDs aren't permitted */
+	ret = -EINVAL;
+	if (id < 0)
+		goto error;
+
+	/* we divest ourselves of authority if given an ID of 0 */
+	if (id == 0) {
+		key_put(current->request_key_auth);
+		current->request_key_auth = NULL;
+		ret = 0;
+		goto error;
+	}
+
+	/* attempt to assume the authority temporarily granted to us whilst we
+	 * instantiate the specified key
+	 * - the authorisation key must be in the current task's keyrings
+	 *   somewhere
+	 */
+	authkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		goto error;
+	}
+
+	key_put(current->request_key_auth);
+	current->request_key_auth = authkey;
+	ret = authkey->serial;
+
+error:
+	return ret;
+
+} /* end keyctl_assume_authority() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -1082,6 +1140,9 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 		return keyctl_set_timeout((key_serial_t) arg2,
 					  (unsigned) arg3);
 
+	case KEYCTL_ASSUME_AUTHORITY:
+		return keyctl_assume_authority((key_serial_t) arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 09d92d52ef75..d65a180f888d 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -479,51 +479,6 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
 
 } /* end __keyring_search_one() */
 
-/*****************************************************************************/
-/*
- * search for an instantiation authorisation key matching a target key
- * - the RCU read lock must be held by the caller
- * - a target_id of zero specifies any valid token
- */
-struct key *keyring_search_instkey(struct key *keyring,
-				   key_serial_t target_id)
-{
-	struct request_key_auth *rka;
-	struct keyring_list *klist;
-	struct key *instkey;
-	int loop;
-
-	klist = rcu_dereference(keyring->payload.subscriptions);
-	if (klist) {
-		for (loop = 0; loop < klist->nkeys; loop++) {
-			instkey = klist->keys[loop];
-
-			if (instkey->type != &key_type_request_key_auth)
-				continue;
-
-			rka = instkey->payload.data;
-			if (target_id && rka->target_key->serial != target_id)
-				continue;
-
-			/* the auth key is revoked during instantiation */
-			if (!test_bit(KEY_FLAG_REVOKED, &instkey->flags))
-				goto found;
-
-			instkey = ERR_PTR(-EKEYREVOKED);
-			goto error;
-		}
-	}
-
-	instkey = ERR_PTR(-EACCES);
-	goto error;
-
-found:
-	atomic_inc(&instkey->usage);
-error:
-	return instkey;
-
-} /* end keyring_search_instkey() */
-
 /*****************************************************************************/
 /*
  * find a keyring with the specified name
diff --git a/security/keys/permission.c b/security/keys/permission.c
index e7f579c0eaf5..3b41f9b52537 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -73,3 +73,35 @@ use_these_perms:
 } /* end key_task_permission() */
 
 EXPORT_SYMBOL(key_task_permission);
+
+/*****************************************************************************/
+/*
+ * validate a key
+ */
+int key_validate(struct key *key)
+{
+	struct timespec now;
+	int ret = 0;
+
+	if (key) {
+		/* check it's still accessible */
+		ret = -EKEYREVOKED;
+		if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
+		    test_bit(KEY_FLAG_DEAD, &key->flags))
+			goto error;
+
+		/* check it hasn't expired */
+		ret = 0;
+		if (key->expiry) {
+			now = current_kernel_time();
+			if (now.tv_sec >= key->expiry)
+				ret = -EKEYEXPIRED;
+		}
+	}
+
+ error:
+	return ret;
+
+} /* end key_validate() */
+
+EXPORT_SYMBOL(key_validate);
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 566b1cc0118a..74cb79eb917e 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -270,9 +270,14 @@ int copy_thread_group_keys(struct task_struct *tsk)
 int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
 {
 	key_check(tsk->thread_keyring);
+	key_check(tsk->request_key_auth);
 
 	/* no thread keyring yet */
 	tsk->thread_keyring = NULL;
+
+	/* copy the request_key() authorisation for this thread */
+	key_get(tsk->request_key_auth);
+
 	return 0;
 
 } /* end copy_keys() */
@@ -290,11 +295,12 @@ void exit_thread_group_keys(struct signal_struct *tg)
 
 /*****************************************************************************/
 /*
- * dispose of keys upon thread exit
+ * dispose of per-thread keys upon thread exit
  */
 void exit_keys(struct task_struct *tsk)
 {
 	key_put(tsk->thread_keyring);
+	key_put(tsk->request_key_auth);
 
 } /* end exit_keys() */
 
@@ -382,7 +388,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 				  struct task_struct *context)
 {
 	struct request_key_auth *rka;
-	key_ref_t key_ref, ret, err, instkey_ref;
+	key_ref_t key_ref, ret, err;
 
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
@@ -461,30 +467,12 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			err = key_ref;
 			break;
 		}
-
-		/* if this process has a session keyring and that has an
-		 * instantiation authorisation key in the bottom level, then we
-		 * also search the keyrings of the process mentioned there */
-		if (context != current)
-			goto no_key;
-
-		rcu_read_lock();
-		instkey_ref = __keyring_search_one(
-			make_key_ref(rcu_dereference(
-					     context->signal->session_keyring),
-				     1),
-			&key_type_request_key_auth, NULL, 0);
-		rcu_read_unlock();
-
-		if (IS_ERR(instkey_ref))
-			goto no_key;
-
-		rka = key_ref_to_ptr(instkey_ref)->payload.data;
-
-		key_ref = search_process_keyrings(type, description, match,
-						  rka->context);
-		key_ref_put(instkey_ref);
-
+	}
+	/* or search the user-session keyring */
+	else {
+		key_ref = keyring_search_aux(
+			make_key_ref(context->user->session_keyring, 1),
+			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -500,11 +488,21 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			break;
 		}
 	}
-	/* or search the user-session keyring */
-	else {
-		key_ref = keyring_search_aux(
-			make_key_ref(context->user->session_keyring, 1),
-			context, type, description, match);
+
+	/* if this process has an instantiation authorisation key, then we also
+	 * search the keyrings of the process mentioned there
+	 * - we don't permit access to request_key auth keys via this method
+	 */
+	if (context->request_key_auth &&
+	    context == current &&
+	    type != &key_type_request_key_auth &&
+	    key_validate(context->request_key_auth) == 0
+	    ) {
+		rka = context->request_key_auth->payload.data;
+
+		key_ref = search_process_keyrings(type, description, match,
+						  rka->context);
+
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -521,8 +519,6 @@ key_ref_t search_process_keyrings(struct key_type *type,
 		}
 	}
 
-
-no_key:
 	/* no key - decide on the error we're going to go for */
 	key_ref = ret ? ret : err;
 
@@ -628,6 +624,15 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		key = ERR_PTR(-EINVAL);
 		goto error;
 
+	case KEY_SPEC_REQKEY_AUTH_KEY:
+		key = context->request_key_auth;
+		if (!key)
+			goto error;
+
+		atomic_inc(&key->usage);
+		key_ref = make_key_ref(key, 1);
+		break;
+
 	default:
 		key_ref = ERR_PTR(-EINVAL);
 		if (id < 1)
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 5cc4bba70db6..f030a0ccbb93 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -29,28 +29,36 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq);
 /*****************************************************************************/
 /*
  * request userspace finish the construction of a key
- * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring> <info>"
+ * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>"
  */
-static int call_request_key(struct key *key,
-			    const char *op,
-			    const char *callout_info)
+static int call_sbin_request_key(struct key *key,
+				 struct key *authkey,
+				 const char *op)
 {
 	struct task_struct *tsk = current;
 	key_serial_t prkey, sskey;
-	struct key *session_keyring, *rkakey;
-	char *argv[10], *envp[3], uid_str[12], gid_str[12];
+	struct key *keyring;
+	char *argv[9], *envp[3], uid_str[12], gid_str[12];
 	char key_str[12], keyring_str[3][12];
+	char desc[20];
 	int ret, i;
 
-	kenter("{%d},%s,%s", key->serial, op, callout_info);
+	kenter("{%d},{%d},%s", key->serial, authkey->serial, op);
 
-	/* generate a new session keyring with an auth key in it */
-	session_keyring = request_key_auth_new(key, &rkakey);
-	if (IS_ERR(session_keyring)) {
-		ret = PTR_ERR(session_keyring);
-		goto error;
+	/* allocate a new session keyring */
+	sprintf(desc, "_req.%u", key->serial);
+
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto error_alloc;
 	}
 
+	/* attach the auth key to the session keyring */
+	ret = __key_link(keyring, authkey);
+	if (ret < 0)
+		goto error_link;
+
 	/* record the UID and GID */
 	sprintf(uid_str, "%d", current->fsuid);
 	sprintf(gid_str, "%d", current->fsgid);
@@ -95,22 +103,19 @@ static int call_request_key(struct key *key,
 	argv[i++] = keyring_str[0];
 	argv[i++] = keyring_str[1];
 	argv[i++] = keyring_str[2];
-	argv[i++] = (char *) callout_info;
 	argv[i] = NULL;
 
 	/* do it */
-	ret = call_usermodehelper_keys(argv[0], argv, envp, session_keyring, 1);
+	ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1);
 
-	/* dispose of the special keys */
-	key_revoke(rkakey);
-	key_put(rkakey);
-	key_put(session_keyring);
+error_link:
+	key_put(keyring);
 
- error:
+error_alloc:
 	kleave(" = %d", ret);
 	return ret;
 
-} /* end call_request_key() */
+} /* end call_sbin_request_key() */
 
 /*****************************************************************************/
 /*
@@ -122,9 +127,10 @@ static struct key *__request_key_construction(struct key_type *type,
 					      const char *description,
 					      const char *callout_info)
 {
+	request_key_actor_t actor;
 	struct key_construction cons;
 	struct timespec now;
-	struct key *key;
+	struct key *key, *authkey;
 	int ret, negated;
 
 	kenter("%s,%s,%s", type->name, description, callout_info);
@@ -143,8 +149,19 @@ static struct key *__request_key_construction(struct key_type *type,
 	/* we drop the construction sem here on behalf of the caller */
 	up_write(&key_construction_sem);
 
+	/* allocate an authorisation key */
+	authkey = request_key_auth_new(key, callout_info);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		authkey = NULL;
+		goto alloc_authkey_failed;
+	}
+
 	/* make the call */
-	ret = call_request_key(key, "create", callout_info);
+	actor = call_sbin_request_key;
+	if (type->request_key)
+		actor = type->request_key;
+	ret = actor(key, authkey, "create");
 	if (ret < 0)
 		goto request_failed;
 
@@ -153,22 +170,29 @@ static struct key *__request_key_construction(struct key_type *type,
 	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto request_failed;
 
+	key_revoke(authkey);
+	key_put(authkey);
+
 	down_write(&key_construction_sem);
 	list_del(&cons.link);
 	up_write(&key_construction_sem);
 
 	/* also give an error if the key was negatively instantiated */
- check_not_negative:
+check_not_negative:
 	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
 		key_put(key);
 		key = ERR_PTR(-ENOKEY);
 	}
 
- out:
+out:
 	kleave(" = %p", key);
 	return key;
 
- request_failed:
+request_failed:
+	key_revoke(authkey);
+	key_put(authkey);
+
+alloc_authkey_failed:
 	/* it wasn't instantiated
 	 * - remove from construction queue
 	 * - mark the key as dead
@@ -217,7 +241,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	key = ERR_PTR(ret);
 	goto out;
 
- alloc_failed:
+alloc_failed:
 	up_write(&key_construction_sem);
 	goto out;
 
@@ -464,35 +488,3 @@ struct key *request_key(struct key_type *type,
 } /* end request_key() */
 
 EXPORT_SYMBOL(request_key);
-
-/*****************************************************************************/
-/*
- * validate a key
- */
-int key_validate(struct key *key)
-{
-	struct timespec now;
-	int ret = 0;
-
-	if (key) {
-		/* check it's still accessible */
-		ret = -EKEYREVOKED;
-		if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
-		    test_bit(KEY_FLAG_DEAD, &key->flags))
-			goto error;
-
-		/* check it hasn't expired */
-		ret = 0;
-		if (key->expiry) {
-			now = current_kernel_time();
-			if (now.tv_sec >= key->expiry)
-				ret = -EKEYEXPIRED;
-		}
-	}
-
- error:
-	return ret;
-
-} /* end key_validate() */
-
-EXPORT_SYMBOL(key_validate);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index a8e4069d48cb..cce6ba6b0323 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -15,11 +15,13 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <linux/seq_file.h>
+#include <asm/uaccess.h>
 #include "internal.h"
 
 static int request_key_auth_instantiate(struct key *, const void *, size_t);
 static void request_key_auth_describe(const struct key *, struct seq_file *);
 static void request_key_auth_destroy(struct key *);
+static long request_key_auth_read(const struct key *, char __user *, size_t);
 
 /*
  * the request-key authorisation key type definition
@@ -30,51 +32,25 @@ struct key_type key_type_request_key_auth = {
 	.instantiate	= request_key_auth_instantiate,
 	.describe	= request_key_auth_describe,
 	.destroy	= request_key_auth_destroy,
+	.read		= request_key_auth_read,
 };
 
 /*****************************************************************************/
 /*
- * instantiate a request-key authorisation record
+ * instantiate a request-key authorisation key
  */
 static int request_key_auth_instantiate(struct key *key,
 					const void *data,
 					size_t datalen)
 {
-	struct request_key_auth *rka, *irka;
-	struct key *instkey;
-	int ret;
-
-	ret = -ENOMEM;
-	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
-	if (rka) {
-		/* see if the calling process is already servicing the key
-		 * request of another process */
-		instkey = key_get_instantiation_authkey(0);
-		if (!IS_ERR(instkey)) {
-			/* it is - use that instantiation context here too */
-			irka = instkey->payload.data;
-			rka->context = irka->context;
-			rka->pid = irka->pid;
-			key_put(instkey);
-		}
-		else {
-			/* it isn't - use this process as the context */
-			rka->context = current;
-			rka->pid = current->pid;
-		}
-
-		rka->target_key = key_get((struct key *) data);
-		key->payload.data = rka;
-		ret = 0;
-	}
-
-	return ret;
+	key->payload.data = (struct request_key_auth *) data;
+	return 0;
 
 } /* end request_key_auth_instantiate() */
 
 /*****************************************************************************/
 /*
- *
+ * reading a request-key authorisation key retrieves the callout information
  */
 static void request_key_auth_describe(const struct key *key,
 				      struct seq_file *m)
@@ -83,10 +59,38 @@ static void request_key_auth_describe(const struct key *key,
 
 	seq_puts(m, "key:");
 	seq_puts(m, key->description);
-	seq_printf(m, " pid:%d", rka->pid);
+	seq_printf(m, " pid:%d ci:%zu", rka->pid, strlen(rka->callout_info));
 
 } /* end request_key_auth_describe() */
 
+/*****************************************************************************/
+/*
+ * read the callout_info data
+ * - the key's semaphore is read-locked
+ */
+static long request_key_auth_read(const struct key *key,
+				  char __user *buffer, size_t buflen)
+{
+	struct request_key_auth *rka = key->payload.data;
+	size_t datalen;
+	long ret;
+
+	datalen = strlen(rka->callout_info);
+	ret = datalen;
+
+	/* we can return the data as is */
+	if (buffer && buflen > 0) {
+		if (buflen > datalen)
+			buflen = datalen;
+
+		if (copy_to_user(buffer, rka->callout_info, buflen) != 0)
+			ret = -EFAULT;
+	}
+
+	return ret;
+
+} /* end request_key_auth_read() */
+
 /*****************************************************************************/
 /*
  * destroy an instantiation authorisation token key
@@ -104,54 +108,87 @@ static void request_key_auth_destroy(struct key *key)
 
 /*****************************************************************************/
 /*
- * create a session keyring to be for the invokation of /sbin/request-key and
- * stick an authorisation token in it
+ * create an authorisation token for /sbin/request-key or whoever to gain
+ * access to the caller's security data
  */
-struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
+struct key *request_key_auth_new(struct key *target, const char *callout_info)
 {
-	struct key *keyring, *rkakey = NULL;
+	struct request_key_auth *rka, *irka;
+	struct key *authkey = NULL;
 	char desc[20];
 	int ret;
 
 	kenter("%d,", target->serial);
 
-	/* allocate a new session keyring */
-	sprintf(desc, "_req.%u", target->serial);
+	/* allocate a auth record */
+	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
+	if (!rka) {
+		kleave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
 
-	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
-	if (IS_ERR(keyring)) {
-		kleave("= %ld", PTR_ERR(keyring));
-		return keyring;
+	/* see if the calling process is already servicing the key request of
+	 * another process */
+	if (current->request_key_auth) {
+		/* it is - use that instantiation context here too */
+		irka = current->request_key_auth->payload.data;
+		rka->context = irka->context;
+		rka->pid = irka->pid;
 	}
+	else {
+		/* it isn't - use this process as the context */
+		rka->context = current;
+		rka->pid = current->pid;
+	}
+
+	rka->target_key = key_get(target);
+	rka->callout_info = callout_info;
 
 	/* allocate the auth key */
 	sprintf(desc, "%x", target->serial);
 
-	rkakey = key_alloc(&key_type_request_key_auth, desc,
-			   current->fsuid, current->fsgid,
-			   KEY_POS_VIEW | KEY_USR_VIEW, 1);
-	if (IS_ERR(rkakey)) {
-		key_put(keyring);
-		kleave("= %ld", PTR_ERR(rkakey));
-		return rkakey;
+	authkey = key_alloc(&key_type_request_key_auth, desc,
+			    current->fsuid, current->fsgid,
+			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
+			    KEY_USR_VIEW, 1);
+	if (IS_ERR(authkey)) {
+		ret = PTR_ERR(authkey);
+		goto error_alloc;
 	}
 
 	/* construct and attach to the keyring */
-	ret = key_instantiate_and_link(rkakey, target, 0, keyring, NULL);
-	if (ret < 0) {
-		key_revoke(rkakey);
-		key_put(rkakey);
-		key_put(keyring);
-		kleave("= %d", ret);
-		return ERR_PTR(ret);
-	}
+	ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL);
+	if (ret < 0)
+		goto error_inst;
 
-	*_rkakey = rkakey;
-	kleave(" = {%d} ({%d})", keyring->serial, rkakey->serial);
-	return keyring;
+	kleave(" = {%d})", authkey->serial);
+	return authkey;
+
+error_inst:
+	key_revoke(authkey);
+	key_put(authkey);
+error_alloc:
+	key_put(rka->target_key);
+	kfree(rka);
+	kleave("= %d", ret);
+	return ERR_PTR(ret);
 
 } /* end request_key_auth_new() */
 
+/*****************************************************************************/
+/*
+ * see if an authorisation key is associated with a particular key
+ */
+static int key_get_instantiation_authkey_match(const struct key *key,
+					       const void *_id)
+{
+	struct request_key_auth *rka = key->payload.data;
+	key_serial_t id = (key_serial_t)(unsigned long) _id;
+
+	return rka->target_key->serial == id;
+
+} /* end key_get_instantiation_authkey_match() */
+
 /*****************************************************************************/
 /*
  * get the authorisation key for instantiation of a specific key if attached to
@@ -162,22 +199,27 @@ struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
  */
 struct key *key_get_instantiation_authkey(key_serial_t target_id)
 {
-	struct task_struct *tsk = current;
-	struct key *instkey;
-
-	/* we must have our own personal session keyring */
-	if (!tsk->signal->session_keyring)
-		return ERR_PTR(-EACCES);
-
-	/* and it must contain a suitable request authorisation key
-	 * - lock RCU against session keyring changing
-	 */
-	rcu_read_lock();
+	struct key *authkey;
+	key_ref_t authkey_ref;
+
+	authkey_ref = search_process_keyrings(
+		&key_type_request_key_auth,
+		(void *) (unsigned long) target_id,
+		key_get_instantiation_authkey_match,
+		current);
+
+	if (IS_ERR(authkey_ref)) {
+		authkey = ERR_PTR(PTR_ERR(authkey_ref));
+		goto error;
+	}
 
-	instkey = keyring_search_instkey(
-		rcu_dereference(tsk->signal->session_keyring), target_id);
+	authkey = key_ref_to_ptr(authkey_ref);
+	if (test_bit(KEY_FLAG_REVOKED, &authkey->flags)) {
+		key_put(authkey);
+		authkey = ERR_PTR(-EKEYREVOKED);
+	}
 
-	rcu_read_unlock();
-	return instkey;
+error:
+	return authkey;
 
 } /* end key_get_instantiation_authkey() */
-- 
cgit v1.2.3


From 71fabd5e4835309b4feca6209122ce56c595c461 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Sun, 8 Jan 2006 01:02:48 -0800
Subject: [PATCH] sigaction should clear all signals on SIG_IGN, not just < 32

While rooting aroung in the signal code trying to understand how to fix the
SIG_IGN ploy (set sig handler to SIG_IGN and flood system with high speed
repeating timers) I came across what, I think, is a problem in sigaction()
in that when processing a SIG_IGN request it flushes signals from 1 to
SIGRTMIN and leaves the rest.  Attempt to fix this.

Signed-off-by: George Anzinger <george@mvista.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/signal.h | 17 +++++++++++++++++
 kernel/signal.c        | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index ea9eff16c4b7..b7d093520bb6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -94,6 +94,23 @@ static inline int sigfindinword(unsigned long word)
 
 #endif /* __HAVE_ARCH_SIG_BITOPS */
 
+static inline int sigisemptyset(sigset_t *set)
+{
+	extern void _NSIG_WORDS_is_unsupported_size(void);
+	switch (_NSIG_WORDS) {
+	case 4:
+		return (set->sig[3] | set->sig[2] |
+			set->sig[1] | set->sig[0]) == 0;
+	case 2:
+		return (set->sig[1] | set->sig[0]) == 0;
+	case 1:
+		return set->sig[0] == 0;
+	default:
+		_NSIG_WORDS_is_unsupported_size();
+		return 0;
+	}
+}
+
 #define sigmask(sig)	(1UL << ((sig) - 1))
 
 #ifndef __HAVE_ARCH_SIG_SETOPS
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b6fda5e87f1..e20724af9b36 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -620,6 +620,33 @@ void signal_wake_up(struct task_struct *t, int resume)
 		kick_process(t);
 }
 
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ *
+ * This version takes a sigset mask and looks at all signals,
+ * not just those in the first mask word.
+ */
+static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
+{
+	struct sigqueue *q, *n;
+	sigset_t m;
+
+	sigandsets(&m, mask, &s->signal);
+	if (sigisemptyset(&m))
+		return 0;
+
+	signandsets(&s->signal, &s->signal, mask);
+	list_for_each_entry_safe(q, n, &s->list, list) {
+		if (sigismember(mask, q->info.si_signo)) {
+			list_del_init(&q->list);
+			__sigqueue_free(q);
+		}
+	}
+	return 1;
+}
 /*
  * Remove signals in mask from the pending set and queue.
  * Returns 1 if any signals were found.
@@ -2408,6 +2435,7 @@ int
 do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct k_sigaction *k;
+	sigset_t mask;
 
 	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
 		return -EINVAL;
@@ -2455,9 +2483,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 			*k = *act;
 			sigdelsetmask(&k->sa.sa_mask,
 				      sigmask(SIGKILL) | sigmask(SIGSTOP));
-			rm_from_queue(sigmask(sig), &t->signal->shared_pending);
+			sigemptyset(&mask);
+			sigaddset(&mask, sig);
+			rm_from_queue_full(&mask, &t->signal->shared_pending);
 			do {
-				rm_from_queue(sigmask(sig), &t->pending);
+				rm_from_queue_full(&mask, &t->pending);
 				recalc_sigpending_tsk(t);
 				t = next_thread(t);
 			} while (t != current);
-- 
cgit v1.2.3


From a885c8c4316e1c1d2d2c8755da3f3d14f852528d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 8 Jan 2006 01:02:50 -0800
Subject: [PATCH] Add block_device_operations.getgeo block device method

HDIO_GETGEO is implemented in most block drivers, and all of them have to
duplicate the code to copy the structure to userspace, as well as getting
the start sector.  This patch moves that to common code [1] and adds a
->getgeo method to fill out the raw kernel hd_geometry structure.  For many
drivers this means ->ioctl can go away now.

[1] the s390 block drivers are odd in this respect.  xpram sets ->start
    to 4 always which seems more than odd, and the dasd driver shifts
    the start offset around, probably because of it's non-standard
    sector size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@suse.de>
Cc: <mike.miller@hp.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo Giarrusso <blaisorblade@yahoo.it>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/ubd_kern.c      | 21 +++++++++--------
 block/ioctl.c                   | 22 ++++++++++++++++++
 drivers/acorn/block/mfmhd.c     | 36 ++++++-----------------------
 drivers/block/DAC960.c          | 35 ++++++++++++-----------------
 drivers/block/acsi.c            | 26 +++++++++++----------
 drivers/block/amiflop.c         | 23 +++++++++----------
 drivers/block/aoe/aoeblk.c      | 26 ++++++---------------
 drivers/block/cciss.c           | 31 ++++++++++++-------------
 drivers/block/cpqarray.c        | 36 +++++++++++++++--------------
 drivers/block/floppy.c          | 35 +++++++++++++++--------------
 drivers/block/paride/pd.c       | 34 +++++++++++++++-------------
 drivers/block/paride/pf.c       | 50 +++++++++++++++++++++--------------------
 drivers/block/ps2esdi.c         | 25 +++++++--------------
 drivers/block/sx8.c             | 35 +++++++----------------------
 drivers/block/umem.c            | 41 +++++++++++++--------------------
 drivers/block/viodasd.c         | 44 ++++++++----------------------------
 drivers/block/xd.c              | 25 +++++++++++----------
 drivers/ide/ide-disk.c          | 12 ++++++++++
 drivers/ide/ide-floppy.c        | 12 ++++++++++
 drivers/ide/ide.c               | 13 -----------
 drivers/ide/legacy/hd.c         | 24 +++++++-------------
 drivers/md/md.c                 | 30 +++++++++----------------
 drivers/message/i2o/i2o_block.c | 18 +++++++--------
 drivers/mmc/mmc_block.c         | 25 +++++----------------
 drivers/mtd/mtd_blkdevs.c       | 25 ++++++++-------------
 drivers/s390/block/dasd.c       | 23 +++++++++++++++++++
 drivers/s390/block/dasd_ioctl.c | 28 -----------------------
 drivers/s390/block/xpram.c      | 18 ++++++---------
 drivers/scsi/sd.c               | 21 +++++------------
 include/linux/fs.h              |  2 ++
 30 files changed, 340 insertions(+), 456 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 73f9652b2ee9..3a93c6f772fa 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -117,6 +117,7 @@ static int ubd_open(struct inode * inode, struct file * filp);
 static int ubd_release(struct inode * inode, struct file * file);
 static int ubd_ioctl(struct inode * inode, struct file * file,
 		     unsigned int cmd, unsigned long arg);
+static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 #define MAX_DEV (8)
 
@@ -125,6 +126,7 @@ static struct block_device_operations ubd_blops = {
         .open		= ubd_open,
         .release	= ubd_release,
         .ioctl		= ubd_ioctl,
+	.getgeo		= ubd_getgeo,
 };
 
 /* Protected by the queue_lock */
@@ -1058,6 +1060,16 @@ static void do_ubd_request(request_queue_t *q)
 	}
 }
 
+static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ubd *dev = bdev->bd_disk->private_data;
+
+	geo->heads = 128;
+	geo->sectors = 32;
+	geo->cylinders = dev->size / (128 * 32 * 512);
+	return 0;
+}
+
 static int ubd_ioctl(struct inode * inode, struct file * file,
 		     unsigned int cmd, unsigned long arg)
 {
@@ -1070,16 +1082,7 @@ static int ubd_ioctl(struct inode * inode, struct file * file,
 	};
 
 	switch (cmd) {
-	        struct hd_geometry g;
 		struct cdrom_volctrl volume;
-	case HDIO_GETGEO:
-		if(!loc) return(-EINVAL);
-		g.heads = 128;
-		g.sectors = 32;
-		g.cylinders = dev->size / (128 * 32 * 512);
-		g.start = get_start_sect(inode->i_bdev);
-		return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0);
-
 	case HDIO_GET_IDENTITY:
 		ubd_id.cyls = dev->size / (128 * 32 * 512);
 		if(copy_to_user((char __user *) arg, (char *) &ubd_id,
diff --git a/block/ioctl.c b/block/ioctl.c
index 6e278474f9a8..82030e1dfd63 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,6 +1,7 @@
 #include <linux/sched.h>		/* for capable() */
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
+#include <linux/hdreg.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
@@ -245,6 +246,27 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		set_device_ro(bdev, n);
 		unlock_kernel();
 		return 0;
+	case HDIO_GETGEO: {
+		struct hd_geometry geo;
+
+		if (!arg)
+			return -EINVAL;
+		if (!disk->fops->getgeo)
+			return -ENOTTY;
+
+		/*
+		 * We need to set the startsect first, the driver may
+		 * want to override it.
+		 */
+		geo.start = get_start_sect(bdev);
+		ret = disk->fops->getgeo(bdev, &geo);
+		if (ret)
+			return ret;
+		if (copy_to_user((struct hd_geometry __user *)arg, &geo,
+					sizeof(geo)))
+			return -EFAULT;
+		return 0;
+	}
 	}
 
 	lock_kernel();
diff --git a/drivers/acorn/block/mfmhd.c b/drivers/acorn/block/mfmhd.c
index 4b65f74d66b1..ce074f6f3369 100644
--- a/drivers/acorn/block/mfmhd.c
+++ b/drivers/acorn/block/mfmhd.c
@@ -129,19 +129,6 @@ static DEFINE_SPINLOCK(mfm_lock);
 #define MAJOR_NR	MFM_ACORN_MAJOR
 #define QUEUE (mfm_queue)
 #define CURRENT elv_next_request(mfm_queue)
-/*
- * This sort of stuff should be in a header file shared with ide.c, hd.c, xd.c etc
- */
-#ifndef HDIO_GETGEO
-#define HDIO_GETGEO 0x301
-struct hd_geometry {
-	unsigned char heads;
-	unsigned char sectors;
-	unsigned short cylinders;
-	unsigned long start;
-};
-#endif
-
 
 /*
  * Configuration section
@@ -1153,22 +1140,13 @@ static int mfm_initdrives(void)
  * The 'front' end of the mfm driver follows...
  */
 
-static int mfm_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long arg)
+static int mfm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct mfm_info *p = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry *geo = (struct hd_geometry *) arg;
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	if (!arg)
-		return -EINVAL;
-	if (put_user (p->heads, &geo->heads))
-		return -EFAULT;
-	if (put_user (p->sectors, &geo->sectors))
-		return -EFAULT;
-	if (put_user (p->cylinders, &geo->cylinders))
-		return -EFAULT;
-	if (put_user (get_start_sect(inode->i_bdev), &geo->start))
-		return -EFAULT;
+	struct mfm_info *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->heads;
+	geo->sectors = p->sectors;
+	geo->cylinders = p->cylinders;
 	return 0;
 }
 
@@ -1219,7 +1197,7 @@ void xd_set_geometry(struct block_device *bdev, unsigned char secsptrack,
 static struct block_device_operations mfm_fops =
 {
 	.owner		= THIS_MODULE,
-	.ioctl		= mfm_ioctl,
+	.getgeo		= mfm_getgeo,
 };
 
 /*
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 21097a39a057..179c68a3cef3 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -92,34 +92,28 @@ static int DAC960_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int DAC960_ioctl(struct inode *inode, struct file *file,
-			unsigned int cmd, unsigned long arg)
+static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct gendisk *disk = inode->i_bdev->bd_disk;
+	struct gendisk *disk = bdev->bd_disk;
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
-	struct hd_geometry g;
-	struct hd_geometry __user *loc = (struct hd_geometry __user *)arg;
-
-	if (cmd != HDIO_GETGEO || !loc)
-		return -EINVAL;
 
 	if (p->FirmwareType == DAC960_V1_Controller) {
-		g.heads = p->V1.GeometryTranslationHeads;
-		g.sectors = p->V1.GeometryTranslationSectors;
-		g.cylinders = p->V1.LogicalDriveInformation[drive_nr].
-			LogicalDriveSize / (g.heads * g.sectors);
+		geo->heads = p->V1.GeometryTranslationHeads;
+		geo->sectors = p->V1.GeometryTranslationSectors;
+		geo->cylinders = p->V1.LogicalDriveInformation[drive_nr].
+			LogicalDriveSize / (geo->heads * geo->sectors);
 	} else {
 		DAC960_V2_LogicalDeviceInfo_T *i =
 			p->V2.LogicalDeviceInformation[drive_nr];
 		switch (i->DriveGeometry) {
 		case DAC960_V2_Geometry_128_32:
-			g.heads = 128;
-			g.sectors = 32;
+			geo->heads = 128;
+			geo->sectors = 32;
 			break;
 		case DAC960_V2_Geometry_255_63:
-			g.heads = 255;
-			g.sectors = 63;
+			geo->heads = 255;
+			geo->sectors = 63;
 			break;
 		default:
 			DAC960_Error("Illegal Logical Device Geometry %d\n",
@@ -127,12 +121,11 @@ static int DAC960_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		}
 
-		g.cylinders = i->ConfigurableDeviceSize / (g.heads * g.sectors);
+		geo->cylinders = i->ConfigurableDeviceSize /
+			(geo->heads * geo->sectors);
 	}
 	
-	g.start = get_start_sect(inode->i_bdev);
-
-	return copy_to_user(loc, &g, sizeof g) ? -EFAULT : 0; 
+	return 0;
 }
 
 static int DAC960_media_changed(struct gendisk *disk)
@@ -157,7 +150,7 @@ static int DAC960_revalidate_disk(struct gendisk *disk)
 static struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
 	.open			= DAC960_open,
-	.ioctl			= DAC960_ioctl,
+	.getgeo			= DAC960_getgeo,
 	.media_changed		= DAC960_media_changed,
 	.revalidate_disk	= DAC960_revalidate_disk,
 };
diff --git a/drivers/block/acsi.c b/drivers/block/acsi.c
index 5d2d649f7e8d..196c0ec9cd54 100644
--- a/drivers/block/acsi.c
+++ b/drivers/block/acsi.c
@@ -1079,6 +1079,19 @@ static void redo_acsi_request( void )
  *
  ***********************************************************************/
 
+static int acsi_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct acsi_info_struct *aip = bdev->bd_disk->private_data;
+
+	/*
+	 * Just fake some geometry here, it's nonsense anyway
+	 * To make it easy, use Adaptec's usual 64/32 mapping
+	 */
+	geo->heads = 64;
+	geo->sectors = 32;
+	geo->cylinders = aip->size >> 11;
+	return 0;
+}
 
 static int acsi_ioctl( struct inode *inode, struct file *file,
 					   unsigned int cmd, unsigned long arg )
@@ -1086,18 +1099,6 @@ static int acsi_ioctl( struct inode *inode, struct file *file,
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct acsi_info_struct *aip = disk->private_data;
 	switch (cmd) {
-	  case HDIO_GETGEO:
-		/* HDIO_GETGEO is supported more for getting the partition's
-		 * start sector... */
-	  { struct hd_geometry *geo = (struct hd_geometry *)arg;
-	    /* just fake some geometry here, it's nonsense anyway; to make it
-		 * easy, use Adaptec's usual 64/32 mapping */
-	    put_user( 64, &geo->heads );
-	    put_user( 32, &geo->sectors );
-	    put_user( aip->size >> 11, &geo->cylinders );
-		put_user(get_start_sect(inode->i_bdev), &geo->start);
-		return 0;
-	  }
 	  case SCSI_IOCTL_GET_IDLUN:
 		/* SCSI compatible GET_IDLUN call to get target's ID and LUN number */
 		put_user( aip->target | (aip->lun << 8),
@@ -1592,6 +1593,7 @@ static struct block_device_operations acsi_fops = {
 	.open		= acsi_open,
 	.release	= acsi_release,
 	.ioctl		= acsi_ioctl,
+	.getgeo		= acsi_getgeo,
 	.media_changed	= acsi_media_change,
 	.revalidate_disk= acsi_revalidate,
 };
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 0acbfff8ad28..cb2a545e57dc 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1424,6 +1424,16 @@ static void do_fd_request(request_queue_t * q)
 	redo_fd_request();
 }
 
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = MINOR(bdev->bd_dev) & 3;
+
+	geo->heads = unit[drive].type->heads;
+	geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
+	geo->cylinders = unit[drive].type->tracks;
+	return 0;
+}
+
 static int fd_ioctl(struct inode *inode, struct file *filp,
 		    unsigned int cmd, unsigned long param)
 {
@@ -1431,18 +1441,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp,
 	static struct floppy_struct getprm;
 
 	switch(cmd){
-	case HDIO_GETGEO:
-	{
-		struct hd_geometry loc;
-		loc.heads = unit[drive].type->heads;
-		loc.sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
-		loc.cylinders = unit[drive].type->tracks;
-		loc.start = 0;
-		if (copy_to_user((void *)param, (void *)&loc,
-				 sizeof(struct hd_geometry)))
-			return -EFAULT;
-		break;
-	}
 	case FDFMTBEG:
 		get_fdc(drive);
 		if (fd_ref[drive] > 1) {
@@ -1652,6 +1650,7 @@ static struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
+	.getgeo		= fd_getgeo,
 	.media_changed	= amiga_floppy_change,
 };
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 0e97fcb9f3a1..c05ee8bffd97 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -169,38 +169,26 @@ aoeblk_make_request(request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
-/* This ioctl implementation expects userland to have the device node
- * permissions set so that only priviledged users can open an aoe
- * block device directly.
- */
 static int
-aoeblk_ioctl(struct inode *inode, struct file *filp, uint cmd, ulong arg)
+aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct aoedev *d;
-
-	if (!arg)
-		return -EINVAL;
+	struct aoedev *d = bdev->bd_disk->private_data;
 
-	d = inode->i_bdev->bd_disk->private_data;
 	if ((d->flags & DEVFL_UP) == 0) {
 		printk(KERN_ERR "aoe: aoeblk_ioctl: disk not up\n");
 		return -ENODEV;
 	}
 
-	if (cmd == HDIO_GETGEO) {
-		d->geo.start = get_start_sect(inode->i_bdev);
-		if (!copy_to_user((void __user *) arg, &d->geo, sizeof d->geo))
-			return 0;
-		return -EFAULT;
-	}
-	printk(KERN_INFO "aoe: aoeblk_ioctl: unknown ioctl %d\n", cmd);
-	return -EINVAL;
+	geo->cylinders = d->geo.cylinders;
+	geo->heads = d->geo.heads;
+	geo->sectors = d->geo.sectors;
+	return 0;
 }
 
 static struct block_device_operations aoe_bdops = {
 	.open = aoeblk_open,
 	.release = aoeblk_release,
-	.ioctl = aoeblk_ioctl,
+	.getgeo = aoeblk_getgeo,
 	.owner = THIS_MODULE,
 };
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2815b7a9150..bdb9c2717d40 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -153,6 +153,7 @@ static int cciss_open(struct inode *inode, struct file *filep);
 static int cciss_release(struct inode *inode, struct file *filep);
 static int cciss_ioctl(struct inode *inode, struct file *filep, 
 		unsigned int cmd, unsigned long arg);
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int revalidate_allvol(ctlr_info_t *host);
 static int cciss_revalidate(struct gendisk *disk);
@@ -194,6 +195,7 @@ static struct block_device_operations cciss_fops  = {
 	.open		= cciss_open, 
 	.release       	= cciss_release,
         .ioctl		= cciss_ioctl,
+        .getgeo		= cciss_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = cciss_compat_ioctl,
 #endif
@@ -633,6 +635,20 @@ static int cciss_ioctl32_big_passthru(struct file *file, unsigned cmd, unsigned
 	return err;
 }
 #endif
+
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	drive_info_struct *drv = get_drv(bdev->bd_disk);
+
+	if (!drv->cylinders)
+		return -ENXIO;
+
+	geo->heads = drv->heads;
+	geo->sectors = drv->sectors;
+	geo->cylinders = drv->cylinders;
+	return 0;
+}
+
 /*
  * ioctl 
  */
@@ -651,21 +667,6 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 #endif /* CCISS_DEBUG */ 
 	
 	switch(cmd) {
-	case HDIO_GETGEO:
-	{
-                struct hd_geometry driver_geo;
-                if (drv->cylinders) {
-                        driver_geo.heads = drv->heads;
-                        driver_geo.sectors = drv->sectors;
-                        driver_geo.cylinders = drv->cylinders;
-                } else
-			return -ENXIO;
-                driver_geo.start= get_start_sect(inode->i_bdev);
-                if (copy_to_user(argp, &driver_geo, sizeof(struct hd_geometry)))
-                        return  -EFAULT;
-                return(0);
-	}
-
 	case CCISS_GETPCIINFO:
 	{
 		cciss_pci_info_struct pciinfo;
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 9bddb6874873..9f0664dd3800 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -160,6 +160,7 @@ static int sendcmd(
 static int ida_open(struct inode *inode, struct file *filep);
 static int ida_release(struct inode *inode, struct file *filep);
 static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd, unsigned long arg);
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io);
 
 static void do_ida_request(request_queue_t *q);
@@ -199,6 +200,7 @@ static struct block_device_operations ida_fops  = {
 	.open		= ida_open,
 	.release	= ida_release,
 	.ioctl		= ida_ioctl,
+	.getgeo		= ida_getgeo,
 	.revalidate_disk= ida_revalidate,
 };
 
@@ -1124,6 +1126,23 @@ static void ida_timer(unsigned long tdata)
 	h->misc_tflags = 0;
 }
 
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	drv_info_t *drv = get_drv(bdev->bd_disk);
+
+	if (drv->cylinders) {
+		geo->heads = drv->heads;
+		geo->sectors = drv->sectors;
+		geo->cylinders = drv->cylinders;
+	} else {
+		geo->heads = 0xff;
+		geo->sectors = 0x3f;
+		geo->cylinders = drv->nr_blks / (0xff*0x3f);
+	}
+
+	return 0;
+}
+
 /*
  *  ida_ioctl does some miscellaneous stuff like reporting drive geometry,
  *  setting readahead and submitting commands from userspace to the controller.
@@ -1133,27 +1152,10 @@ static int ida_ioctl(struct inode *inode, struct file *filep, unsigned int cmd,
 	drv_info_t *drv = get_drv(inode->i_bdev->bd_disk);
 	ctlr_info_t *host = get_host(inode->i_bdev->bd_disk);
 	int error;
-	int diskinfo[4];
-	struct hd_geometry __user *geo = (struct hd_geometry __user *)arg;
 	ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg;
 	ida_ioctl_t *my_io;
 
 	switch(cmd) {
-	case HDIO_GETGEO:
-		if (drv->cylinders) {
-			diskinfo[0] = drv->heads;
-			diskinfo[1] = drv->sectors;
-			diskinfo[2] = drv->cylinders;
-		} else {
-			diskinfo[0] = 0xff;
-			diskinfo[1] = 0x3f;
-			diskinfo[2] = drv->nr_blks / (0xff*0x3f);
-		}
-		put_user(diskinfo[0], &geo->heads);
-		put_user(diskinfo[1], &geo->sectors);
-		put_user(diskinfo[2], &geo->cylinders);
-		put_user(get_start_sect(inode->i_bdev), &geo->start);
-		return 0;
 	case IDAGETDRVINFO:
 		if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t)))
 			return -EFAULT;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a5b857c5c4b8..b86613b21cf1 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3445,6 +3445,23 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
 	return 0;
 }
 
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	int type = ITYPE(drive_state[drive].fd_device);
+	struct floppy_struct *g;
+	int ret;
+
+	ret = get_floppy_geometry(drive, type, &g);
+	if (ret)
+		return ret;
+
+	geo->heads = g->head;
+	geo->sectors = g->sect;
+	geo->cylinders = g->track;
+	return 0;
+}
+
 static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		    unsigned long param)
 {
@@ -3474,23 +3491,6 @@ static int fd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		cmd = FDEJECT;
 	}
 
-	/* generic block device ioctls */
-	switch (cmd) {
-		/* the following have been inspired by the corresponding
-		 * code for other block devices. */
-		struct floppy_struct *g;
-	case HDIO_GETGEO:
-		{
-			struct hd_geometry loc;
-			ECALL(get_floppy_geometry(drive, type, &g));
-			loc.heads = g->head;
-			loc.sectors = g->sect;
-			loc.cylinders = g->track;
-			loc.start = 0;
-			return _COPYOUT(loc);
-		}
-	}
-
 	/* convert the old style command into a new style command */
 	if ((cmd & 0xff00) == 0x0200) {
 		ECALL(normalize_ioctl(&cmd, &size));
@@ -3938,6 +3938,7 @@ static struct block_device_operations floppy_fops = {
 	.open		= floppy_open,
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
+	.getgeo		= fd_getgeo,
 	.media_changed	= check_floppy_change,
 	.revalidate_disk = floppy_revalidate,
 };
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index fa49d62626ba..62d2464c12f2 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -747,32 +747,33 @@ static int pd_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct pd_unit *disk = bdev->bd_disk->private_data;
+
+	if (disk->alt_geom) {
+		geo->heads = PD_LOG_HEADS;
+		geo->sectors = PD_LOG_SECTS;
+		geo->cylinders = disk->capacity / (geo->heads * geo->sectors);
+	} else {
+		geo->heads = disk->heads;
+		geo->sectors = disk->sectors;
+		geo->cylinders = disk->cylinders;
+	}
+
+	return 0;
+}
+
 static int pd_ioctl(struct inode *inode, struct file *file,
 	 unsigned int cmd, unsigned long arg)
 {
 	struct pd_unit *disk = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *geo = (struct hd_geometry __user *) arg;
-	struct hd_geometry g;
 
 	switch (cmd) {
 	case CDROMEJECT:
 		if (disk->access == 1)
 			pd_special_command(disk, pd_eject);
 		return 0;
-	case HDIO_GETGEO:
-		if (disk->alt_geom) {
-			g.heads = PD_LOG_HEADS;
-			g.sectors = PD_LOG_SECTS;
-			g.cylinders = disk->capacity / (g.heads * g.sectors);
-		} else {
-			g.heads = disk->heads;
-			g.sectors = disk->sectors;
-			g.cylinders = disk->cylinders;
-		}
-		g.start = get_start_sect(inode->i_bdev);
-		if (copy_to_user(geo, &g, sizeof(struct hd_geometry)))
-			return -EFAULT;
-		return 0;
 	default:
 		return -EINVAL;
 	}
@@ -815,6 +816,7 @@ static struct block_device_operations pd_fops = {
 	.open		= pd_open,
 	.release	= pd_release,
 	.ioctl		= pd_ioctl,
+	.getgeo		= pd_getgeo,
 	.media_changed	= pd_check_media,
 	.revalidate_disk= pd_revalidate
 };
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e9746af29b9f..852b564e903a 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -205,6 +205,7 @@ static int pf_open(struct inode *inode, struct file *file);
 static void do_pf_request(request_queue_t * q);
 static int pf_ioctl(struct inode *inode, struct file *file,
 		    unsigned int cmd, unsigned long arg);
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int pf_release(struct inode *inode, struct file *file);
 
@@ -266,6 +267,7 @@ static struct block_device_operations pf_fops = {
 	.open		= pf_open,
 	.release	= pf_release,
 	.ioctl		= pf_ioctl,
+	.getgeo		= pf_getgeo,
 	.media_changed	= pf_check_media,
 };
 
@@ -313,34 +315,34 @@ static int pf_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct pf_unit *pf = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *geo = (struct hd_geometry __user *) arg;
-	struct hd_geometry g;
-	sector_t capacity;
-
-	if (cmd == CDROMEJECT) {
-		if (pf->access == 1) {
-			pf_eject(pf);
-			return 0;
-		}
-		return -EBUSY;
-	}
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	capacity = get_capacity(pf->disk);
+	struct pf_unit *pf = bdev->bd_disk->private_data;
+	sector_t capacity = get_capacity(pf->disk);
+
 	if (capacity < PF_FD_MAX) {
-		g.cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
-		g.heads = PF_FD_HDS;
-		g.sectors = PF_FD_SPT;
+		geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
+		geo->heads = PF_FD_HDS;
+		geo->sectors = PF_FD_SPT;
 	} else {
-		g.cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
-		g.heads = PF_HD_HDS;
-		g.sectors = PF_HD_SPT;
+		geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
+		geo->heads = PF_HD_HDS;
+		geo->sectors = PF_HD_SPT;
 	}
-	if (copy_to_user(geo, &g, sizeof(g)))
-		return -EFAULT;
+
+	return 0;
+}
+
+static int pf_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pf_unit *pf = inode->i_bdev->bd_disk->private_data;
+
+	if (cmd != CDROMEJECT)
+		return -EINVAL;
+
+	if (pf->access != 1)
+		return -EBUSY;
+	pf_eject(pf);
 	return 0;
 }
 
diff --git a/drivers/block/ps2esdi.c b/drivers/block/ps2esdi.c
index 29d1518be72a..43415f69839f 100644
--- a/drivers/block/ps2esdi.c
+++ b/drivers/block/ps2esdi.c
@@ -81,8 +81,7 @@ static void (*current_int_handler) (u_int) = NULL;
 static void ps2esdi_normal_interrupt_handler(u_int);
 static void ps2esdi_initial_reset_int_handler(u_int);
 static void ps2esdi_geometry_int_handler(u_int);
-static int ps2esdi_ioctl(struct inode *inode, struct file *file,
-			 u_int cmd, u_long arg);
+static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static int ps2esdi_read_status_words(int num_words, int max_words, u_short * buffer);
 
@@ -132,7 +131,7 @@ static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] =
 static struct block_device_operations ps2esdi_fops =
 {
 	.owner		= THIS_MODULE,
-	.ioctl		= ps2esdi_ioctl,
+	.getgeo		= ps2esdi_getgeo,
 };
 
 static struct gendisk *ps2esdi_gendisk[2];
@@ -1058,21 +1057,13 @@ static void dump_cmd_complete_status(u_int int_ret_code)
 
 }
 
-static int ps2esdi_ioctl(struct inode *inode,
-			 struct file *file, u_int cmd, u_long arg)
+static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct ps2esdi_i_struct *p = inode->i_bdev->bd_disk->private_data;
-	struct ps2esdi_geometry geom;
-
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	memset(&geom, 0, sizeof(geom));
-	geom.heads = p->head;
-	geom.sectors = p->sect;
-	geom.cylinders = p->cyl;
-	geom.start = get_start_sect(inode->i_bdev);
-	if (copy_to_user((void __user *)arg, &geom, sizeof(geom)))
-		return -EFAULT;
+	struct ps2esdi_i_struct *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->head;
+	geo->sectors = p->sect;
+	geo->cylinders = p->cyl;
 	return 0;
 }
 
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 9251f4131b53..c0cdc182a8b0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -407,8 +407,7 @@ struct carm_array_info {
 
 static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static void carm_remove_one (struct pci_dev *pdev);
-static int carm_bdev_ioctl(struct inode *ino, struct file *fil,
-			   unsigned int cmd, unsigned long arg);
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 static struct pci_device_id carm_pci_tbl[] = {
 	{ PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
@@ -426,7 +425,7 @@ static struct pci_driver carm_driver = {
 
 static struct block_device_operations carm_bd_ops = {
 	.owner		= THIS_MODULE,
-	.ioctl		= carm_bdev_ioctl,
+	.getgeo		= carm_bdev_getgeo,
 };
 
 static unsigned int carm_host_id;
@@ -434,32 +433,14 @@ static unsigned long carm_major_alloc;
 
 
-static int carm_bdev_ioctl(struct inode *ino, struct file *fil,
-			   unsigned int cmd, unsigned long arg)
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	void __user *usermem = (void __user *) arg;
-	struct carm_port *port = ino->i_bdev->bd_disk->private_data;
-	struct hd_geometry geom;
+	struct carm_port *port = bdev->bd_disk->private_data;
 
-	switch (cmd) {
-	case HDIO_GETGEO:
-		if (!usermem)
-			return -EINVAL;
-
-		geom.heads = (u8) port->dev_geom_head;
-		geom.sectors = (u8) port->dev_geom_sect;
-		geom.cylinders = port->dev_geom_cyl;
-		geom.start = get_start_sect(ino->i_bdev);
-
-		if (copy_to_user(usermem, &geom, sizeof(geom)))
-			return -EFAULT;
-		return 0;
-
-	default:
-		break;
-	}
-
-	return -EOPNOTSUPP;
+	geo->heads = (u8) port->dev_geom_head;
+	geo->sectors = (u8) port->dev_geom_sect;
+	geo->cylinders = port->dev_geom_cyl;
+	return 0;
 }
 
 static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE };
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 0f48301342da..15299e7a1ade 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -809,34 +809,23 @@ static int mm_revalidate(struct gendisk *disk)
 	set_capacity(disk, card->mm_size << 1);
 	return 0;
 }
-/*
------------------------------------------------------------------------------------
---                            mm_ioctl
------------------------------------------------------------------------------------
-*/
-static int mm_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
+
+static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	if (cmd == HDIO_GETGEO) {
-		struct cardinfo *card = i->i_bdev->bd_disk->private_data;
-		int size = card->mm_size * (1024 / MM_HARDSECT);
-		struct hd_geometry geo;
-		/*
-		 * get geometry: we have to fake one...  trim the size to a
-		 * multiple of 2048 (1M): tell we have 32 sectors, 64 heads,
-		 * whatever cylinders.
-		 */
-		geo.heads     = 64;
-		geo.sectors   = 32;
-		geo.start     = get_start_sect(i->i_bdev);
-		geo.cylinders = size / (geo.heads * geo.sectors);
-
-		if (copy_to_user((void __user *) arg, &geo, sizeof(geo)))
-			return -EFAULT;
-		return 0;
-	}
+	struct cardinfo *card = bdev->bd_disk->private_data;
+	int size = card->mm_size * (1024 / MM_HARDSECT);
 
-	return -EINVAL;
+	/*
+	 * get geometry: we have to fake one...  trim the size to a
+	 * multiple of 2048 (1M): tell we have 32 sectors, 64 heads,
+	 * whatever cylinders.
+	 */
+	geo->heads     = 64;
+	geo->sectors   = 32;
+	geo->cylinders = size / (geo->heads * geo->sectors);
+	return 0;
 }
+
 /*
 -----------------------------------------------------------------------------------
 --                                mm_check_change
@@ -855,7 +844,7 @@ static int mm_check_change(struct gendisk *disk)
 */
 static struct block_device_operations mm_fops = {
 	.owner		= THIS_MODULE,
-	.ioctl		= mm_ioctl,
+	.getgeo		= mm_getgeo,
 	.revalidate_disk= mm_revalidate,
 	.media_changed	= mm_check_change,
 };
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 063f0304a163..d1aaf31bd97e 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -247,43 +247,17 @@ static int viodasd_release(struct inode *ino, struct file *fil)
 
 /* External ioctl entry point.
  */
-static int viodasd_ioctl(struct inode *ino, struct file *fil,
-			 unsigned int cmd, unsigned long arg)
+static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	unsigned char sectors;
-	unsigned char heads;
-	unsigned short cylinders;
-	struct hd_geometry *geo;
-	struct gendisk *gendisk;
-	struct viodasd_device *d;
+	struct gendisk *disk = bdev->bd_disk;
+	struct viodasd_device *d = disk->private_data;
 
-	switch (cmd) {
-	case HDIO_GETGEO:
-		geo = (struct hd_geometry *)arg;
-		if (geo == NULL)
-			return -EINVAL;
-		if (!access_ok(VERIFY_WRITE, geo, sizeof(*geo)))
-			return -EFAULT;
-		gendisk = ino->i_bdev->bd_disk;
-		d = gendisk->private_data;
-		sectors = d->sectors;
-		if (sectors == 0)
-			sectors = 32;
-		heads = d->tracks;
-		if (heads == 0)
-			heads = 64;
-		cylinders = d->cylinders;
-		if (cylinders == 0)
-			cylinders = get_capacity(gendisk) / (sectors * heads);
-		if (__put_user(sectors, &geo->sectors) ||
-		    __put_user(heads, &geo->heads) ||
-		    __put_user(cylinders, &geo->cylinders) ||
-		    __put_user(get_start_sect(ino->i_bdev), &geo->start))
-			return -EFAULT;
-		return 0;
-	}
+	geo->sectors = d->sectors ? d->sectors : 0;
+	geo->heads = d->tracks ? d->tracks  : 64;
+	geo->cylinders = d->cylinders ? d->cylinders :
+		get_capacity(disk) / (geo->cylinders * geo->heads);
 
-	return -EINVAL;
+	return 0;
 }
 
 /*
@@ -293,7 +267,7 @@ static struct block_device_operations viodasd_fops = {
 	.owner = THIS_MODULE,
 	.open = viodasd_open,
 	.release = viodasd_release,
-	.ioctl = viodasd_ioctl,
+	.getgeo = viodasd_getgeo,
 };
 
 /*
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 68b6d7b154cf..97f5dab24b5a 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -128,9 +128,12 @@ static DEFINE_SPINLOCK(xd_lock);
 
 static struct gendisk *xd_gendisk[2];
 
+static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
 static struct block_device_operations xd_fops = {
 	.owner	= THIS_MODULE,
 	.ioctl	= xd_ioctl,
+	.getgeo = xd_getgeo,
 };
 static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int);
 static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors;
@@ -330,22 +333,20 @@ static void do_xd_request (request_queue_t * q)
 	}
 }
 
+static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	XD_INFO *p = bdev->bd_disk->private_data;
+
+	geo->heads = p->heads;
+	geo->sectors = p->sectors;
+	geo->cylinders = p->cylinders;
+	return 0;
+}
+
 /* xd_ioctl: handle device ioctl's */
 static int xd_ioctl (struct inode *inode,struct file *file,u_int cmd,u_long arg)
 {
-	XD_INFO *p = inode->i_bdev->bd_disk->private_data;
-
 	switch (cmd) {
-		case HDIO_GETGEO:
-		{
-			struct hd_geometry g;
-			struct hd_geometry __user *geom= (void __user *)arg;
-			g.heads = p->heads;
-			g.sectors = p->sectors;
-			g.cylinders = p->cylinders;
-			g.start = get_start_sect(inode->i_bdev);
-			return copy_to_user(geom, &g, sizeof(g)) ? -EFAULT : 0;
-		}
 		case HDIO_SET_DMA:
 			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
 			if (xdc_busy) return -EBUSY;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 4b441720b6ba..cab362ea0336 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1130,6 +1130,17 @@ static int idedisk_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk);
+	ide_drive_t *drive = idkp->drive;
+
+	geo->heads = drive->bios_head;
+	geo->sectors = drive->bios_sect;
+	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
+	return 0;
+}
+
 static int idedisk_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -1164,6 +1175,7 @@ static struct block_device_operations idedisk_ops = {
 	.open		= idedisk_open,
 	.release	= idedisk_release,
 	.ioctl		= idedisk_ioctl,
+	.getgeo		= idedisk_getgeo,
 	.media_changed	= idedisk_media_changed,
 	.revalidate_disk= idedisk_revalidate_disk
 };
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index fba3fffc2d66..5945f551aaaa 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -2031,6 +2031,17 @@ static int idefloppy_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int idefloppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct ide_floppy_obj *floppy = ide_floppy_g(bdev->bd_disk);
+	ide_drive_t *drive = floppy->drive;
+
+	geo->heads = drive->bios_head;
+	geo->sectors = drive->bios_sect;
+	geo->cylinders = (u16)drive->bios_cyl; /* truncate */
+	return 0;
+}
+
 static int idefloppy_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -2120,6 +2131,7 @@ static struct block_device_operations idefloppy_ops = {
 	.open		= idefloppy_open,
 	.release	= idefloppy_release,
 	.ioctl		= idefloppy_ioctl,
+	.getgeo		= idefloppy_getgeo,
 	.media_changed	= idefloppy_media_changed,
 	.revalidate_disk= idefloppy_revalidate_disk
 };
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 4b524f6b3ecd..b069b13b75a7 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1278,19 +1278,6 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 	up(&ide_setting_sem);
 
 	switch (cmd) {
-		case HDIO_GETGEO:
-		{
-			struct hd_geometry geom;
-			if (!p || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL;
-			geom.heads = drive->bios_head;
-			geom.sectors = drive->bios_sect;
-			geom.cylinders = (u16)drive->bios_cyl; /* truncate */
-			geom.start = get_start_sect(bdev);
-			if (copy_to_user(p, &geom, sizeof(struct hd_geometry)))
-				return -EFAULT;
-			return 0;
-		}
-
 		case HDIO_OBSOLETE_IDENTITY:
 		case HDIO_GET_IDENTITY:
 			if (bdev != bdev->bd_contains)
diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index 242029c9c0ca..6439dec66881 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -658,22 +658,14 @@ static void do_hd_request (request_queue_t * q)
 	enable_irq(HD_IRQ);
 }
 
-static int hd_ioctl(struct inode * inode, struct file * file,
-	unsigned int cmd, unsigned long arg)
+static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct hd_i_struct *disk = inode->i_bdev->bd_disk->private_data;
-	struct hd_geometry __user *loc = (struct hd_geometry __user *) arg;
-	struct hd_geometry g; 
-
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
-	if (!loc)
-		return -EINVAL;
-	g.heads = disk->head;
-	g.sectors = disk->sect;
-	g.cylinders = disk->cyl;
-	g.start = get_start_sect(inode->i_bdev);
-	return copy_to_user(loc, &g, sizeof g) ? -EFAULT : 0; 
+	struct hd_i_struct *disk = bdev->bd_disk->private_data;
+
+	geo->heads = disk->head;
+	geo->sectors = disk->sect;
+	geo->cylinders = disk->cyl;
+	return 0;
 }
 
 /*
@@ -695,7 +687,7 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 
 static struct block_device_operations hd_fops = {
-	.ioctl =	hd_ioctl,
+	.getgeo =	hd_getgeo,
 };
 
 /*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1b76fb29fb70..e423a16ba3c9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3598,12 +3598,21 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 	return 0;
 }
 
+static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	mddev_t *mddev = bdev->bd_disk->private_data;
+
+	geo->heads = 2;
+	geo->sectors = 4;
+	geo->cylinders = get_capacity(mddev->gendisk) / 8;
+	return 0;
+}
+
 static int md_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
 	int err = 0;
 	void __user *argp = (void __user *)arg;
-	struct hd_geometry __user *loc = argp;
 	mddev_t *mddev = NULL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -3765,24 +3774,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * 4 sectors (with a BIG number of cylinders...). This drives
 	 * dosfs just mad... ;-)
 	 */
-		case HDIO_GETGEO:
-			if (!loc) {
-				err = -EINVAL;
-				goto abort_unlock;
-			}
-			err = put_user (2, (char __user *) &loc->heads);
-			if (err)
-				goto abort_unlock;
-			err = put_user (4, (char __user *) &loc->sectors);
-			if (err)
-				goto abort_unlock;
-			err = put_user(get_capacity(mddev->gendisk)/8,
-					(short __user *) &loc->cylinders);
-			if (err)
-				goto abort_unlock;
-			err = put_user (get_start_sect(inode->i_bdev),
-						(long __user *) &loc->start);
-			goto done_unlock;
 	}
 
 	/*
@@ -3911,6 +3902,7 @@ static struct block_device_operations md_fops =
 	.open		= md_open,
 	.release	= md_release,
 	.ioctl		= md_ioctl,
+	.getgeo		= md_getgeo,
 	.media_changed	= md_media_changed,
 	.revalidate_disk= md_revalidate,
 };
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 5b1febed3133..b09fb6307153 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -662,6 +662,13 @@ static int i2o_block_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	i2o_block_biosparam(get_capacity(bdev->bd_disk),
+			    &geo->cylinders, &geo->heads, &geo->sectors);
+	return 0;
+}
+
 /**
  *	i2o_block_ioctl - Issue device specific ioctl calls.
  *	@cmd: ioctl command
@@ -676,7 +683,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file,
 {
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	struct i2o_block_device *dev = disk->private_data;
-	void __user *argp = (void __user *)arg;
 
 	/* Anyone capable of this syscall can do *real bad* things */
 
@@ -684,15 +690,6 @@ static int i2o_block_ioctl(struct inode *inode, struct file *file,
 		return -EPERM;
 
 	switch (cmd) {
-	case HDIO_GETGEO:
-		{
-			struct hd_geometry g;
-			i2o_block_biosparam(get_capacity(disk),
-					    &g.cylinders, &g.heads, &g.sectors);
-			g.start = get_start_sect(inode->i_bdev);
-			return copy_to_user(argp, &g, sizeof(g)) ? -EFAULT : 0;
-		}
-
 	case BLKI2OGRSTRAT:
 		return put_user(dev->rcache, (int __user *)arg);
 	case BLKI2OGWSTRAT:
@@ -962,6 +959,7 @@ static struct block_device_operations i2o_block_fops = {
 	.open = i2o_block_open,
 	.release = i2o_block_release,
 	.ioctl = i2o_block_ioctl,
+	.getgeo = i2o_block_getgeo,
 	.media_changed = i2o_block_media_changed
 };
 
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index 198561d21710..d5f28981596b 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -113,31 +113,18 @@ static int mmc_blk_release(struct inode *inode, struct file *filp)
 }
 
 static int
-mmc_blk_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg)
+mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct block_device *bdev = inode->i_bdev;
-
-	if (cmd == HDIO_GETGEO) {
-		struct hd_geometry geo;
-
-		memset(&geo, 0, sizeof(struct hd_geometry));
-
-		geo.cylinders	= get_capacity(bdev->bd_disk) / (4 * 16);
-		geo.heads	= 4;
-		geo.sectors	= 16;
-		geo.start	= get_start_sect(bdev);
-
-		return copy_to_user((void __user *)arg, &geo, sizeof(geo))
-			? -EFAULT : 0;
-	}
-
-	return -ENOTTY;
+	geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16);
+	geo->heads = 4;
+	geo->sectors = 16;
+	return 0;
 }
 
 static struct block_device_operations mmc_bdops = {
 	.open			= mmc_blk_open,
 	.release		= mmc_blk_release,
-	.ioctl			= mmc_blk_ioctl,
+	.getgeo			= mmc_blk_getgeo,
 	.owner			= THIS_MODULE,
 };
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 339cb1218eaa..7f3ff500b68e 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -194,6 +194,14 @@ static int blktrans_release(struct inode *i, struct file *f)
 	return ret;
 }
 
+static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
+
+	if (dev->tr->getgeo)
+		return dev->tr->getgeo(dev, geo);
+	return -ENOTTY;
+}
 
 static int blktrans_ioctl(struct inode *inode, struct file *file,
 			      unsigned int cmd, unsigned long arg)
@@ -207,22 +215,6 @@ static int blktrans_ioctl(struct inode *inode, struct file *file,
 			return tr->flush(dev);
 		/* The core code did the work, we had nothing to do. */
 		return 0;
-
-	case HDIO_GETGEO:
-		if (tr->getgeo) {
-			struct hd_geometry g;
-			int ret;
-
-			memset(&g, 0, sizeof(g));
-			ret = tr->getgeo(dev, &g);
-			if (ret)
-				return ret;
-
-			g.start = get_start_sect(inode->i_bdev);
-			if (copy_to_user((void __user *)arg, &g, sizeof(g)))
-				return -EFAULT;
-			return 0;
-		} /* else */
 	default:
 		return -ENOTTY;
 	}
@@ -233,6 +225,7 @@ struct block_device_operations mtd_blktrans_ops = {
 	.open		= blktrans_open,
 	.release	= blktrans_release,
 	.ioctl		= blktrans_ioctl,
+	.getgeo		= blktrans_getgeo,
 };
 
 int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index f779f674dfa0..2472fa1a1be1 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -18,6 +18,7 @@
 #include <linux/major.h>
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
+#include <linux/hdreg.h>
 
 #include <asm/ccwdev.h>
 #include <asm/ebcdic.h>
@@ -1723,12 +1724,34 @@ dasd_release(struct inode *inp, struct file *filp)
 	return 0;
 }
 
+/*
+ * Return disk geometry.
+ */
+static int
+dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct dasd_device *device;
+
+	device = bdev->bd_disk->private_data;
+	if (!device)
+		return -ENODEV;
+
+	if (!device->discipline ||
+	    !device->discipline->fill_geometry)
+		return -EINVAL;
+
+	device->discipline->fill_geometry(device, geo);
+	geo->start = get_start_sect(bdev) >> device->s2b_shift;
+	return 0;
+}
+
 struct block_device_operations
 dasd_device_operations = {
 	.owner		= THIS_MODULE,
 	.open		= dasd_open,
 	.release	= dasd_release,
 	.ioctl		= dasd_ioctl,
+	.getgeo		= dasd_getgeo,
 };
 
 
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 044b75371990..8e4dcd58599e 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -485,33 +485,6 @@ dasd_ioctl_set_ro(struct block_device *bdev, int no, long args)
 	return rc;
 }
 
-/*
- * Return disk geometry.
- */
-static int
-dasd_ioctl_getgeo(struct block_device *bdev, int no, long args)
-{
-	struct hd_geometry geo = { 0, };
-	struct dasd_device *device;
-
-	device =  bdev->bd_disk->private_data;
-	if (device == NULL)
-		return -ENODEV;
-
-	if (device == NULL || device->discipline == NULL ||
-	    device->discipline->fill_geometry == NULL)
-		return -EINVAL;
-
-	geo = (struct hd_geometry) {};
-	device->discipline->fill_geometry(device, &geo);
-	geo.start = get_start_sect(bdev) >> device->s2b_shift;
-	if (copy_to_user((struct hd_geometry __user *) args, &geo,
-			 sizeof (struct hd_geometry)))
-		return -EFAULT;
-
-	return 0;
-}
-
 /*
  * List of static ioctls.
  */
@@ -528,7 +501,6 @@ static struct { int no; dasd_ioctl_fn_t fn; } dasd_ioctls[] =
 	{ BIODASDPRRST, dasd_ioctl_reset_profile },
 	{ BLKROSET, dasd_ioctl_set_ro },
 	{ DASDAPIVER, dasd_ioctl_api_version },
-	{ HDIO_GETGEO, dasd_ioctl_getgeo },
 	{ -1, NULL }
 };
 
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index bf3a67c3cc5e..54ecd548c318 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -328,31 +328,27 @@ fail:
 	return 0;
 }
 
-static int xpram_ioctl (struct inode *inode, struct file *filp,
-		 unsigned int cmd, unsigned long arg)
+static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	struct hd_geometry __user *geo;
 	unsigned long size;
-	if (cmd != HDIO_GETGEO)
-		return -EINVAL;
+
 	/*
 	 * get geometry: we have to fake one...  trim the size to a
 	 * multiple of 64 (32k): tell we have 16 sectors, 4 heads,
 	 * whatever cylinders. Tell also that data starts at sector. 4.
 	 */
-	geo = (struct hd_geometry __user *) arg;
 	size = (xpram_pages * 8) & ~0x3f;
-	put_user(size >> 6, &geo->cylinders);
-	put_user(4, &geo->heads);
-	put_user(16, &geo->sectors);
-	put_user(4, &geo->start);
+	geo->cylinders = size >> 6;
+	geo->heads = 4;
+	geo->sectors = 16;
+	geo->start = 4;
 	return 0;
 }
 
 static struct block_device_operations xpram_devops =
 {
 	.owner	= THIS_MODULE,
-	.ioctl	= xpram_ioctl,
+	.getgeo	= xpram_getgeo,
 };
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 32d4d8d7b9f3..4c5127ed379c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -527,7 +527,7 @@ static int sd_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user *loc)
+static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
 	struct scsi_device *sdp = sdkp->device;
@@ -545,15 +545,9 @@ static int sd_hdio_getgeo(struct block_device *bdev, struct hd_geometry __user *
 	else
 		scsicam_bios_param(bdev, sdkp->capacity, diskinfo);
 
-	if (put_user(diskinfo[0], &loc->heads))
-		return -EFAULT;
-	if (put_user(diskinfo[1], &loc->sectors))
-		return -EFAULT;
-	if (put_user(diskinfo[2], &loc->cylinders))
-		return -EFAULT;
-	if (put_user((unsigned)get_start_sect(bdev),
-	             (unsigned long __user *)&loc->start))
-		return -EFAULT;
+	geo->heads = diskinfo[0];
+	geo->sectors = diskinfo[1];
+	geo->cylinders = diskinfo[2];
 	return 0;
 }
 
@@ -593,12 +587,6 @@ static int sd_ioctl(struct inode * inode, struct file * filp,
 	if (!scsi_block_when_processing_errors(sdp) || !error)
 		return error;
 
-	if (cmd == HDIO_GETGEO) {
-		if (!arg)
-			return -EINVAL;
-		return sd_hdio_getgeo(bdev, p);
-	}
-
 	/*
 	 * Send SCSI addressing ioctls directly to mid level, send other
 	 * ioctls to block level and then onto mid level if they can't be
@@ -800,6 +788,7 @@ static struct block_device_operations sd_fops = {
 	.open			= sd_open,
 	.release		= sd_release,
 	.ioctl			= sd_ioctl,
+	.getgeo			= sd_getgeo,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl		= sd_compat_ioctl,
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 74c01aabd4ab..7f140480c6a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -224,6 +224,7 @@ extern int dir_notify_enable;
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
 
+struct hd_geometry;
 struct iovec;
 struct nameidata;
 struct kiocb;
@@ -962,6 +963,7 @@ struct block_device_operations {
 	int (*direct_access) (struct block_device *, sector_t, unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
+	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	struct module *owner;
 };
 
-- 
cgit v1.2.3


From bf066c7db775a04bd761f8ea206f5522d0cf40ff Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 8 Jan 2006 01:03:19 -0800
Subject: [PATCH] shared mounts: cleanup

Small cleanups in shared mounts code.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c        | 2 +-
 fs/pnode.c            | 2 +-
 include/linux/fs.h    | 2 +-
 include/linux/mount.h | 3 ++-
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index e5aa1eeb5748..3e8fb61ad597 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -451,7 +451,7 @@ EXPORT_SYMBOL(may_umount);
 void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		mnt = list_entry(head->next, struct vfsmount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
 		if (mnt->mnt_parent != mnt) {
diff --git a/fs/pnode.c b/fs/pnode.c
index aeeec8ba8dd2..f1871f773f64 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m,
 		struct vfsmount *next;
 		struct vfsmount *master = m->mnt_master;
 
-		if ( master == origin->mnt_master ) {
+		if (master == origin->mnt_master) {
 			next = next_peer(m);
 			return ((next == origin) ? NULL : next);
 		} else if (m->mnt_slave.next != &master->mnt_slave_list)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7f140480c6a8..a1e28f0895c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -103,11 +103,11 @@ extern int dir_notify_enable;
 #define MS_MOVE		8192
 #define MS_REC		16384
 #define MS_VERBOSE	32768
+#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
 #define MS_PRIVATE	(1<<18)	/* change to private */
 #define MS_SLAVE	(1<<19)	/* change to slave */
 #define MS_SHARED	(1<<20)	/* change to shared */
-#define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dd4e83eba933..b98a709f1794 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -22,7 +22,8 @@
 #define MNT_NOEXEC	0x04
 #define MNT_SHARED	0x10	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x20	/* if the vfsmount is a unbindable mount */
-#define MNT_PNODE_MASK	0x30	/* propogation flag mask */
+
+#define MNT_PNODE_MASK	(MNT_SHARED | MNT_UNBINDABLE)
 
 struct vfsmount {
 	struct list_head mnt_hash;
-- 
cgit v1.2.3


From 5160ee6fc891a9ca114be0e90fa6655647bb64b2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 8 Jan 2006 01:03:32 -0800
Subject: [PATCH] shrink dentry struct

Some long time ago, dentry struct was carefully tuned so that on 32 bits
UP, sizeof(struct dentry) was exactly 128, ie a power of 2, and a multiple
of memory cache lines.

Then RCU was added and dentry struct enlarged by two pointers, with nice
results for SMP, but not so good on UP, because breaking the above tuning
(128 + 8 = 136 bytes)

This patch reverts this unwanted side effect, by using an union (d_u),
where d_rcu and d_child are placed so that these two fields can share their
memory needs.

At the time d_free() is called (and d_rcu is really used), d_child is known
to be empty and not touched by the dentry freeing.

Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so
the previous content of d_child is not needed if said dentry was unhashed
but still accessed by a CPU because of RCU constraints)

As dentry cache easily contains millions of entries, a size reduction is
worth the extra complexity of the ugly C union.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Maneesh Soni <maneesh@in.ibm.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ian Kent <raven@themaw.net>
Cc: Paul Jackson <pj@sgi.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@epoch.ncsc.mil>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/inode.c     |  6 +++---
 fs/autofs4/autofs_i.h        |  2 +-
 fs/autofs4/expire.c          | 12 ++++++------
 fs/autofs4/inode.c           |  4 ++--
 fs/autofs4/root.c            |  3 ++-
 fs/coda/cache.c              |  2 +-
 fs/dcache.c                  | 34 +++++++++++++++++-----------------
 fs/libfs.c                   | 12 ++++++------
 fs/ncpfs/dir.c               |  2 +-
 fs/ncpfs/ncplib_kernel.h     |  4 ++--
 fs/smbfs/cache.c             |  4 ++--
 include/linux/dcache.h       |  9 +++++++--
 kernel/cpuset.c              |  4 ++--
 net/sunrpc/rpc_pipe.c        |  2 +-
 security/selinux/selinuxfs.c |  2 +-
 15 files changed, 54 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index c44bbedec817..4ddc453023a2 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -186,7 +186,7 @@ static void update_bus(struct dentry *bus)
 
 	down(&bus->d_inode->i_sem);
 
-	list_for_each_entry(dev, &bus->d_subdirs, d_child)
+	list_for_each_entry(dev, &bus->d_subdirs, d_u.d_child)
 		if (dev->d_inode)
 			update_dev(dev);
 
@@ -203,7 +203,7 @@ static void update_sb(struct super_block *sb)
 
 	down(&root->d_inode->i_sem);
 
-	list_for_each_entry(bus, &root->d_subdirs, d_child) {
+	list_for_each_entry(bus, &root->d_subdirs, d_u.d_child) {
 		if (bus->d_inode) {
 			switch (S_IFMT & bus->d_inode->i_mode) {
 			case S_IFDIR:
@@ -319,7 +319,7 @@ static int usbfs_empty (struct dentry *dentry)
 	spin_lock(&dcache_lock);
 
 	list_for_each(list, &dentry->d_subdirs) {
-		struct dentry *de = list_entry(list, struct dentry, d_child);
+		struct dentry *de = list_entry(list, struct dentry, d_u.d_child);
 		if (usbfs_positive(de)) {
 			spin_unlock(&dcache_lock);
 			return 0;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index fca83e28edcf..385bed09b0d8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry)
 	struct dentry *child;
 	int ret = 0;
 
-	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
 		if (simple_positive(child))
 			goto out;
 	ret = 1;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index feb6ac427d05..dc39589df165 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -105,7 +105,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if (!simple_positive(dentry)) {
@@ -138,7 +138,7 @@ resume:
 	}
 
 	if (this_parent != top) {
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -163,7 +163,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if (!simple_positive(dentry)) {
@@ -199,7 +199,7 @@ cont:
 	}
 
 	if (this_parent != parent) {
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb,
 	/* On exit from the loop expire is set to a dgot dentry
 	 * to expire or it's NULL */
 	while ( next != &root->d_subdirs ) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
 		if ( !simple_positive(dentry) ) {
@@ -302,7 +302,7 @@ next:
 			expired, (int)expired->d_name.len, expired->d_name.name);
 		spin_lock(&dcache_lock);
 		list_del(&expired->d_parent->d_subdirs);
-		list_add(&expired->d_parent->d_subdirs, &expired->d_child);
+		list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 		spin_unlock(&dcache_lock);
 		return expired;
 	}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 818b37be5153..2d3082854a29 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -91,7 +91,7 @@ repeat:
 	next = this_parent->d_subdirs.next;
 resume:
 	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_child);
+		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - don`t care */
 		if (!simple_positive(dentry)) {
@@ -117,7 +117,7 @@ resume:
 	if (this_parent != sbi->root) {
 		struct dentry *dentry = this_parent;
 
-		next = this_parent->d_child.next;
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		spin_unlock(&dcache_lock);
 		DPRINTK("parent dentry %p %.*s",
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a771ec66956..2241405ffc41 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -143,7 +143,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f
 			}
 
 			while(1) {
-				struct dentry *de = list_entry(list, struct dentry, d_child);
+				struct dentry *de = list_entry(list,
+						struct dentry, d_u.d_child);
 
 				if (!d_unhashed(de) && de->d_inode) {
 					spin_unlock(&dcache_lock);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 80072fd9b7fa..c607d923350a 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 	spin_lock(&dcache_lock);
 	list_for_each(child, &parent->d_subdirs)
 	{
-		de = list_entry(child, struct dentry, d_child);
+		de = list_entry(child, struct dentry, d_u.d_child);
 		/* don't know what to do with negative dentries */
 		if ( ! de->d_inode ) 
 			continue;
diff --git a/fs/dcache.c b/fs/dcache.c
index 17e439138681..1536f15c4d4c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = {
 
 static void d_callback(struct rcu_head *head)
 {
-	struct dentry * dentry = container_of(head, struct dentry, d_rcu);
+	struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
 
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
@@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
- 	call_rcu(&dentry->d_rcu, d_callback);
+ 	call_rcu(&dentry->d_u.d_rcu, d_callback);
 }
 
 /*
@@ -193,7 +193,7 @@ kill_it: {
   			list_del(&dentry->d_lru);
   			dentry_stat.nr_unused--;
   		}
-  		list_del(&dentry->d_child);
+  		list_del(&dentry->d_u.d_child);
 		dentry_stat.nr_dentry--;	/* For d_free, below */
 		/*drops the locks, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
@@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry)
 	struct dentry * parent;
 
 	__d_drop(dentry);
-	list_del(&dentry->d_child);
+	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -518,7 +518,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 		/* Have we found a mount point ? */
 		if (d_mountpoint(dentry))
@@ -532,7 +532,7 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
@@ -569,7 +569,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
 		if (!list_empty(&dentry->d_lru)) {
@@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found);
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		this_parent = this_parent->d_parent;
 #ifdef DCACHE_DEBUG
 printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n",
@@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		dentry->d_parent = dget(parent);
 		dentry->d_sb = parent->d_sb;
 	} else {
-		INIT_LIST_HEAD(&dentry->d_child);
+		INIT_LIST_HEAD(&dentry->d_u.d_child);
 	}
 
 	spin_lock(&dcache_lock);
 	if (parent)
-		list_add(&dentry->d_child, &parent->d_subdirs);
+		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	dentry_stat.nr_dentry++;
 	spin_unlock(&dcache_lock);
 
@@ -1310,8 +1310,8 @@ already_unhashed:
 	/* Unhash the target: dput() will then get rid of it */
 	__d_drop(target);
 
-	list_del(&dentry->d_child);
-	list_del(&target->d_child);
+	list_del(&dentry->d_u.d_child);
+	list_del(&target->d_u.d_child);
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
@@ -1322,15 +1322,15 @@ already_unhashed:
 	if (IS_ROOT(dentry)) {
 		dentry->d_parent = target->d_parent;
 		target->d_parent = target;
-		INIT_LIST_HEAD(&target->d_child);
+		INIT_LIST_HEAD(&target->d_u.d_child);
 	} else {
 		do_switch(dentry->d_parent, target->d_parent);
 
 		/* And add them back to the (new) parent lists */
-		list_add(&target->d_child, &target->d_parent->d_subdirs);
+		list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
 	}
 
-	list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
+	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 	spin_unlock(&target->d_lock);
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
@@ -1568,7 +1568,7 @@ repeat:
 resume:
 	while (next != &this_parent->d_subdirs) {
 		struct list_head *tmp = next;
-		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 		if (d_unhashed(dentry)||!dentry->d_inode)
 			continue;
@@ -1579,7 +1579,7 @@ resume:
 		atomic_dec(&dentry->d_count);
 	}
 	if (this_parent != root) {
-		next = this_parent->d_child.next; 
+		next = this_parent->d_u.d_child.next;
 		atomic_dec(&this_parent->d_count);
 		this_parent = this_parent->d_parent;
 		goto resume;
diff --git a/fs/libfs.c b/fs/libfs.c
index 58101dff2c66..9c50523382e7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -93,16 +93,16 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			loff_t n = file->f_pos - 2;
 
 			spin_lock(&dcache_lock);
-			list_del(&cursor->d_child);
+			list_del(&cursor->d_u.d_child);
 			p = file->f_dentry->d_subdirs.next;
 			while (n && p != &file->f_dentry->d_subdirs) {
 				struct dentry *next;
-				next = list_entry(p, struct dentry, d_child);
+				next = list_entry(p, struct dentry, d_u.d_child);
 				if (!d_unhashed(next) && next->d_inode)
 					n--;
 				p = p->next;
 			}
-			list_add_tail(&cursor->d_child, p);
+			list_add_tail(&cursor->d_u.d_child, p);
 			spin_unlock(&dcache_lock);
 		}
 	}
@@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct dentry *cursor = filp->private_data;
-	struct list_head *p, *q = &cursor->d_child;
+	struct list_head *p, *q = &cursor->d_u.d_child;
 	ino_t ino;
 	int i = filp->f_pos;
 
@@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			}
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
-				next = list_entry(p, struct dentry, d_child);
+				next = list_entry(p, struct dentry, d_u.d_child);
 				if (d_unhashed(next) || !next->d_inode)
 					continue;
 
@@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry)
 	int ret = 0;
 
 	spin_lock(&dcache_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_child)
+	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
 		if (simple_positive(child))
 			goto out;
 	ret = 1;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index a9f7a8ab1d59..cfd76f431dc0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_child);
+		dent = list_entry(next, struct dentry, d_u.d_child);
 		if ((unsigned long)dent->d_fsdata == fpos) {
 			if (dent->d_inode)
 				dget_locked(dent);
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 9e4dc30c2435..799e5c2bec55 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -196,7 +196,7 @@ ncp_renew_dentries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		if (dentry->d_fsdata == NULL)
 			ncp_age_dentry(server, dentry);
@@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 		dentry->d_fsdata = NULL;
 		ncp_age_dentry(server, dentry);
 		next = next->next;
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
index f3e6b81288ab..74b86d9725a6 100644
--- a/fs/smbfs/cache.c
+++ b/fs/smbfs/cache.c
@@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dentry = list_entry(next, struct dentry, d_child);
+		dentry = list_entry(next, struct dentry, d_u.d_child);
 		dentry->d_fsdata = NULL;
 		smb_age_dentry(server, dentry);
 		next = next->next;
@@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 	spin_lock(&dcache_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
-		dent = list_entry(next, struct dentry, d_child);
+		dent = list_entry(next, struct dentry, d_u.d_child);
 		if ((unsigned long)dent->d_fsdata == fpos) {
 			if (dent->d_inode)
 				dget_locked(dent);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 46a2ba617595..a3ed5e059d47 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -95,14 +95,19 @@ struct dentry {
 	struct qstr d_name;
 
 	struct list_head d_lru;		/* LRU list */
-	struct list_head d_child;	/* child of parent list */
+	/*
+	 * d_child and d_rcu can share memory
+	 */
+	union {
+		struct list_head d_child;	/* child of parent list */
+	 	struct rcu_head d_rcu;
+	} d_u;
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations *d_op;
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
- 	struct rcu_head d_rcu;
 	struct dcookie_struct *d_cookie; /* cookie, if any */
 	int d_mounted;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e04c2da9dadb..eab64e23bcae 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -331,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 	spin_lock(&dcache_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_child);
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 		list_del_init(node);
 		if (d->d_inode) {
 			d = dget_locked(d);
@@ -343,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 		}
 		node = dentry->d_subdirs.next;
 	}
-	list_del_init(&dentry->d_child);
+	list_del_init(&dentry->d_u.d_child);
 	spin_unlock(&dcache_lock);
 	remove_dir(dentry);
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 24cc23af9b95..e14c1cae7460 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -495,7 +495,7 @@ rpc_depopulate(struct dentry *parent)
 repeat:
 	spin_lock(&dcache_lock);
 	list_for_each_safe(pos, next, &parent->d_subdirs) {
-		dentry = list_entry(pos, struct dentry, d_child);
+		dentry = list_entry(pos, struct dentry, d_u.d_child);
 		spin_lock(&dentry->d_lock);
 		if (!d_unhashed(dentry)) {
 			dget_locked(dentry);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index e59da6398d44..b5fa02d17b1e 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -889,7 +889,7 @@ static void sel_remove_bools(struct dentry *de)
 	spin_lock(&dcache_lock);
 	node = de->d_subdirs.next;
 	while (node != &de->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_child);
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 		list_del_init(node);
 
 		if (d->d_inode) {
-- 
cgit v1.2.3


From e78c9a004aadebe22306c81d1a7f1d1278dc37f9 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 8 Jan 2006 01:03:39 -0800
Subject: [PATCH] fs: remove s_old_blocksize from struct super_block

This patch inlines the single user of struct super_block field
s_old_blocksize and removes the field.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/super.c         | 3 +--
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 5a347a4f673a..0a30e51692cf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -700,8 +700,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		s->s_old_blocksize = block_size(bdev);
-		sb_set_blocksize(s, s->s_old_blocksize);
+		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a1e28f0895c0..4c82219b0fae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -808,7 +808,6 @@ struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
 	unsigned long		s_blocksize;
-	unsigned long		s_old_blocksize;
 	unsigned char		s_blocksize_bits;
 	unsigned char		s_dirt;
 	unsigned long long	s_maxbytes;	/* Max file size */
-- 
cgit v1.2.3


From f867bac65419a98c9682f4409e087582d29ec5f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 8 Jan 2006 01:03:40 -0800
Subject: [PATCH] remove unused blkp field in percpu_data

I found that blkp field was not used in kernel tree.

As most of the times NR_CPUS is a power of two and kmalloc() memory blocks
too, this extra field basically doubles the memory space allocated in
__alloc_percpu() to store the 'struct percpu_data'

(for example, if NR_CPUS=8 on i386, kmalloc(4*8+4) returns a 64 bytes block
instead of a 32 bytes block after this patch)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/percpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 20317d88deba..cb9039a21f2a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -19,7 +19,6 @@
 
 struct percpu_data {
 	void *ptrs[NR_CPUS];
-	void *blkp;
 };
 
 /* 
-- 
cgit v1.2.3


From fd285bb54d8a3e99810090ae88cfe8ed77d1da25 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:07 -0800
Subject: [PATCH] Abandon gcc-2.95.x

There's one scsi driver which doesn't compile due to weird __VA_ARGS__ tricks
and the rather useful scsi/sd.c is currently getting an ICE.  None of the new
SAS code compiles, due to extensive use of anonymous unions.  The V4L guys are
very good at exploiting the gcc-2.95.x macro expansion bug (_why_ does each
driver need to implement its own debug macros?) and various people keep on
sneaking in anonymous unions, which are rather nice.

Plus anonymous unions are rather useful.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compiler-gcc2.h | 29 -----------------------------
 include/linux/compiler.h      |  2 --
 init/main.c                   |  7 +------
 3 files changed, 1 insertion(+), 37 deletions(-)
 delete mode 100644 include/linux/compiler-gcc2.h

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc2.h b/include/linux/compiler-gcc2.h
deleted file mode 100644
index ebed17660c5f..000000000000
--- a/include/linux/compiler-gcc2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Never include this file directly.  Include <linux/compiler.h> instead.  */
-
-/* These definitions are for GCC v2.x.  */
-
-/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
-   a mechanism by which the user can annotate likely branch directions and
-   expect the blocks to be reordered appropriately.  Define __builtin_expect
-   to nothing for earlier compilers.  */
-#include <linux/compiler-gcc.h>
-
-#if __GNUC_MINOR__ < 96
-# define __builtin_expect(x, expected_value) (x)
-#endif
-
-#define __attribute_used__	__attribute__((__unused__))
-
-/*
- * The attribute `pure' is not implemented in GCC versions earlier
- * than 2.96.
- */
-#if __GNUC_MINOR__ >= 96
-# define __attribute_pure__	__attribute__((pure))
-# define __attribute_const__	__attribute__((__const__))
-#endif
-
-/* GCC 2.95.x/2.96 recognize __va_copy, but not va_copy. Actually later GCC's
- * define both va_copy and __va_copy, but the latter may go away, so limit this
- * to this header */
-#define va_copy			__va_copy
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index d7378215b851..f23d3c6fc2c0 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -42,8 +42,6 @@ extern void __chk_io_ptr(void __iomem *);
 # include <linux/compiler-gcc4.h>
 #elif __GNUC__ == 3
 # include <linux/compiler-gcc3.h>
-#elif __GNUC__ == 2
-# include <linux/compiler-gcc2.h>
 #else
 # error Sorry, your compiler is too old/not recognized.
 #endif
diff --git a/init/main.c b/init/main.c
index afe5eb84ad52..8342c2890b16 100644
--- a/init/main.c
+++ b/init/main.c
@@ -58,11 +58,6 @@
  * This is one of the first .c files built. Error out early
  * if we have compiler trouble..
  */
-#if __GNUC__ == 2 && __GNUC_MINOR__ == 96
-#ifdef CONFIG_FRAME_POINTER
-#error This compiler cannot compile correctly with frame pointers enabled
-#endif
-#endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
@@ -74,7 +69,7 @@
  * To avoid associated bogus bug reports, we flatly refuse to compile
  * with a gcc that is known to be too old from the very beginning.
  */
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95)
+#if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
 #error Sorry, your GCC is too old. It builds incorrect kernels.
 #endif
 
-- 
cgit v1.2.3


From a1365647022eb05a5993f270a78e9bef3bf554eb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:09 -0800
Subject: [PATCH] remove gcc-2 checks

Remove various things which were checking for gcc-1.x and gcc-2.x compilers.

From: Adrian Bunk <bunk@stusta.de>

    Some documentation updates and removes some code paths for gcc < 3.2.

Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/Changes                 | 31 +++++--------------------------
 README                                |  7 ++-----
 arch/arm/kernel/asm-offsets.c         |  9 ++-------
 arch/arm26/kernel/asm-offsets.c       |  7 -------
 arch/i386/Kconfig                     |  4 ----
 arch/i386/Makefile                    |  5 +----
 arch/i386/Makefile.cpu                | 10 +++++-----
 arch/ia64/Makefile                    |  4 ----
 arch/ia64/kernel/head.S               |  2 +-
 arch/ia64/kernel/ia64_ksyms.c         |  2 +-
 arch/ia64/oprofile/backtrace.c        |  2 +-
 drivers/md/raid0.c                    |  6 ------
 drivers/media/video/v4l2-common.c     |  2 --
 fs/ocfs2/cluster/masklog.h            |  7 +++----
 fs/xfs/xfs_log.h                      |  8 +-------
 include/asm-alpha/compiler.h          |  2 --
 include/asm-alpha/processor.h         | 21 ---------------------
 include/asm-ia64/bug.h                |  6 +-----
 include/asm-ia64/spinlock.h           |  2 +-
 include/asm-sparc64/system.h          |  4 ----
 include/asm-um/rwsem.h                |  4 ----
 include/asm-v850/unistd.h             | 18 ------------------
 include/linux/byteorder/generic.h     |  2 +-
 include/linux/byteorder/swab.h        |  2 +-
 include/linux/byteorder/swabb.h       |  2 +-
 include/linux/compiler-gcc.h          |  9 +++++++++
 include/linux/compiler-gcc3.h         | 17 -----------------
 include/linux/compiler-gcc4.h         |  7 -------
 include/linux/kernel.h                |  2 --
 include/linux/seccomp.h               |  6 +-----
 include/linux/spinlock_types_up.h     | 14 --------------
 sound/isa/wavefront/wavefront_synth.c |  7 -------
 32 files changed, 37 insertions(+), 194 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/Changes b/Documentation/Changes
index 86b86399d61d..fe5ae0f55020 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -31,8 +31,6 @@ al espa
 Eine deutsche Version dieser Datei finden Sie unter
 <http://www.stefan-winter.de/Changes-2.4.0.txt>.
 
-Last updated: October 29th, 2002
-
 Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu).
 
 Current Minimal Requirements
@@ -48,7 +46,7 @@ necessary on all systems; obviously, if you don't have any ISDN
 hardware, for example, you probably needn't concern yourself with
 isdn4k-utils.
 
-o  Gnu C                  2.95.3                  # gcc --version
+o  Gnu C                  3.2                     # gcc --version
 o  Gnu make               3.79.1                  # make --version
 o  binutils               2.12                    # ld -v
 o  util-linux             2.10o                   # fdformat --version
@@ -74,26 +72,7 @@ GCC
 ---
 
 The gcc version requirements may vary depending on the type of CPU in your
-computer. The next paragraph applies to users of x86 CPUs, but not
-necessarily to users of other CPUs. Users of other CPUs should obtain
-information about their gcc version requirements from another source.
-
-The recommended compiler for the kernel is gcc 2.95.x (x >= 3), and it
-should be used when you need absolute stability. You may use gcc 3.0.x
-instead if you wish, although it may cause problems. Later versions of gcc 
-have not received much testing for Linux kernel compilation, and there are 
-almost certainly bugs (mainly, but not exclusively, in the kernel) that
-will need to be fixed in order to use these compilers. In any case, using
-pgcc instead of plain gcc is just asking for trouble.
-
-The Red Hat gcc 2.96 compiler subtree can also be used to build this tree.
-You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build
-the kernel correctly.
-
-In addition, please pay attention to compiler optimization.  Anything
-greater than -O2 may not be wise.  Similarly, if you choose to use gcc-2.95.x
-or derivatives, be sure not to use -fstrict-aliasing (which, depending on
-your version of gcc 2.95.x, may necessitate using -fno-strict-aliasing).
+computer.
 
 Make
 ----
@@ -322,9 +301,9 @@ Getting updated software
 Kernel compilation
 ******************
 
-gcc 2.95.3
-----------
-o  <ftp://ftp.gnu.org/gnu/gcc/gcc-2.95.3.tar.gz>
+gcc
+---
+o  <ftp://ftp.gnu.org/gnu/gcc/>
 
 Make
 ----
diff --git a/README b/README
index 61c4f7429233..cd5e2eb6213b 100644
--- a/README
+++ b/README
@@ -183,11 +183,8 @@ CONFIGURING the kernel:
 
 COMPILING the kernel:
 
- - Make sure you have gcc 2.95.3 available.
-   gcc 2.91.66 (egcs-1.1.2), and gcc 2.7.2.3 are known to miscompile
-   some parts of the kernel, and are *no longer supported*.
-   Also remember to upgrade your binutils package (for as/ld/nm and company)
-   if necessary. For more information, refer to Documentation/Changes.
+ - Make sure you have at least gcc 3.2 available.
+   For more information, refer to Documentation/Changes.
 
    Please note that you can still run a.out user programs with this kernel.
 
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 04d3082a7b94..0abbce8c70bc 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -23,20 +23,15 @@
 #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32
 #endif
 /*
- * GCC 2.95.1, 2.95.2: ignores register clobber list in asm().
  * GCC 3.0, 3.1: general bad code generation.
  * GCC 3.2.0: incorrect function argument offset calculation.
  * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c
  *            (http://gcc.gnu.org/PR8896) and incorrect structure
  *	      initialisation in fs/jffs2/erase.c
  */
-#if __GNUC__ < 2 || \
-   (__GNUC__ == 2 && __GNUC_MINOR__ < 95) || \
-   (__GNUC__ == 2 && __GNUC_MINOR__ == 95 && __GNUC_PATCHLEVEL__ != 0 && \
-					     __GNUC_PATCHLEVEL__ < 3) || \
-   (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 #error Your compiler is too buggy; it is known to miscompile kernels.
-#error    Known good compilers: 2.95.3, 2.95.4, 2.96, 3.3
+#error    Known good compilers: 3.3
 #endif
 
 /* Use marker if you need to separate the values later */
diff --git a/arch/arm26/kernel/asm-offsets.c b/arch/arm26/kernel/asm-offsets.c
index 4ccacaef94df..ac682d5fd039 100644
--- a/arch/arm26/kernel/asm-offsets.c
+++ b/arch/arm26/kernel/asm-offsets.c
@@ -25,13 +25,6 @@
 #if defined(__APCS_32__) && defined(CONFIG_CPU_26)
 #error Sorry, your compiler targets APCS-32 but this kernel requires APCS-26
 #endif
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95)
-#error Sorry, your compiler is known to miscompile kernels.  Only use gcc 2.95.3 and later.
-#endif
-#if __GNUC__ == 2 && __GNUC_MINOR__ == 95
-/* shame we can't detect the .1 or .2 releases */
-#warning GCC 2.95.2 and earlier miscompiles kernels.
-#endif
 
 /* Use marker if you need to separate the values later */
 
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 968fabd8723f..486449e9e710 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -630,10 +630,6 @@ config REGPARM
 	and passes the first three arguments of a function call in registers.
 	This will probably break binary only modules.
 
-	This feature is only enabled for gcc-3.0 and later - earlier compilers
-	generate incorrect output with certain kernel constructs when
-	-mregparm=3 is used.
-
 config SECCOMP
 	bool "Enable seccomp to safely compute untrusted bytecode"
 	depends on PROC_FS
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index d121ea18460f..b84119f9cc63 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -37,10 +37,7 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
 # CPU-specific tuning. Anything which can be shared with UML should go here.
 include $(srctree)/arch/i386/Makefile.cpu
 
-# -mregparm=3 works ok on gcc-3.0 and later
-#
-GCC_VERSION			:= $(call cc-version)
-cflags-$(CONFIG_REGPARM) 	+= $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
+cflags-$(CONFIG_REGPARM) 	+= -mregparm=3
 
 # Disable unit-at-a-time mode, it makes gcc use a lot more stack
 # due to the lack of sharing of stacklots.
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index 8e51456df23d..dcd936ef45db 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -1,7 +1,7 @@
 # CPU tuning section - shared with UML.
 # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
 
-#-mtune exists since gcc 3.4, and some -mcpu flavors didn't exist in gcc 2.95.
+#-mtune exists since gcc 3.4
 HAS_MTUNE	:= $(call cc-option-yn, -mtune=i386)
 ifeq ($(HAS_MTUNE),y)
 tune		= $(call cc-option,-mtune=$(1),)
@@ -14,7 +14,7 @@ cflags-$(CONFIG_M386)		+= -march=i386
 cflags-$(CONFIG_M486)		+= -march=i486
 cflags-$(CONFIG_M586)		+= -march=i586
 cflags-$(CONFIG_M586TSC)	+= -march=i586
-cflags-$(CONFIG_M586MMX)	+= $(call cc-option,-march=pentium-mmx,-march=i586)
+cflags-$(CONFIG_M586MMX)	+= -march=pentium-mmx
 cflags-$(CONFIG_M686)		+= -march=i686
 cflags-$(CONFIG_MPENTIUMII)	+= -march=i686 $(call tune,pentium2)
 cflags-$(CONFIG_MPENTIUMIII)	+= -march=i686 $(call tune,pentium3)
@@ -23,8 +23,8 @@ cflags-$(CONFIG_MPENTIUM4)	+= -march=i686 $(call tune,pentium4)
 cflags-$(CONFIG_MK6)		+= -march=k6
 # Please note, that patches that add -march=athlon-xp and friends are pointless.
 # They make zero difference whatsosever to performance at this time.
-cflags-$(CONFIG_MK7)		+= $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)
-cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4))
+cflags-$(CONFIG_MK7)		+= -march=athlon
+cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,-march=athlon)
 cflags-$(CONFIG_MCRUSOE)	+= -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MEFFICEON)	+= -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MWINCHIPC6)	+= $(call cc-option,-march=winchip-c6,-march=i586)
@@ -37,5 +37,5 @@ cflags-$(CONFIG_MVIAC3_2)	+= $(call cc-option,-march=c3-2,-march=i686)
 cflags-$(CONFIG_X86_ELAN)	+= -march=i486
 
 # Geode GX1 support
-cflags-$(CONFIG_MGEODEGX1)		+= $(call cc-option,-march=pentium-mmx,-march=i486)
+cflags-$(CONFIG_MGEODEGX1)	+= -march=pentium-mmx
 
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 67932ad53082..57b047c27e46 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -37,10 +37,6 @@ $(error Sorry, you need a newer version of the assember, one that is built from
 		ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz)
 endif
 
-ifneq ($(shell if [ $(GCC_VERSION) -lt 0300 ] ; then echo "bad"; fi ;),)
-$(error Sorry, your compiler is too old.  GCC v2.96 is known to generate bad code.)
-endif
-
 ifeq ($(GCC_VERSION),0304)
 	cflags-$(CONFIG_ITANIUM)	+= -mtune=merced
 	cflags-$(CONFIG_MCKINLEY)	+= -mtune=mckinley
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index bfe65b2e8621..fbc7ea35dd57 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1060,7 +1060,7 @@ SET_REG(b5);
 	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
 	 */
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 
 GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4)
 	.prologue
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index 5db9d3bcbbcb..e72de580ebbf 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL(unw_init_running);
 
 #ifdef ASM_SUPPORTED
 # ifdef CONFIG_SMP
-#  if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#  if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 /*
  * This is not a normal routine and we don't want a function descriptor for it, so we use
  * a fake declaration here.
diff --git a/arch/ia64/oprofile/backtrace.c b/arch/ia64/oprofile/backtrace.c
index b7dabbfb0d61..adb01566bd57 100644
--- a/arch/ia64/oprofile/backtrace.c
+++ b/arch/ia64/oprofile/backtrace.c
@@ -32,7 +32,7 @@ typedef struct
 	u64 *prev_pfs_loc;	/* state for WAR for old spinlock ool code */
 } ia64_backtrace_t;
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 /*
  * Returns non-zero if the PC is in the spinlock contention out-of-line code
  * with non-standard calling sequence (on older compilers).
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbca150202b..d03f99cf4b7d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -306,9 +306,6 @@ static int raid0_run (mddev_t *mddev)
 	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
 		(unsigned long long)conf->hash_spacing);
 	{
-#if __GNUC__ < 3
-		volatile
-#endif
 		sector_t s = mddev->array_size;
 		sector_t space = conf->hash_spacing;
 		int round;
@@ -439,9 +436,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
  
 
 	{
-#if __GNUC__ < 3
-		volatile
-#endif
 		sector_t x = block >> conf->preshift;
 		sector_div(x, (u32)conf->hash_spacing);
 		zone = conf->hash_table[x];
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index 597b8db35a13..62a7d636ef11 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -191,9 +191,7 @@ char *v4l2_type_names[] = {
 };
 
 char *v4l2_ioctl_names[256] = {
-#if __GNUC__ >= 3
 	[0 ... 255]                      = "UNKNOWN",
-#endif
 	[_IOC_NR(VIDIOC_QUERYCAP)]       = "VIDIOC_QUERYCAP",
 	[_IOC_NR(VIDIOC_RESERVED)]       = "VIDIOC_RESERVED",
 	[_IOC_NR(VIDIOC_ENUM_FMT)]       = "VIDIOC_ENUM_FMT",
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index f5ef5ea61a05..e8c56a3d9c64 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,11 +212,10 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 	mlog(ML_ENTRY, "ENTRY:\n");					\
 } while (0)
 
-/* We disable this for old compilers since they don't have support for
- * __builtin_types_compatible_p.
+/*
+ * We disable this for sparse.
  */
-#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
-    !defined(__CHECKER__)
+#if !defined(__CHECKER__)
 #define mlog_exit(st) do {						     \
 	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
 		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 158829ca56f6..f40d4391fcfc 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,13 +30,7 @@
  * By comparing each compnent, we don't have to worry about extra
  * endian issues in treating two 32 bit numbers as one 64 bit number
  */
-static
-#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96))
-__attribute__((unused))	/* gcc 2.95, 2.96 miscompile this when inlined */
-#else
-__inline__
-#endif
-xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
+static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 {
 	if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
 		return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
diff --git a/include/asm-alpha/compiler.h b/include/asm-alpha/compiler.h
index 0a4a8b40dfcd..00c6f57ad9a7 100644
--- a/include/asm-alpha/compiler.h
+++ b/include/asm-alpha/compiler.h
@@ -98,9 +98,7 @@
 #undef inline
 #undef __inline__
 #undef __inline
-#if __GNUC__ == 3 && __GNUC_MINOR__ >= 1 || __GNUC__ > 3
 #undef __always_inline
 #define __always_inline		inline __attribute__((always_inline))
-#endif
 
 #endif /* __ALPHA_COMPILER_H */
diff --git a/include/asm-alpha/processor.h b/include/asm-alpha/processor.h
index 059780a7d3d7..bb1a7a3abb8b 100644
--- a/include/asm-alpha/processor.h
+++ b/include/asm-alpha/processor.h
@@ -77,7 +77,6 @@ unsigned long get_wchan(struct task_struct *p);
 #define spin_lock_prefetch(lock)  	do { } while (0)
 #endif
 
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
 extern inline void prefetch(const void *ptr)  
 { 
 	__builtin_prefetch(ptr, 0, 3);
@@ -95,24 +94,4 @@ extern inline void spin_lock_prefetch(const void *ptr)
 }
 #endif
 
-#else
-extern inline void prefetch(const void *ptr)  
-{ 
-	__asm__ ("ldl $31,%0" : : "m"(*(char *)ptr)); 
-}
-
-extern inline void prefetchw(const void *ptr)  
-{
-	__asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); 
-}
-
-#ifdef CONFIG_SMP
-extern inline void spin_lock_prefetch(const void *ptr)  
-{
-	__asm__ ("ldq $31,%0" : : "m"(*(char *)ptr)); 
-}
-#endif
-
-#endif /* GCC 3.1 */
-
 #endif /* __ASM_ALPHA_PROCESSOR_H */
diff --git a/include/asm-ia64/bug.h b/include/asm-ia64/bug.h
index 3aa0a0a5474b..823616b5020b 100644
--- a/include/asm-ia64/bug.h
+++ b/include/asm-ia64/bug.h
@@ -2,11 +2,7 @@
 #define _ASM_IA64_BUG_H
 
 #ifdef CONFIG_BUG
-#if (__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
-# define ia64_abort()	__builtin_trap()
-#else
-# define ia64_abort()	(*(volatile int *) 0 = 0)
-#endif
+#define ia64_abort()	__builtin_trap()
 #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); ia64_abort(); } while (0)
 
 /* should this BUG be made generic? */
diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h
index 0c91a76c5ea3..9e83210dc312 100644
--- a/include/asm-ia64/spinlock.h
+++ b/include/asm-ia64/spinlock.h
@@ -34,7 +34,7 @@ __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
 	register volatile unsigned int *ptr asm ("r31") = &lock->lock;
 
-#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
 # ifdef CONFIG_ITANIUM
 	/* don't use brl on Itanium... */
 	asm volatile ("{\n\t"
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index b5417529f6f1..309f1466b6fa 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -193,11 +193,7 @@ do {						\
 	 * not preserve it's value.  Hairy, but it lets us remove 2 loads
 	 * and 2 stores in this critical code path.  -DaveM
 	 */
-#if __GNUC__ >= 3
 #define EXTRA_CLOBBER ,"%l1"
-#else
-#define EXTRA_CLOBBER
-#endif
 #define switch_to(prev, next, last)					\
 do {	if (test_thread_flag(TIF_PERFCTR)) {				\
 		unsigned long __tmp;					\
diff --git a/include/asm-um/rwsem.h b/include/asm-um/rwsem.h
index 661c0e54702b..b5fc449dc86b 100644
--- a/include/asm-um/rwsem.h
+++ b/include/asm-um/rwsem.h
@@ -1,10 +1,6 @@
 #ifndef __UM_RWSEM_H__
 #define __UM_RWSEM_H__
 
-#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
-#define __builtin_expect(exp,c) (exp)
-#endif
-
 #include "asm/arch/rwsem.h"
 
 #endif
diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h
index 5a86f8e976ec..82460a7bb233 100644
--- a/include/asm-v850/unistd.h
+++ b/include/asm-v850/unistd.h
@@ -241,9 +241,6 @@
 /* User programs sometimes end up including this header file
    (indirectly, via uClibc header files), so I'm a bit nervous just
    including <linux/compiler.h>.  */
-#if !defined(__builtin_expect) && __GNUC__ == 2 && __GNUC_MINOR__ < 96
-#define __builtin_expect(x, expected_value) (x)
-#endif
 
 #define __syscall_return(type, res)					      \
   do {									      \
@@ -346,20 +343,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e)			      \
   __syscall_return (type, __ret);					      \
 }
 
-#if __GNUC__ < 3
-/* In older versions of gcc, `asm' statements with more than 10
-   input/output arguments produce a fatal error.  To work around this
-   problem, we use two versions, one for gcc-3.x and one for earlier
-   versions of gcc (the `earlier gcc' version doesn't work with gcc-3.x
-   because gcc-3.x doesn't allow clobbers to also be input arguments).  */
-#define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
-  __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
-			: "=r" (ret), "=r" (syscall)			      \
-			: "1" (syscall),				      \
-			"r" (a), "r" (b), "r" (c), "r" (d),		      \
- 			"r" (e), "r" (f)				      \
-			: SYSCALL_CLOBBERS, SYSCALL_ARG4, SYSCALL_ARG5);
-#else /* __GNUC__ >= 3 */
 #define __SYSCALL6_TRAP(syscall, ret, a, b, c, d, e, f)			      \
   __asm__ __volatile__ ("trap " SYSCALL_LONG_TRAP			      \
 			: "=r" (ret), "=r" (syscall),			      \
@@ -368,7 +351,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e)			      \
 			"r" (a), "r" (b), "r" (c), "r" (d),		      \
 			"2" (e), "3" (f)				      \
 			: SYSCALL_CLOBBERS);
-#endif
 
 #define _syscall6(type, name, atype, a, btype, b, ctype, c, dtype, d, etype, e, ftype, f) \
 type name (atype a, btype b, ctype c, dtype d, etype e, ftype f)	      \
diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 04bd756efc67..e86e4a938373 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -156,7 +156,7 @@ extern __be32			htonl(__u32);
 extern __u16			ntohs(__be16);
 extern __be16			htons(__u16);
 
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 
 #define ___htonl(x) __cpu_to_be32(x)
 #define ___htons(x) __cpu_to_be16(x)
diff --git a/include/linux/byteorder/swab.h b/include/linux/byteorder/swab.h
index 2f1cb775125a..25f7f32883ec 100644
--- a/include/linux/byteorder/swab.h
+++ b/include/linux/byteorder/swab.h
@@ -110,7 +110,7 @@
 /*
  * Allow constant folding
  */
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 #  define __swab16(x) \
 (__builtin_constant_p((__u16)(x)) ? \
  ___swab16((x)) : \
diff --git a/include/linux/byteorder/swabb.h b/include/linux/byteorder/swabb.h
index d5f2a3205109..ae5e5f914bf4 100644
--- a/include/linux/byteorder/swabb.h
+++ b/include/linux/byteorder/swabb.h
@@ -77,7 +77,7 @@
 /*
  * Allow constant folding
  */
-#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__)
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
 #  define __swahw32(x) \
 (__builtin_constant_p((__u32)(x)) ? \
  ___swahw32((x)) : \
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 152734055403..2e05e1e6b0e6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -15,3 +15,12 @@
   ({ unsigned long __ptr;					\
     __asm__ ("" : "=g"(__ptr) : "0"(ptr));		\
     (typeof(ptr)) (__ptr + (off)); })
+
+
+#define inline		inline		__attribute__((always_inline))
+#define __inline__	__inline__	__attribute__((always_inline))
+#define __inline	__inline	__attribute__((always_inline))
+#define __deprecated			__attribute__((deprecated))
+#define  noinline			__attribute__((noinline))
+#define __attribute_pure__		__attribute__((pure))
+#define __attribute_const__		__attribute__((__const__))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index a6fa615afab5..4209082ee934 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -3,29 +3,12 @@
 /* These definitions are for GCC v3.x.  */
 #include <linux/compiler-gcc.h>
 
-#if __GNUC_MINOR__ >= 1
-# define inline		inline		__attribute__((always_inline))
-# define __inline__	__inline__	__attribute__((always_inline))
-# define __inline	__inline	__attribute__((always_inline))
-#endif
-
-#if __GNUC_MINOR__ > 0
-# define __deprecated		__attribute__((deprecated))
-#endif
-
 #if __GNUC_MINOR__ >= 3
 # define __attribute_used__	__attribute__((__used__))
 #else
 # define __attribute_used__	__attribute__((__unused__))
 #endif
 
-#define __attribute_pure__	__attribute__((pure))
-#define __attribute_const__	__attribute__((__const__))
-
-#if __GNUC_MINOR__ >= 1
-#define  noinline		__attribute__((noinline))
-#endif
-
 #if __GNUC_MINOR__ >= 4
 #define __must_check		__attribute__((warn_unused_result))
 #endif
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 53686c037a06..e913e9beaf69 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -3,14 +3,7 @@
 /* These definitions are for GCC v4.x.  */
 #include <linux/compiler-gcc.h>
 
-#define inline			inline		__attribute__((always_inline))
-#define __inline__		__inline__	__attribute__((always_inline))
-#define __inline		__inline	__attribute__((always_inline))
-#define __deprecated		__attribute__((deprecated))
 #define __attribute_used__	__attribute__((__used__))
-#define __attribute_pure__	__attribute__((pure))
-#define __attribute_const__	__attribute__((__const__))
-#define  noinline		__attribute__((noinline))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b1e407a4fbda..ca7ff8fdd090 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -316,8 +316,6 @@ extern int randomize_va_space;
 #endif
 
 /* Trap pasters of __FUNCTION__ at compile-time */
-#if __GNUC__ > 2 || __GNUC_MINOR__ >= 95
 #define __FUNCTION__ (__func__)
-#endif
 
 #endif
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index dc89116bb1ca..cd2773b29a64 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -26,11 +26,7 @@ static inline int has_secure_computing(struct thread_info *ti)
 
 #else /* CONFIG_SECCOMP */
 
-#if (__GNUC__ > 2)
-  typedef struct { } seccomp_t;
-#else
-  typedef struct { int gcc_is_buggy; } seccomp_t;
-#endif
+typedef struct { } seccomp_t;
 
 #define secure_computing(x) do { } while (0)
 /* static inline to preserve typechecking */
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index def2d173a8db..04135b0e198e 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -22,30 +22,16 @@ typedef struct {
 
 #else
 
-/*
- * All gcc 2.95 versions and early versions of 2.96 have a nasty bug
- * with empty initializers.
- */
-#if (__GNUC__ > 2)
 typedef struct { } raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { }
-#else
-typedef struct { int gcc_is_buggy; } raw_spinlock_t;
-#define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 }
-#endif
 
 #endif
 
-#if (__GNUC__ > 2)
 typedef struct {
 	/* no debug version on UP */
 } raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED { }
-#else
-typedef struct { int gcc_is_buggy; } raw_rwlock_t;
-#define __RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { 0 }
-#endif
 
 #endif /* __LINUX_SPINLOCK_TYPES_UP_H */
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index 679d0ae97e4f..ed81eec6e732 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -115,18 +115,11 @@ MODULE_PARM_DESC(osrun_time, "how many seconds to wait for the ICS2115 OS");
 
 #ifdef WF_DEBUG
 
-#if defined(NEW_MACRO_VARARGS) || __GNUC__ >= 3
 #define DPRINT(cond, ...) \
        if ((dev->debug & (cond)) == (cond)) { \
 	     snd_printk (__VA_ARGS__); \
        }
 #else
-#define DPRINT(cond, args...) \
-       if ((dev->debug & (cond)) == (cond)) { \
-	     snd_printk (args); \
-       }
-#endif
-#else
 #define DPRINT(cond, args...)
 #endif /* WF_DEBUG */
 
-- 
cgit v1.2.3


From 59d9136b9844d3a0376d93c945ab280decedb323 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Sun, 8 Jan 2006 01:04:34 -0800
Subject: [PATCH] aio: reorder kiocb structure elements to make sync iocb setup
 faster

Reorder members of the kiocb structure to make sync kiocb setup faster.  By
setting the elements sequentially, the write combining buffers on the CPU
are able to combine the writes into a single burst, which results in fewer
cache cycles being consumed, freeing them up for other code.  This results
in a 10-20KB/s[*] increase on the bw_unix part of LMbench on my test
system.

* The improvement varies based on what other patches are in the system,
  as there are a number of bottlenecks, so this number is not absolutely
  accurate.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/aio.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 49fd37629ee4..00c8efa95cc3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -94,26 +94,27 @@ struct kiocb {
 	ssize_t			(*ki_retry)(struct kiocb *);
 	void			(*ki_dtor)(struct kiocb *);
 
-	struct list_head	ki_list;	/* the aio core uses this
-						 * for cancellation */
-
 	union {
 		void __user		*user;
 		struct task_struct	*tsk;
 	} ki_obj;
+
 	__u64			ki_user_data;	/* user's data for completion */
+	wait_queue_t		ki_wait;
 	loff_t			ki_pos;
+
+	void			*private;
 	/* State that we remember to be able to restart/retry  */
 	unsigned short		ki_opcode;
 	size_t			ki_nbytes; 	/* copy of iocb->aio_nbytes */
 	char 			__user *ki_buf;	/* remaining iocb->aio_buf */
 	size_t			ki_left; 	/* remaining bytes */
-	wait_queue_t		ki_wait;
 	long			ki_retried; 	/* just for testing */
 	long			ki_kicked; 	/* just for testing */
 	long			ki_queued; 	/* just for testing */
 
-	void			*private;
+	struct list_head	ki_list;	/* the aio core uses this
+						 * for cancellation */
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -126,6 +127,7 @@ struct kiocb {
 		(x)->ki_filp = (filp);			\
 		(x)->ki_ctx = NULL;			\
 		(x)->ki_cancel = NULL;			\
+		(x)->ki_retry = NULL;			\
 		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
 		(x)->ki_user_data = 0;                  \
-- 
cgit v1.2.3


From 349aef0bc4c7f07d685c977e12d0e2d0b5d0e6db Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:04:36 -0800
Subject: [PATCH] shrink struct page

Reduce the size of the pageframe for NR_CPUS>4, CONFIG_PREEMPT back to the
minimal size by unionising both ->private and ->mapping with the pagetable
lock.

It uses an anonymous struct and hence requires gcc-3.x.

Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ff54242c5d7..df80e63903b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -223,24 +223,27 @@ struct page {
 					 * & limit reverse map searches.
 					 */
 	union {
-		unsigned long private;	/* Mapping-private opaque data:
-					 * usually used for buffer_heads
-					 * if PagePrivate set; used for
-					 * swp_entry_t if PageSwapCache
-					 * When page is free, this indicates
-					 * order in the buddy system.
-					 */
+	    struct {
+		unsigned long private;		/* Mapping-private opaque data:
+					 	 * usually used for buffer_heads
+						 * if PagePrivate set; used for
+						 * swp_entry_t if PageSwapCache.
+						 * When page is free, this
+						 * indicates order in the buddy
+						 * system.
+						 */
+		struct address_space *mapping;	/* If low bit clear, points to
+						 * inode address_space, or NULL.
+						 * If page mapped as anonymous
+						 * memory, low bit is set, and
+						 * it points to anon_vma object:
+						 * see PAGE_MAPPING_ANON below.
+						 */
+	    };
 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-		spinlock_t ptl;
+	    spinlock_t ptl;
 #endif
-	} u;
-	struct address_space *mapping;	/* If low bit clear, points to
-					 * inode address_space, or NULL.
-					 * If page mapped as anonymous
-					 * memory, low bit is set, and
-					 * it points to anon_vma object:
-					 * see PAGE_MAPPING_ANON below.
-					 */
+	};
 	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
@@ -261,8 +264,8 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
-#define page_private(page)		((page)->u.private)
-#define set_page_private(page, v)	((page)->u.private = (v))
+#define page_private(page)		((page)->private)
+#define set_page_private(page, v)	((page)->private = (v))
 
 /*
  * FIXME: take this include out, include page-flags.h in
@@ -815,7 +818,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
  * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
  * When freeing, reset page->mapping so free_pages_check won't complain.
  */
-#define __pte_lockptr(page)	&((page)->u.ptl)
+#define __pte_lockptr(page)	&((page)->ptl)
 #define pte_lock_init(_page)	do {					\
 	spin_lock_init(__pte_lockptr(_page));				\
 } while (0)
-- 
cgit v1.2.3


From a12dea7af93ae83bd868c0dc09367090ead7cc1e Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sun, 8 Jan 2006 01:04:49 -0800
Subject: [PATCH] PTRACE_SYSEMU is only for i386 and clashes with other ptrace
 codes of other archs

PTRACE_SYSEMU{,_SINGLESTEP} is actually arch specific, for now, and the
current allocated number clashes with a ptrace code of frv, i.e.
PTRACE_GETFDPIC.  I should have submitted this much earlier, anyway we get no
breakage for this.

CC: Daniel Jacobowitz <dan@debian.org>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/ptrace.h | 3 +++
 include/linux/ptrace.h    | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index 7e0f2945d17d..f324c53b6f9a 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -54,6 +54,9 @@ struct pt_regs {
 #define PTRACE_GET_THREAD_AREA    25
 #define PTRACE_SET_THREAD_AREA    26
 
+#define PTRACE_SYSEMU		  31
+#define PTRACE_SYSEMU_SINGLESTEP  32
+
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 864791996b5f..9d5cd106b344 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -20,8 +20,6 @@
 #define PTRACE_DETACH		0x11
 
 #define PTRACE_SYSCALL		  24
-#define PTRACE_SYSEMU		  31
-#define PTRACE_SYSEMU_SINGLESTEP  32
 
 /* 0x4200-0x4300 are reserved for architecture-independent additions.  */
 #define PTRACE_SETOPTIONS	0x4200
-- 
cgit v1.2.3


From 7e7f358c8f8f836c504faa293fda0c1c0733b63c Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Sun, 8 Jan 2006 01:04:54 -0800
Subject: [PATCH] Split out screen_info from tty.h

This makes it possible for boot code to use screen_info without dragging in
all of tty.h.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/boot/compressed/misc.c        |  2 +-
 arch/x86_64/boot/compressed/misc.c      |  2 +-
 arch/x86_64/boot/compressed/miscsetup.h | 39 -----------------
 include/linux/screen_info.h             | 77 +++++++++++++++++++++++++++++++++
 include/linux/tty.h                     | 72 +-----------------------------
 5 files changed, 80 insertions(+), 112 deletions(-)
 delete mode 100644 arch/x86_64/boot/compressed/miscsetup.h
 create mode 100644 include/linux/screen_info.h

(limited to 'include/linux')

diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index 82a807f9f5e6..f19f3a7492a5 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -11,7 +11,7 @@
 
 #include <linux/linkage.h>
 #include <linux/vmalloc.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index 0e10fd84c7cc..cf4b88c416dc 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -9,7 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
-#include "miscsetup.h"
+#include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
deleted file mode 100644
index bb1620531703..000000000000
--- a/arch/x86_64/boot/compressed/miscsetup.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#define NULL 0
-//typedef unsigned int size_t; 
-
-
-struct screen_info {
-	unsigned char  orig_x;			/* 0x00 */
-	unsigned char  orig_y;			/* 0x01 */
-	unsigned short dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
-	unsigned short orig_video_page;		/* 0x04 */
-	unsigned char  orig_video_mode;		/* 0x06 */
-	unsigned char  orig_video_cols;		/* 0x07 */
-	unsigned short unused2;			/* 0x08 */
-	unsigned short orig_video_ega_bx;	/* 0x0a */
-	unsigned short unused3;			/* 0x0c */
-	unsigned char  orig_video_lines;	/* 0x0e */
-	unsigned char  orig_video_isVGA;	/* 0x0f */
-	unsigned short orig_video_points;	/* 0x10 */
-
-	/* VESA graphic mode -- linear frame buffer */
-	unsigned short lfb_width;		/* 0x12 */
-	unsigned short lfb_height;		/* 0x14 */
-	unsigned short lfb_depth;		/* 0x16 */
-	unsigned long  lfb_base;		/* 0x18 */
-	unsigned long  lfb_size;		/* 0x1c */
-	unsigned short dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
-	unsigned short lfb_linelength;		/* 0x24 */
-	unsigned char  red_size;		/* 0x26 */
-	unsigned char  red_pos;			/* 0x27 */
-	unsigned char  green_size;		/* 0x28 */
-	unsigned char  green_pos;		/* 0x29 */
-	unsigned char  blue_size;		/* 0x2a */
-	unsigned char  blue_pos;		/* 0x2b */
-	unsigned char  rsvd_size;		/* 0x2c */
-	unsigned char  rsvd_pos;		/* 0x2d */
-	unsigned short vesapm_seg;		/* 0x2e */
-	unsigned short vesapm_off;		/* 0x30 */
-	unsigned short pages;			/* 0x32 */
-						/* 0x34 -- 0x3f reserved for future expansion */
-};
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
new file mode 100644
index 000000000000..76850b75b3f6
--- /dev/null
+++ b/include/linux/screen_info.h
@@ -0,0 +1,77 @@
+#ifndef _SCREEN_INFO_H
+#define _SCREEN_INFO_H
+
+#include <linux/types.h>
+
+/*
+ * These are set up by the setup-routine at boot-time:
+ */
+
+struct screen_info {
+	u8  orig_x;		/* 0x00 */
+	u8  orig_y;		/* 0x01 */
+	u16 dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
+	u16 orig_video_page;	/* 0x04 */
+	u8  orig_video_mode;	/* 0x06 */
+	u8  orig_video_cols;	/* 0x07 */
+	u16 unused2;		/* 0x08 */
+	u16 orig_video_ega_bx;	/* 0x0a */
+	u16 unused3;		/* 0x0c */
+	u8  orig_video_lines;	/* 0x0e */
+	u8  orig_video_isVGA;	/* 0x0f */
+	u16 orig_video_points;	/* 0x10 */
+
+	/* VESA graphic mode -- linear frame buffer */
+	u16 lfb_width;		/* 0x12 */
+	u16 lfb_height;		/* 0x14 */
+	u16 lfb_depth;		/* 0x16 */
+	u32 lfb_base;		/* 0x18 */
+	u32 lfb_size;		/* 0x1c */
+	u16 dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
+	u16 lfb_linelength;	/* 0x24 */
+	u8  red_size;		/* 0x26 */
+	u8  red_pos;		/* 0x27 */
+	u8  green_size;		/* 0x28 */
+	u8  green_pos;		/* 0x29 */
+	u8  blue_size;		/* 0x2a */
+	u8  blue_pos;		/* 0x2b */
+	u8  rsvd_size;		/* 0x2c */
+	u8  rsvd_pos;		/* 0x2d */
+	u16 vesapm_seg;		/* 0x2e */
+	u16 vesapm_off;		/* 0x30 */
+	u16 pages;		/* 0x32 */
+	u16 vesa_attributes;	/* 0x34 */
+	u32  capabilities;      /* 0x36 */
+				/* 0x3a -- 0x3f reserved for future expansion */
+};
+
+extern struct screen_info screen_info;
+
+#define ORIG_X			(screen_info.orig_x)
+#define ORIG_Y			(screen_info.orig_y)
+#define ORIG_VIDEO_MODE		(screen_info.orig_video_mode)
+#define ORIG_VIDEO_COLS 	(screen_info.orig_video_cols)
+#define ORIG_VIDEO_EGA_BX	(screen_info.orig_video_ega_bx)
+#define ORIG_VIDEO_LINES	(screen_info.orig_video_lines)
+#define ORIG_VIDEO_ISVGA	(screen_info.orig_video_isVGA)
+#define ORIG_VIDEO_POINTS       (screen_info.orig_video_points)
+
+#define VIDEO_TYPE_MDA		0x10	/* Monochrome Text Display	*/
+#define VIDEO_TYPE_CGA		0x11	/* CGA Display 			*/
+#define VIDEO_TYPE_EGAM		0x20	/* EGA/VGA in Monochrome Mode	*/
+#define VIDEO_TYPE_EGAC		0x21	/* EGA in Color Mode		*/
+#define VIDEO_TYPE_VGAC		0x22	/* VGA+ in Color Mode		*/
+#define VIDEO_TYPE_VLFB		0x23	/* VESA VGA in graphic mode	*/
+
+#define VIDEO_TYPE_PICA_S3	0x30	/* ACER PICA-61 local S3 video	*/
+#define VIDEO_TYPE_MIPS_G364	0x31    /* MIPS Magnum 4000 G364 video  */
+#define VIDEO_TYPE_SGI          0x33    /* Various SGI graphics hardware */
+
+#define VIDEO_TYPE_TGAC		0x40	/* DEC TGA */
+
+#define VIDEO_TYPE_SUN          0x50    /* Sun frame buffer. */
+#define VIDEO_TYPE_SUNPCI       0x51    /* Sun PCI based frame buffer. */
+
+#define VIDEO_TYPE_PMAC		0x60	/* PowerMacintosh frame buffer. */
+
+#endif /* _SCREEN_INFO_H */
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1267f88ece6e..57449704a47b 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_ldisc.h>
+#include <linux/screen_info.h>
 
 #include <asm/system.h>
 
@@ -36,77 +37,6 @@
 #define NR_UNIX98_PTY_MAX	(1 << MINORBITS) /* Absolute limit */
 #define NR_LDISCS		16
 
-/*
- * These are set up by the setup-routine at boot-time:
- */
-
-struct screen_info {
-	u8  orig_x;		/* 0x00 */
-	u8  orig_y;		/* 0x01 */
-	u16 dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
-	u16 orig_video_page;	/* 0x04 */
-	u8  orig_video_mode;	/* 0x06 */
-	u8  orig_video_cols;	/* 0x07 */
-	u16 unused2;		/* 0x08 */
-	u16 orig_video_ega_bx;	/* 0x0a */
-	u16 unused3;		/* 0x0c */
-	u8  orig_video_lines;	/* 0x0e */
-	u8  orig_video_isVGA;	/* 0x0f */
-	u16 orig_video_points;	/* 0x10 */
-
-	/* VESA graphic mode -- linear frame buffer */
-	u16 lfb_width;		/* 0x12 */
-	u16 lfb_height;		/* 0x14 */
-	u16 lfb_depth;		/* 0x16 */
-	u32 lfb_base;		/* 0x18 */
-	u32 lfb_size;		/* 0x1c */
-	u16 dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
-	u16 lfb_linelength;	/* 0x24 */
-	u8  red_size;		/* 0x26 */
-	u8  red_pos;		/* 0x27 */
-	u8  green_size;		/* 0x28 */
-	u8  green_pos;		/* 0x29 */
-	u8  blue_size;		/* 0x2a */
-	u8  blue_pos;		/* 0x2b */
-	u8  rsvd_size;		/* 0x2c */
-	u8  rsvd_pos;		/* 0x2d */
-	u16 vesapm_seg;		/* 0x2e */
-	u16 vesapm_off;		/* 0x30 */
-	u16 pages;		/* 0x32 */
-	u16 vesa_attributes;	/* 0x34 */
-	u32  capabilities;      /* 0x36 */
-				/* 0x3a -- 0x3f reserved for future expansion */
-};
-
-extern struct screen_info screen_info;
-
-#define ORIG_X			(screen_info.orig_x)
-#define ORIG_Y			(screen_info.orig_y)
-#define ORIG_VIDEO_MODE		(screen_info.orig_video_mode)
-#define ORIG_VIDEO_COLS 	(screen_info.orig_video_cols)
-#define ORIG_VIDEO_EGA_BX	(screen_info.orig_video_ega_bx)
-#define ORIG_VIDEO_LINES	(screen_info.orig_video_lines)
-#define ORIG_VIDEO_ISVGA	(screen_info.orig_video_isVGA)
-#define ORIG_VIDEO_POINTS       (screen_info.orig_video_points)
-
-#define VIDEO_TYPE_MDA		0x10	/* Monochrome Text Display	*/
-#define VIDEO_TYPE_CGA		0x11	/* CGA Display 			*/
-#define VIDEO_TYPE_EGAM		0x20	/* EGA/VGA in Monochrome Mode	*/
-#define VIDEO_TYPE_EGAC		0x21	/* EGA in Color Mode		*/
-#define VIDEO_TYPE_VGAC		0x22	/* VGA+ in Color Mode		*/
-#define VIDEO_TYPE_VLFB		0x23	/* VESA VGA in graphic mode	*/
-
-#define VIDEO_TYPE_PICA_S3	0x30	/* ACER PICA-61 local S3 video	*/
-#define VIDEO_TYPE_MIPS_G364	0x31    /* MIPS Magnum 4000 G364 video  */
-#define VIDEO_TYPE_SGI          0x33    /* Various SGI graphics hardware */
-
-#define VIDEO_TYPE_TGAC		0x40	/* DEC TGA */
-
-#define VIDEO_TYPE_SUN          0x50    /* Sun frame buffer. */
-#define VIDEO_TYPE_SUNPCI       0x51    /* Sun PCI based frame buffer. */
-
-#define VIDEO_TYPE_PMAC		0x60	/* PowerMacintosh frame buffer. */
-
 /*
  * This character is the same as _POSIX_VDISABLE: it cannot be used as
  * a c_cc[] character, but indicates that a particular special character
-- 
cgit v1.2.3


From d8a33496671e4533aed090793436d58debea6f3a Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Sun, 8 Jan 2006 01:05:06 -0800
Subject: [PATCH] parport: bring back an unused phase for ppdev ioctl

Earlier fix removed unused phase, but that changed the values for other
phases.  Since these are exposed to userspace through ppdev, it is safer
not to change them.  Restore the unused phase value.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/parport.h b/include/linux/parport.h
index f7ff0b0c4031..f67f838a3a1f 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -236,12 +236,14 @@ struct pardevice {
 
 /* IEEE1284 information */
 
-/* IEEE1284 phases */
+/* IEEE1284 phases. These are exposed to userland through ppdev IOCTL
+ * PP[GS]ETPHASE, so do not change existing values. */
 enum ieee1284_phase {
 	IEEE1284_PH_FWD_DATA,
 	IEEE1284_PH_FWD_IDLE,
 	IEEE1284_PH_TERMINATE,
 	IEEE1284_PH_NEGOTIATION,
+	IEEE1284_PH_HBUSY_DNA,
 	IEEE1284_PH_REV_IDLE,
 	IEEE1284_PH_HBUSY_DAVAIL,
 	IEEE1284_PH_REV_DATA,
-- 
cgit v1.2.3


From 6a878184c202395ea17212f111ab9ec4b5f6d6ee Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Sun, 8 Jan 2006 01:05:07 -0800
Subject: [PATCH] Eliminate __attribute__ ((packed)) warnings for gcc-4.1

Since version 4.1 the gcc is warning about ignored attributes. This patch is
using the equivalent attribute on the struct instead of on each of the
structure or union members.

GCC Manual:
  "Specifying Attributes of Types

   packed
    This attribute, attached to struct or union type definition, specifies
    that
    each member of the structure or union is placed to minimize the memory
    required. When attached to an enum definition, it indicates that the
    smallest integral type should be used.

    Specifying this attribute for struct and union types is equivalent to
    specifying the packed attribute on each of the structure or union
    members."

Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/isdn/hisax/hisax.h          |  18 +++---
 drivers/isdn/hisax/hisax_fcpcipnp.h |  18 +++---
 drivers/net/3c527.h                 |  50 +++++++-------
 drivers/net/irda/vlsi_ir.h          |   4 +-
 drivers/net/wan/sdla.c              |   6 +-
 include/linux/atalk.h               |  18 +++---
 include/linux/cycx_x25.h            |  66 +++++++++----------
 include/linux/if_frad.h             |  12 ++--
 include/linux/isdnif.h              |  70 ++++++++++----------
 include/linux/ncp.h                 | 126 ++++++++++++++++++------------------
 include/linux/sdla.h                |  64 +++++++++---------
 include/linux/wavefront.h           |  36 +++++------
 include/net/dn_dev.h                |  84 ++++++++++++------------
 include/net/dn_nsp.h                |  74 ++++++++++-----------
 include/sound/wavefront.h           |  36 +++++------
 15 files changed, 341 insertions(+), 341 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 26c545fa223b..1b85ce166af8 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -396,17 +396,17 @@ struct isar_hw {
 
 struct hdlc_stat_reg {
 #ifdef __BIG_ENDIAN
-	u_char fill __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char cmd  __attribute__((packed));
+	u_char fill;
+	u_char mode;
+	u_char xml;
+	u_char cmd;
 #else
-	u_char cmd  __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char fill __attribute__((packed));
+	u_char cmd;
+	u_char xml;
+	u_char mode;
+	u_char fill;
 #endif
-};
+} __attribute__((packed));
 
 struct hdlc_hw {
 	union {
diff --git a/drivers/isdn/hisax/hisax_fcpcipnp.h b/drivers/isdn/hisax/hisax_fcpcipnp.h
index bd8a22e4d6a2..21fbcedf3a94 100644
--- a/drivers/isdn/hisax/hisax_fcpcipnp.h
+++ b/drivers/isdn/hisax/hisax_fcpcipnp.h
@@ -12,17 +12,17 @@ enum {
 
 struct hdlc_stat_reg {
 #ifdef __BIG_ENDIAN
-	u_char fill __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char cmd  __attribute__((packed));
+	u_char fill;
+	u_char mode;
+	u_char xml;
+	u_char cmd;
 #else
-	u_char cmd  __attribute__((packed));
-	u_char xml  __attribute__((packed));
-	u_char mode __attribute__((packed));
-	u_char fill __attribute__((packed));
+	u_char cmd;
+	u_char xml;
+	u_char mode;
+	u_char fill;
 #endif
-};
+} __attribute__((packed));
 
 struct fritz_bcs {
 	struct hisax_b_if b_if;
diff --git a/drivers/net/3c527.h b/drivers/net/3c527.h
index c10f009ce9b6..53b5b071df08 100644
--- a/drivers/net/3c527.h
+++ b/drivers/net/3c527.h
@@ -32,43 +32,43 @@
 
 struct mc32_mailbox
 {
-	u16	mbox __attribute((packed));
-	u16	data[1] __attribute((packed));
-};
+ 	u16 mbox;
+ 	u16 data[1];
+} __attribute((packed));
 
 struct skb_header
 {
-	u8	status __attribute((packed));
-	u8	control __attribute((packed));
-	u16	next __attribute((packed));	/* Do not change! */
-	u16	length __attribute((packed));
-	u32	data __attribute((packed));
-};
+	u8 status;
+	u8 control;
+	u16 next;	/* Do not change! */
+	u16 length;
+	u32 data;
+} __attribute((packed));
 
 struct mc32_stats
 {
 	/* RX Errors */
-	u32     rx_crc_errors       __attribute((packed)); 	
-	u32     rx_alignment_errors  __attribute((packed)); 	
-	u32     rx_overrun_errors    __attribute((packed));
-	u32     rx_tooshort_errors  __attribute((packed));
-	u32     rx_toolong_errors   __attribute((packed));
-	u32     rx_outofresource_errors  __attribute((packed)); 
+	u32 rx_crc_errors;
+	u32 rx_alignment_errors;
+	u32 rx_overrun_errors;
+	u32 rx_tooshort_errors;
+	u32 rx_toolong_errors;
+	u32 rx_outofresource_errors;
 
-	u32     rx_discarded   __attribute((packed));  /* via card pattern match filter */ 
+	u32 rx_discarded;  /* via card pattern match filter */
 
 	/* TX Errors */
-	u32     tx_max_collisions __attribute((packed)); 
-	u32     tx_carrier_errors __attribute((packed)); 
-	u32     tx_underrun_errors __attribute((packed)); 
-	u32     tx_cts_errors     __attribute((packed)); 
-	u32     tx_timeout_errors __attribute((packed)) ;
+	u32 tx_max_collisions;
+	u32 tx_carrier_errors;
+	u32 tx_underrun_errors;
+	u32 tx_cts_errors;
+	u32 tx_timeout_errors;
 	
 	/* various cruft */
-	u32     dataA[6] __attribute((packed));   
-        u16	dataB[5] __attribute((packed));   
-  	u32     dataC[14] __attribute((packed)); 	
-};
+	u32 dataA[6];
+	u16 dataB[5];
+	u32 dataC[14];
+} __attribute((packed));
 
 #define STATUS_MASK	0x0F
 #define COMPLETED	(1<<7)
diff --git a/drivers/net/irda/vlsi_ir.h b/drivers/net/irda/vlsi_ir.h
index 741aecc655df..a82a4ba8de4f 100644
--- a/drivers/net/irda/vlsi_ir.h
+++ b/drivers/net/irda/vlsi_ir.h
@@ -577,8 +577,8 @@ struct ring_descr_hw {
 		struct {
 			u8		addr_res[3];
 			volatile u8	status;		/* descriptor status */
-		} rd_s __attribute__((packed));
-	} rd_u __attribute((packed));
+		} __attribute__((packed)) rd_s;
+	} __attribute((packed)) rd_u;
 } __attribute__ ((packed));
 
 #define rd_addr		rd_u.addr
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index 036adc4f8ba7..22e794071cf4 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -329,9 +329,9 @@ static int sdla_cpuspeed(struct net_device *dev, struct ifreq *ifr)
 
 struct _dlci_stat 
 {
-	short dlci		__attribute__((packed));
-	char  flags		__attribute__((packed));
-};
+	short dlci;
+	char  flags;
+} __attribute__((packed));
 
 struct _frad_stat 
 {
diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index 911c09cb9bf9..6ba3aa8a81f4 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -155,15 +155,15 @@ struct elapaarp {
 #define AARP_REQUEST			1
 #define AARP_REPLY			2
 #define AARP_PROBE			3
-	__u8	hw_src[ETH_ALEN]	__attribute__ ((packed));
-	__u8	pa_src_zero		__attribute__ ((packed));
-	__be16	pa_src_net		__attribute__ ((packed));
-	__u8	pa_src_node		__attribute__ ((packed));
-	__u8	hw_dst[ETH_ALEN]	__attribute__ ((packed));
-	__u8	pa_dst_zero		__attribute__ ((packed));
-	__be16	pa_dst_net		__attribute__ ((packed));
-	__u8	pa_dst_node		__attribute__ ((packed));	
-};
+	__u8	hw_src[ETH_ALEN];
+	__u8	pa_src_zero;
+	__be16	pa_src_net;
+	__u8	pa_src_node;
+	__u8	hw_dst[ETH_ALEN];
+	__u8	pa_dst_zero;
+	__be16	pa_dst_net;
+	__u8	pa_dst_node;
+} __attribute__ ((packed));
 
 static __inline__ struct elapaarp *aarp_hdr(struct sk_buff *skb)
 {
diff --git a/include/linux/cycx_x25.h b/include/linux/cycx_x25.h
index b10a7f3a8cac..f7a906583463 100644
--- a/include/linux/cycx_x25.h
+++ b/include/linux/cycx_x25.h
@@ -38,11 +38,11 @@ extern unsigned int cycx_debug;
 /* Data Structures */
 /* X.25 Command Block. */
 struct cycx_x25_cmd {
-	u16 command PACKED;
-	u16 link    PACKED; /* values: 0 or 1 */
-	u16 len     PACKED; /* values: 0 thru 0x205 (517) */
-	u32 buf     PACKED;
-};
+	u16 command;
+	u16 link;	/* values: 0 or 1 */
+	u16 len;	/* values: 0 thru 0x205 (517) */
+	u32 buf;
+} PACKED;
 
 /* Defines for the 'command' field. */
 #define X25_CONNECT_REQUEST             0x4401
@@ -92,34 +92,34 @@ struct cycx_x25_cmd {
  *	@flags - see dosx25.doc, in portuguese, for details
  */
 struct cycx_x25_config {
-	u8  link	PACKED;
-	u8  speed	PACKED;
-	u8  clock	PACKED;
-	u8  n2		PACKED;
-	u8  n2win	PACKED;
-	u8  n3win	PACKED;
-	u8  nvc		PACKED;
-	u8  pktlen	PACKED;
-	u8  locaddr	PACKED;
-	u8  remaddr	PACKED;
-	u16 t1		PACKED;
-	u16 t2		PACKED;
-	u8  t21		PACKED;
-	u8  npvc	PACKED;
-	u8  t23		PACKED;
-	u8  flags	PACKED;
-};
+	u8  link;
+	u8  speed;
+	u8  clock;
+	u8  n2;
+	u8  n2win;
+	u8  n3win;
+	u8  nvc;
+	u8  pktlen;
+	u8  locaddr;
+	u8  remaddr;
+	u16 t1;
+	u16 t2;
+	u8  t21;
+	u8  npvc;
+	u8  t23;
+	u8  flags;
+} PACKED;
 
 struct cycx_x25_stats {
-	u16 rx_crc_errors	PACKED;
-	u16 rx_over_errors	PACKED;
-	u16 n2_tx_frames 	PACKED;
-	u16 n2_rx_frames 	PACKED;
-	u16 tx_timeouts 	PACKED;
-	u16 rx_timeouts 	PACKED;
-	u16 n3_tx_packets 	PACKED;
-	u16 n3_rx_packets 	PACKED;
-	u16 tx_aborts	 	PACKED;
-	u16 rx_aborts	 	PACKED;
-};
+	u16 rx_crc_errors;
+	u16 rx_over_errors;
+	u16 n2_tx_frames;
+	u16 n2_rx_frames;
+	u16 tx_timeouts;
+	u16 rx_timeouts;
+	u16 n3_tx_packets;
+	u16 n3_rx_packets;
+	u16 tx_aborts;
+	u16 rx_aborts;
+} PACKED;
 #endif	/* _CYCX_X25_H */
diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 511999c7eeda..395f0aad9cbf 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -131,17 +131,17 @@ struct frad_conf
 /* these are the fields of an RFC 1490 header */
 struct frhdr
 {
-   unsigned char  control	__attribute__((packed));
+   unsigned char  control;
 
    /* for IP packets, this can be the NLPID */
-   unsigned char  pad		__attribute__((packed)); 
+   unsigned char  pad;
 
-   unsigned char  NLPID		__attribute__((packed));
-   unsigned char  OUI[3]	__attribute__((packed));
-   unsigned short PID		__attribute__((packed));
+   unsigned char  NLPID;
+   unsigned char  OUI[3];
+   unsigned short PID;
 
 #define IP_NLPID pad 
-};
+} __attribute__((packed));
 
 /* see RFC 1490 for the definition of the following */
 #define FRAD_I_UI		0x03
diff --git a/include/linux/isdnif.h b/include/linux/isdnif.h
index 7a4eacd77cb2..04e10f9f14f8 100644
--- a/include/linux/isdnif.h
+++ b/include/linux/isdnif.h
@@ -282,43 +282,43 @@ typedef struct setup_parm {
 
 typedef struct T30_s {
 	/* session parameters */
-	__u8 resolution		__attribute__ ((packed));
-	__u8 rate		__attribute__ ((packed));
-	__u8 width		__attribute__ ((packed));
-	__u8 length		__attribute__ ((packed));
-	__u8 compression	__attribute__ ((packed));
-	__u8 ecm		__attribute__ ((packed));
-	__u8 binary		__attribute__ ((packed));
-	__u8 scantime		__attribute__ ((packed));
-	__u8 id[FAXIDLEN]	__attribute__ ((packed));
+	__u8 resolution;
+	__u8 rate;
+	__u8 width;
+	__u8 length;
+	__u8 compression;
+	__u8 ecm;
+	__u8 binary;
+	__u8 scantime;
+	__u8 id[FAXIDLEN];
 	/* additional parameters */
-	__u8 phase		__attribute__ ((packed));
-	__u8 direction		__attribute__ ((packed));
-	__u8 code		__attribute__ ((packed));
-	__u8 badlin		__attribute__ ((packed));
-	__u8 badmul		__attribute__ ((packed));
-	__u8 bor		__attribute__ ((packed));
-	__u8 fet		__attribute__ ((packed));
-	__u8 pollid[FAXIDLEN]	__attribute__ ((packed));
-	__u8 cq			__attribute__ ((packed));
-	__u8 cr			__attribute__ ((packed));
-	__u8 ctcrty		__attribute__ ((packed));
-	__u8 minsp		__attribute__ ((packed));
-	__u8 phcto		__attribute__ ((packed));
-	__u8 rel		__attribute__ ((packed));
-	__u8 nbc		__attribute__ ((packed));
+	__u8 phase;
+	__u8 direction;
+	__u8 code;
+	__u8 badlin;
+	__u8 badmul;
+	__u8 bor;
+	__u8 fet;
+	__u8 pollid[FAXIDLEN];
+	__u8 cq;
+	__u8 cr;
+	__u8 ctcrty;
+	__u8 minsp;
+	__u8 phcto;
+	__u8 rel;
+	__u8 nbc;
 	/* remote station parameters */
-	__u8 r_resolution	__attribute__ ((packed));
-	__u8 r_rate		__attribute__ ((packed));
-	__u8 r_width		__attribute__ ((packed));
-	__u8 r_length		__attribute__ ((packed));
-	__u8 r_compression	__attribute__ ((packed));
-	__u8 r_ecm		__attribute__ ((packed));
-	__u8 r_binary		__attribute__ ((packed));
-	__u8 r_scantime		__attribute__ ((packed));
-	__u8 r_id[FAXIDLEN]	__attribute__ ((packed));
-	__u8 r_code		__attribute__ ((packed));
-} T30_s;
+	__u8 r_resolution;
+	__u8 r_rate;
+	__u8 r_width;
+	__u8 r_length;
+	__u8 r_compression;
+	__u8 r_ecm;
+	__u8 r_binary;
+	__u8 r_scantime;
+	__u8 r_id[FAXIDLEN];
+	__u8 r_code;
+} __attribute__((packed)) T30_s;
 
 #define ISDN_TTY_FAX_CONN_IN	0
 #define ISDN_TTY_FAX_CONN_OUT	1
diff --git a/include/linux/ncp.h b/include/linux/ncp.h
index 99f77876b716..99f0adeeb3f3 100644
--- a/include/linux/ncp.h
+++ b/include/linux/ncp.h
@@ -20,29 +20,29 @@
 #define NCP_DEALLOC_SLOT_REQUEST (0x5555)
 
 struct ncp_request_header {
-	__u16 type __attribute__((packed));
-	__u8 sequence __attribute__((packed));
-	__u8 conn_low __attribute__((packed));
-	__u8 task __attribute__((packed));
-	__u8 conn_high __attribute__((packed));
-	__u8 function __attribute__((packed));
-	__u8 data[0] __attribute__((packed));
-};
+	__u16 type;
+	__u8 sequence;
+	__u8 conn_low;
+	__u8 task;
+	__u8 conn_high;
+	__u8 function;
+	__u8 data[0];
+} __attribute__((packed));
 
 #define NCP_REPLY                (0x3333)
 #define NCP_WATCHDOG		 (0x3E3E)
 #define NCP_POSITIVE_ACK         (0x9999)
 
 struct ncp_reply_header {
-	__u16 type __attribute__((packed));
-	__u8 sequence __attribute__((packed));
-	__u8 conn_low __attribute__((packed));
-	__u8 task __attribute__((packed));
-	__u8 conn_high __attribute__((packed));
-	__u8 completion_code __attribute__((packed));
-	__u8 connection_state __attribute__((packed));
-	__u8 data[0] __attribute__((packed));
-};
+	__u16 type;
+	__u8 sequence;
+	__u8 conn_low;
+	__u8 task;
+	__u8 conn_high;
+	__u8 completion_code;
+	__u8 connection_state;
+	__u8 data[0];
+} __attribute__((packed));
 
 #define NCP_VOLNAME_LEN (16)
 #define NCP_NUMBER_OF_VOLUMES (256)
@@ -128,37 +128,37 @@ struct nw_nfs_info {
 };
 
 struct nw_info_struct {
-	__u32 spaceAlloc __attribute__((packed));
-	__le32 attributes __attribute__((packed));
-	__u16 flags __attribute__((packed));
-	__le32 dataStreamSize __attribute__((packed));
-	__le32 totalStreamSize __attribute__((packed));
-	__u16 numberOfStreams __attribute__((packed));
-	__le16 creationTime __attribute__((packed));
-	__le16 creationDate __attribute__((packed));
-	__u32 creatorID __attribute__((packed));
-	__le16 modifyTime __attribute__((packed));
-	__le16 modifyDate __attribute__((packed));
-	__u32 modifierID __attribute__((packed));
-	__le16 lastAccessDate __attribute__((packed));
-	__u16 archiveTime __attribute__((packed));
-	__u16 archiveDate __attribute__((packed));
-	__u32 archiverID __attribute__((packed));
-	__u16 inheritedRightsMask __attribute__((packed));
-	__le32 dirEntNum __attribute__((packed));
-	__le32 DosDirNum __attribute__((packed));
-	__u32 volNumber __attribute__((packed));
-	__u32 EADataSize __attribute__((packed));
-	__u32 EAKeyCount __attribute__((packed));
-	__u32 EAKeySize __attribute__((packed));
-	__u32 NSCreator __attribute__((packed));
-	__u8 nameLen __attribute__((packed));
-	__u8 entryName[256] __attribute__((packed));
+	__u32 spaceAlloc;
+	__le32 attributes;
+	__u16 flags;
+	__le32 dataStreamSize;
+	__le32 totalStreamSize;
+	__u16 numberOfStreams;
+	__le16 creationTime;
+	__le16 creationDate;
+	__u32 creatorID;
+	__le16 modifyTime;
+	__le16 modifyDate;
+	__u32 modifierID;
+	__le16 lastAccessDate;
+	__u16 archiveTime;
+	__u16 archiveDate;
+	__u32 archiverID;
+	__u16 inheritedRightsMask;
+	__le32 dirEntNum;
+	__le32 DosDirNum;
+	__u32 volNumber;
+	__u32 EADataSize;
+	__u32 EAKeyCount;
+	__u32 EAKeySize;
+	__u32 NSCreator;
+	__u8 nameLen;
+	__u8 entryName[256];
 	/* libncp may depend on there being nothing after entryName */
 #ifdef __KERNEL__
 	struct nw_nfs_info nfs;
 #endif
-};
+} __attribute__((packed));
 
 /* modify mask - use with MODIFY_DOS_INFO structure */
 #define DM_ATTRIBUTES		  (cpu_to_le32(0x02))
@@ -176,26 +176,26 @@ struct nw_info_struct {
 #define DM_MAXIMUM_SPACE	  (cpu_to_le32(0x2000))
 
 struct nw_modify_dos_info {
-	__le32 attributes __attribute__((packed));
-	__le16 creationDate __attribute__((packed));
-	__le16 creationTime __attribute__((packed));
-	__u32 creatorID __attribute__((packed));
-	__le16 modifyDate __attribute__((packed));
-	__le16 modifyTime __attribute__((packed));
-	__u32 modifierID __attribute__((packed));
-	__u16 archiveDate __attribute__((packed));
-	__u16 archiveTime __attribute__((packed));
-	__u32 archiverID __attribute__((packed));
-	__le16 lastAccessDate __attribute__((packed));
-	__u16 inheritanceGrantMask __attribute__((packed));
-	__u16 inheritanceRevokeMask __attribute__((packed));
-	__u32 maximumSpace __attribute__((packed));
-};
+	__le32 attributes;
+	__le16 creationDate;
+	__le16 creationTime;
+	__u32 creatorID;
+	__le16 modifyDate;
+	__le16 modifyTime;
+	__u32 modifierID;
+	__u16 archiveDate;
+	__u16 archiveTime;
+	__u32 archiverID;
+	__le16 lastAccessDate;
+	__u16 inheritanceGrantMask;
+	__u16 inheritanceRevokeMask;
+	__u32 maximumSpace;
+} __attribute__((packed));
 
 struct nw_search_sequence {
-	__u8 volNumber __attribute__((packed));
-	__u32 dirBase __attribute__((packed));
-	__u32 sequence __attribute__((packed));
-};
+	__u8 volNumber;
+	__u32 dirBase;
+	__u32 sequence;
+} __attribute__((packed));
 
 #endif				/* _LINUX_NCP_H */
diff --git a/include/linux/sdla.h b/include/linux/sdla.h
index 3b6afb8caa42..564acd3a71c1 100644
--- a/include/linux/sdla.h
+++ b/include/linux/sdla.h
@@ -293,46 +293,46 @@ void sdla(void *cfg_info, char *dev, struct frad_conf *conf, int quiet);
 #define SDLA_S508_INTEN			0x10
 
 struct sdla_cmd {
-   char  opp_flag		__attribute__((packed));
-   char  cmd			__attribute__((packed));
-   short length			__attribute__((packed));
-   char  retval			__attribute__((packed));
-   short dlci			__attribute__((packed));
-   char  flags			__attribute__((packed));
-   short rxlost_int		__attribute__((packed));
-   long  rxlost_app		__attribute__((packed));
-   char  reserve[2]		__attribute__((packed));
-   char  data[SDLA_MAX_DATA]	__attribute__((packed));	/* transfer data buffer */
-};
+   char  opp_flag;
+   char  cmd;
+   short length;
+   char  retval;
+   short dlci;
+   char  flags;
+   short rxlost_int;
+   long  rxlost_app;
+   char  reserve[2];
+   char  data[SDLA_MAX_DATA];	/* transfer data buffer */
+} __attribute__((packed));
 
 struct intr_info {
-   char  flags		__attribute__((packed));
-   short txlen		__attribute__((packed));
-   char  irq		__attribute__((packed));
-   char  flags2		__attribute__((packed));
-   short timeout	__attribute__((packed));
-};
+   char  flags;
+   short txlen;
+   char  irq;
+   char  flags2;
+   short timeout;
+} __attribute__((packed));
 
 /* found in the 508's control window at RXBUF_INFO */
 struct buf_info {
-   unsigned short rse_num	__attribute__((packed));
-   unsigned long  rse_base	__attribute__((packed));
-   unsigned long  rse_next	__attribute__((packed));
-   unsigned long  buf_base	__attribute__((packed));
-   unsigned short reserved	__attribute__((packed));
-   unsigned long  buf_top	__attribute__((packed));
-};
+   unsigned short rse_num;
+   unsigned long  rse_base;
+   unsigned long  rse_next;
+   unsigned long  buf_base;
+   unsigned short reserved;
+   unsigned long  buf_top;
+} __attribute__((packed));
 
 /* structure pointed to by rse_base in RXBUF_INFO struct */
 struct buf_entry {
-   char  opp_flag	__attribute__((packed));
-   short length		__attribute__((packed));
-   short dlci		__attribute__((packed));
-   char  flags		__attribute__((packed));
-   short timestamp	__attribute__((packed));
-   short reserved[2]	__attribute__((packed));
-   long  buf_addr	__attribute__((packed));
-};
+   char  opp_flag;
+   short length;
+   short dlci;
+   char  flags;
+   short timestamp;
+   short reserved[2];
+   long  buf_addr;
+} __attribute__((packed));
 
 #endif
 
diff --git a/include/linux/wavefront.h b/include/linux/wavefront.h
index 61bd0fd35240..51ab3c933acd 100644
--- a/include/linux/wavefront.h
+++ b/include/linux/wavefront.h
@@ -434,22 +434,22 @@ typedef struct wf_multisample {
 } wavefront_multisample;
 
 typedef struct wf_alias {
-    INT16 OriginalSample __attribute__ ((packed));
-
-    struct wf_sample_offset sampleStartOffset __attribute__ ((packed));
-    struct wf_sample_offset loopStartOffset __attribute__ ((packed));
-    struct wf_sample_offset sampleEndOffset __attribute__ ((packed));
-    struct wf_sample_offset loopEndOffset __attribute__ ((packed));
-
-    INT16  FrequencyBias __attribute__ ((packed));
-
-    UCHAR8 SampleResolution:2  __attribute__ ((packed));
-    UCHAR8 Unused1:1  __attribute__ ((packed));
-    UCHAR8 Loop:1 __attribute__ ((packed));
-    UCHAR8 Bidirectional:1  __attribute__ ((packed));
-    UCHAR8 Unused2:1 __attribute__ ((packed));
-    UCHAR8 Reverse:1 __attribute__ ((packed));
-    UCHAR8 Unused3:1 __attribute__ ((packed)); 
+    INT16 OriginalSample;
+
+    struct wf_sample_offset sampleStartOffset;
+    struct wf_sample_offset loopStartOffset;
+    struct wf_sample_offset sampleEndOffset;
+    struct wf_sample_offset loopEndOffset;
+
+    INT16  FrequencyBias;
+
+    UCHAR8 SampleResolution:2;
+    UCHAR8 Unused1:1;
+    UCHAR8 Loop:1;
+    UCHAR8 Bidirectional:1;
+    UCHAR8 Unused2:1;
+    UCHAR8 Reverse:1;
+    UCHAR8 Unused3:1;
     
     /* This structure is meant to be padded only to 16 bits on their
        original. Of course, whoever wrote their documentation didn't
@@ -460,8 +460,8 @@ typedef struct wf_alias {
        standard 16->32 bit issues.
     */
 
-    UCHAR8 sixteen_bit_padding __attribute__ ((packed));
-} wavefront_alias;
+    UCHAR8 sixteen_bit_padding;
+} __attribute__((packed)) wavefront_alias;
 
 typedef struct wf_drum {
     UCHAR8 PatchNumber;
diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h
index 86e8e86e624a..5a86e78081bf 100644
--- a/include/net/dn_dev.h
+++ b/include/net/dn_dev.h
@@ -88,8 +88,8 @@ struct dn_dev {
 	struct net_device *dev;
 	struct dn_dev_parms parms;
 	char use_long;
-        struct timer_list timer;
-        unsigned long t3;
+	struct timer_list timer;
+	unsigned long t3;
 	struct neigh_parms *neigh_parms;
 	unsigned char addr[ETH_ALEN];
 	struct neighbour *router; /* Default router on circuit */
@@ -99,57 +99,57 @@ struct dn_dev {
 
 struct dn_short_packet
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned short  dstnode         __attribute__((packed));
-        unsigned short  srcnode         __attribute__((packed));
-        unsigned char   forward         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstnode;
+	unsigned short  srcnode;
+	unsigned char   forward;
+} __attribute__((packed));
 
 struct dn_long_packet
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   d_area          __attribute__((packed));
-        unsigned char   d_subarea       __attribute__((packed));
-        unsigned char   d_id[6]         __attribute__((packed));
-        unsigned char   s_area          __attribute__((packed));
-        unsigned char   s_subarea       __attribute__((packed));
-        unsigned char   s_id[6]         __attribute__((packed));
-        unsigned char   nl2             __attribute__((packed));
-        unsigned char   visit_ct        __attribute__((packed));
-        unsigned char   s_class         __attribute__((packed));
-        unsigned char   pt              __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   d_area;
+	unsigned char   d_subarea;
+	unsigned char   d_id[6];
+	unsigned char   s_area;
+	unsigned char   s_subarea;
+	unsigned char   s_id[6];
+	unsigned char   nl2;
+	unsigned char   visit_ct;
+	unsigned char   s_class;
+	unsigned char   pt;
+} __attribute__((packed));
 
 /*------------------------- DRP - Routing messages ---------------------*/
 
 struct endnode_hello_message
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   tiver[3]        __attribute__((packed));
-        unsigned char   id[6]           __attribute__((packed));
-        unsigned char   iinfo           __attribute__((packed));
-        unsigned short  blksize         __attribute__((packed));
-        unsigned char   area            __attribute__((packed));
-        unsigned char   seed[8]         __attribute__((packed));
-        unsigned char   neighbor[6]     __attribute__((packed));
-        unsigned short  timer           __attribute__((packed));
-        unsigned char   mpd             __attribute__((packed));
-        unsigned char   datalen         __attribute__((packed));
-        unsigned char   data[2]         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   tiver[3];
+	unsigned char   id[6];
+	unsigned char   iinfo;
+	unsigned short  blksize;
+	unsigned char   area;
+	unsigned char   seed[8];
+	unsigned char   neighbor[6];
+	unsigned short  timer;
+	unsigned char   mpd;
+	unsigned char   datalen;
+	unsigned char   data[2];
+} __attribute__((packed));
 
 struct rtnode_hello_message
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned char   tiver[3]        __attribute__((packed));
-        unsigned char   id[6]           __attribute__((packed));
-        unsigned char   iinfo           __attribute__((packed));
-        unsigned short  blksize         __attribute__((packed));
-        unsigned char   priority        __attribute__((packed));
-        unsigned char   area            __attribute__((packed));
-        unsigned short  timer           __attribute__((packed));
-        unsigned char   mpd             __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned char   tiver[3];
+	unsigned char   id[6];
+	unsigned char   iinfo;
+	unsigned short  blksize;
+	unsigned char   priority;
+	unsigned char   area;
+	unsigned short  timer;
+	unsigned char   mpd;
+} __attribute__((packed));
 
 
 extern void dn_dev_init(void);
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index 1ba03be0af3a..e6182b86262b 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -72,78 +72,78 @@ extern struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int nobl
 
 struct nsp_data_seg_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-	unsigned short  srcaddr         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+} __attribute__((packed));
 
 struct nsp_data_opt_msg
 {
-	unsigned short  acknum          __attribute__((packed));
-	unsigned short  segnum          __attribute__((packed));
-	unsigned short  lsflgs          __attribute__((packed));
-};
+	unsigned short  acknum;
+	unsigned short  segnum;
+	unsigned short  lsflgs;
+} __attribute__((packed));
 
 struct nsp_data_opt_msg1
 {
-	unsigned short  acknum          __attribute__((packed));
-	unsigned short  segnum          __attribute__((packed));
-};
+	unsigned short  acknum;
+	unsigned short  segnum;
+} __attribute__((packed));
 
 
 /* Acknowledgment Message (data/other data)                             */
 struct nsp_data_ack_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-	unsigned short  srcaddr         __attribute__((packed));
-	unsigned short  acknum          __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned short  acknum;
+} __attribute__((packed));
 
 /* Connect Acknowledgment Message */
 struct  nsp_conn_ack_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-	unsigned short  dstaddr         __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+} __attribute__((packed));
 
 
 /* Connect Initiate/Retransmit Initiate/Connect Confirm */
 struct  nsp_conn_init_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
+	unsigned char   msgflg;
 #define NSP_CI      0x18            /* Connect Initiate     */
 #define NSP_RCI     0x68            /* Retrans. Conn Init   */
-	unsigned short  dstaddr         __attribute__((packed));
-        unsigned short  srcaddr         __attribute__((packed));
-        unsigned char   services        __attribute__((packed));
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned char   services;
 #define NSP_FC_NONE   0x00            /* Flow Control None    */
 #define NSP_FC_SRC    0x04            /* Seg Req. Count       */
 #define NSP_FC_SCMC   0x08            /* Sess. Control Mess   */
 #define NSP_FC_MASK   0x0c            /* FC type mask         */
-	unsigned char   info            __attribute__((packed));
-        unsigned short  segsize         __attribute__((packed));
-};
+	unsigned char   info;
+	unsigned short  segsize;
+} __attribute__((packed));
 
 /* Disconnect Initiate/Disconnect Confirm */
 struct  nsp_disconn_init_msg
 {
-	unsigned char   msgflg          __attribute__((packed));
-        unsigned short  dstaddr         __attribute__((packed));
-        unsigned short  srcaddr         __attribute__((packed));
-        unsigned short  reason          __attribute__((packed));
-};
+	unsigned char   msgflg;
+	unsigned short  dstaddr;
+	unsigned short  srcaddr;
+	unsigned short  reason;
+} __attribute__((packed));
 
 
 struct  srcobj_fmt
 {
-	char            format          __attribute__((packed));
-        unsigned char   task            __attribute__((packed));
-        unsigned short  grpcode         __attribute__((packed));
-        unsigned short  usrcode         __attribute__((packed));
-        char            dlen            __attribute__((packed));
-};
+	char            format;
+	unsigned char   task;
+	unsigned short  grpcode;
+	unsigned short  usrcode;
+	char            dlen;
+} __attribute__((packed));
 
 /*
  * A collection of functions for manipulating the sequence
diff --git a/include/sound/wavefront.h b/include/sound/wavefront.h
index 9e572aed2435..15d82e594b56 100644
--- a/include/sound/wavefront.h
+++ b/include/sound/wavefront.h
@@ -454,22 +454,22 @@ typedef struct wf_multisample {
 } wavefront_multisample;
 
 typedef struct wf_alias {
-    s16 OriginalSample __attribute__ ((packed));
-
-    struct wf_sample_offset sampleStartOffset __attribute__ ((packed));
-    struct wf_sample_offset loopStartOffset __attribute__ ((packed));
-    struct wf_sample_offset sampleEndOffset __attribute__ ((packed));
-    struct wf_sample_offset loopEndOffset __attribute__ ((packed));
-
-    s16  FrequencyBias __attribute__ ((packed));
-
-    u8 SampleResolution:2  __attribute__ ((packed));
-    u8 Unused1:1  __attribute__ ((packed));
-    u8 Loop:1 __attribute__ ((packed));
-    u8 Bidirectional:1  __attribute__ ((packed));
-    u8 Unused2:1 __attribute__ ((packed));
-    u8 Reverse:1 __attribute__ ((packed));
-    u8 Unused3:1 __attribute__ ((packed)); 
+    s16 OriginalSample;
+
+    struct wf_sample_offset sampleStartOffset;
+    struct wf_sample_offset loopStartOffset;
+    struct wf_sample_offset sampleEndOffset;
+    struct wf_sample_offset loopEndOffset;
+
+    s16  FrequencyBias;
+
+    u8 SampleResolution:2;
+    u8 Unused1:1;
+    u8 Loop:1;
+    u8 Bidirectional:1;
+    u8 Unused2:1;
+    u8 Reverse:1;
+    u8 Unused3:1;
     
     /* This structure is meant to be padded only to 16 bits on their
        original. Of course, whoever wrote their documentation didn't
@@ -480,8 +480,8 @@ typedef struct wf_alias {
        standard 16->32 bit issues.
     */
 
-    u8 sixteen_bit_padding __attribute__ ((packed));
-} wavefront_alias;
+    u8 sixteen_bit_padding;
+} __attribute__((packed)) wavefront_alias;
 
 typedef struct wf_drum {
     u8 PatchNumber;
-- 
cgit v1.2.3


From ef9ceab28203690a42d7d3915ccf6e208f0762bc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 8 Jan 2006 01:05:10 -0800
Subject: [PATCH] remove semicolons from save_flags()

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/interrupt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 41f150a3d2dd..e50a95fbeb11 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -79,7 +79,7 @@ static inline void __deprecated save_flags(unsigned long *x)
 {
 	local_save_flags(*x);
 }
-#define save_flags(x) save_flags(&x);
+#define save_flags(x) save_flags(&x)
 static inline void __deprecated restore_flags(unsigned long x)
 {
 	local_irq_restore(x);
-- 
cgit v1.2.3