summaryrefslogtreecommitdiff
path: root/fs/notify
diff options
context:
space:
mode:
Diffstat (limited to 'fs/notify')
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.c11
-rw-r--r--fs/notify/fanotify/fanotify.h16
-rw-r--r--fs/notify/fanotify/fanotify_user.c264
-rw-r--r--fs/notify/fdinfo.c10
-rw-r--r--fs/notify/fsnotify.c169
-rw-r--r--fs/notify/fsnotify.h5
-rw-r--r--fs/notify/group.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/notify/mark.c200
11 files changed, 462 insertions, 230 deletions
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index c4cdaf5fa7ed..9fb73bafd41d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -308,6 +308,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
+ error = file_f_owner_allocate(filp);
+ if (error)
+ goto out_err;
+
/* new fsnotify mark, we expect most fcntl calls to add a new mark */
new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
if (!new_dn_mark) {
@@ -315,10 +319,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
- error = file_f_owner_allocate(filp);
- if (error)
- goto out_err;
-
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6d386080faf2..38290b9c07f7 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -415,7 +415,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
{
int dwords, type = 0;
char *ext_buf = NULL;
- void *buf = fh->buf;
+ void *buf = fh + 1;
int err;
fh->type = FILEID_ROOT;
@@ -454,7 +454,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
dwords = fh_len >> 2;
type = exportfs_encode_fid(inode, buf, &dwords);
err = -EINVAL;
- if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2)
+ /*
+ * Unlike file_handle, type and len of struct fanotify_fh are u8.
+ * Traditionally, filesystem return handle_type < 0xff, but there
+ * is no enforcement for that in vfs.
+ */
+ BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff);
+ if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2)
goto out_err;
fh->type = type;
@@ -1009,6 +1015,7 @@ finish:
static void fanotify_free_group_priv(struct fsnotify_group *group)
{
+ put_user_ns(group->user_ns);
kfree(group->fanotify_data.merge_hash);
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index b44e70e44be6..a0619e7694d5 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,6 +2,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/exportfs.h>
#include <linux/hashtable.h>
@@ -25,7 +26,7 @@ enum {
* stored in either the first or last 2 dwords.
*/
#define FANOTIFY_INLINE_FH_LEN (3 << 2)
-#define FANOTIFY_FH_HDR_LEN offsetof(struct fanotify_fh, buf)
+#define FANOTIFY_FH_HDR_LEN sizeof(struct fanotify_fh)
/* Fixed size struct for file handle */
struct fanotify_fh {
@@ -34,7 +35,6 @@ struct fanotify_fh {
#define FANOTIFY_FH_FLAG_EXT_BUF 1
u8 flags;
u8 pad;
- unsigned char buf[];
} __aligned(4);
/* Variable size struct for dir file handle + child file handle + name */
@@ -92,7 +92,7 @@ static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh)
BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN % 4);
BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) >
FANOTIFY_INLINE_FH_LEN);
- return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *));
+ return (char **)ALIGN((unsigned long)(fh + 1), __alignof__(char *));
}
static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
@@ -102,7 +102,7 @@ static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
static inline void *fanotify_fh_buf(struct fanotify_fh *fh)
{
- return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf;
+ return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh + 1;
}
static inline int fanotify_info_dir_fh_len(struct fanotify_info *info)
@@ -219,7 +219,7 @@ static inline void fanotify_info_copy_name(struct fanotify_info *info,
return;
info->name_len = name->len;
- strcpy(fanotify_info_name(info), name->name);
+ strscpy(fanotify_info_name(info), name->name, name->len + 1);
}
static inline void fanotify_info_copy_name2(struct fanotify_info *info,
@@ -229,7 +229,7 @@ static inline void fanotify_info_copy_name2(struct fanotify_info *info,
return;
info->name2_len = name->len;
- strcpy(fanotify_info_name2(info), name->name);
+ strscpy(fanotify_info_name2(info), name->name, name->len + 1);
}
/*
@@ -278,7 +278,7 @@ static inline void fanotify_init_event(struct fanotify_event *event,
#define FANOTIFY_INLINE_FH(name, size) \
struct { \
struct fanotify_fh name; \
- /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ /* Space for filehandle - access with fanotify_fh_buf() */ \
unsigned char _inline_fh_buf[size]; \
}
@@ -442,7 +442,9 @@ struct fanotify_perm_event {
size_t count;
u32 response; /* userspace answer to the event */
unsigned short state; /* state of the event */
+ unsigned short watchdog_cnt; /* already scanned by watchdog? */
int fd; /* fd we passed to userspace for this event */
+ pid_t recv_pid; /* pid of task receiving the event */
union {
struct fanotify_response_info_header hdr;
struct fanotify_response_info_audit_rule audit_rule;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f2d840ae4ded..ae904451dfc0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -50,6 +50,7 @@
/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;
+static int perm_group_timeout __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
+ {
+ .procname = "watchdog_timeout",
+ .data = &perm_group_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
};
static void __init fanotify_sysctls_init(void)
@@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void)
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
+static LIST_HEAD(perm_group_list);
+static DEFINE_SPINLOCK(perm_group_lock);
+static void perm_group_watchdog(struct work_struct *work);
+static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog);
+
+static void perm_group_watchdog_schedule(void)
+{
+ schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout));
+}
+
+static void perm_group_watchdog(struct work_struct *work)
+{
+ struct fsnotify_group *group;
+ struct fanotify_perm_event *event;
+ struct task_struct *task;
+ pid_t failed_pid = 0;
+
+ guard(spinlock)(&perm_group_lock);
+ if (list_empty(&perm_group_list))
+ return;
+
+ list_for_each_entry(group, &perm_group_list,
+ fanotify_data.perm_grp_list) {
+ /*
+ * Ok to test without lock, racing with an addition is
+ * fine, will deal with it next round
+ */
+ if (list_empty(&group->fanotify_data.access_list))
+ continue;
+
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (likely(event->watchdog_cnt == 0)) {
+ event->watchdog_cnt = 1;
+ } else if (event->watchdog_cnt == 1) {
+ /* Report on event only once */
+ event->watchdog_cnt = 2;
+
+ /* Do not report same pid repeatedly */
+ if (event->recv_pid == failed_pid)
+ continue;
+
+ failed_pid = event->recv_pid;
+ rcu_read_lock();
+ task = find_task_by_pid_ns(event->recv_pid,
+ &init_pid_ns);
+ pr_warn_ratelimited(
+ "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n",
+ event->recv_pid,
+ task ? task->comm : NULL,
+ perm_group_timeout);
+ rcu_read_unlock();
+ }
+ }
+ spin_unlock(&group->notification_lock);
+ }
+ perm_group_watchdog_schedule();
+}
+
+static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group)
+{
+ if (!list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Perm event watchdog can no longer scan this group. */
+ spin_lock(&perm_group_lock);
+ list_del_init(&group->fanotify_data.perm_grp_list);
+ spin_unlock(&perm_group_lock);
+ }
+}
+
+static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group)
+{
+ if (!perm_group_timeout)
+ return;
+
+ spin_lock(&perm_group_lock);
+ if (list_empty(&group->fanotify_data.perm_grp_list)) {
+ /* Add to perm_group_list for monitoring by watchdog. */
+ if (list_empty(&perm_group_list))
+ perm_group_watchdog_schedule();
+ list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list);
+ }
+ spin_unlock(&perm_group_lock);
+}
+
/*
* All flags that may be specified in parameter event_f_flags of fanotify_init.
*
@@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
spin_lock(&group->notification_lock);
list_add_tail(&event->fse.list,
&group->fanotify_data.access_list);
+ FANOTIFY_PERM(event)->recv_pid = current->pid;
spin_unlock(&group->notification_lock);
}
}
@@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
fsnotify_group_stop_queueing(group);
+ fanotify_perm_watchdog_group_remove(group);
+
/*
* Process all permission events on access_list and notification queue
* and simulate reply from userspace.
@@ -1113,6 +1210,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
*path = fd_file(f)->f_path;
path_get(path);
+ ret = 0;
} else {
unsigned int lookup_flags = 0;
@@ -1122,22 +1220,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
lookup_flags |= LOOKUP_DIRECTORY;
ret = user_path_at(dfd, filename, lookup_flags, path);
- if (ret)
- goto out;
}
-
- /* you can only watch an inode if you have read permissions on it */
- ret = path_permission(path, MAY_READ);
- if (ret) {
- path_put(path);
- goto out;
- }
-
- ret = security_path_notify(path, mask, obj_type);
- if (ret)
- path_put(path);
-
-out:
return ret;
}
@@ -1334,6 +1417,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
* A group with FAN_UNLIMITED_MARKS does not contribute to mark count
* in the limited groups account.
*/
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS));
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
@@ -1464,6 +1548,10 @@ out:
fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
+
+ if (!ret && (mask & FANOTIFY_PERM_EVENTS))
+ fanotify_perm_watchdog_group_add(group);
+
return ret;
}
@@ -1471,7 +1559,7 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
struct fanotify_event *oevent;
- oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
+ oevent = kmalloc_obj(*oevent, GFP_KERNEL_ACCOUNT);
if (!oevent)
return NULL;
@@ -1495,29 +1583,36 @@ static struct hlist_head *fanotify_alloc_merge_hash(void)
return hash;
}
+DEFINE_CLASS(fsnotify_group,
+ struct fsnotify_group *,
+ if (!IS_ERR_OR_NULL(_T)) fsnotify_destroy_group(_T),
+ fsnotify_alloc_group(ops, flags),
+ const struct fsnotify_ops *ops, int flags)
+
/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
- struct fsnotify_group *group;
+ struct user_namespace *user_ns = current_user_ns();
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
unsigned int internal_flags = 0;
- struct file *file;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
- if (!capable(CAP_SYS_ADMIN)) {
- /*
- * An unprivileged user can setup an fanotify group with
- * limited functionality - an unprivileged group is limited to
- * notification events with file handles and it cannot use
- * unlimited queue/marks.
- */
- if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
- return -EPERM;
+ /*
+ * An unprivileged user can setup an fanotify group with limited
+ * functionality - an unprivileged group is limited to notification
+ * events with file handles or mount ids and it cannot use unlimited
+ * queue/marks.
+ */
+ if (((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
+ !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) {
/*
* Setting the internal flag FANOTIFY_UNPRIV on the group
* prevents setting mount/filesystem marks on this group and
@@ -1586,42 +1681,36 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
- /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
+ CLASS(fsnotify_group, group)(&fanotify_fsnotify_ops,
FSNOTIFY_GROUP_USER);
- if (IS_ERR(group)) {
+ /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
+ if (IS_ERR(group))
return PTR_ERR(group);
- }
/* Enforce groups limits per user in all containing user ns */
- group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
- current_euid(),
+ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
UCOUNT_FANOTIFY_GROUPS);
- if (!group->fanotify_data.ucounts) {
- fd = -EMFILE;
- goto out_destroy_group;
- }
+ if (!group->fanotify_data.ucounts)
+ return -EMFILE;
group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->user_ns = get_user_ns(user_ns);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
- if (!group->fanotify_data.merge_hash) {
- fd = -ENOMEM;
- goto out_destroy_group;
- }
+ if (!group->fanotify_data.merge_hash)
+ return -ENOMEM;
group->overflow_event = fanotify_alloc_overflow_event();
- if (unlikely(!group->overflow_event)) {
- fd = -ENOMEM;
- goto out_destroy_group;
- }
+ if (unlikely(!group->overflow_event))
+ return -ENOMEM;
if (force_o_largefile())
event_f_flags |= O_LARGEFILE;
group->fanotify_data.f_flags = event_f_flags;
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
+ INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list);
switch (class) {
case FAN_CLASS_NOTIF:
group->priority = FSNOTIFY_PRIO_NORMAL;
@@ -1633,47 +1722,26 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
break;
default:
- fd = -EINVAL;
- goto out_destroy_group;
+ return -EINVAL;
}
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
if (flags & FAN_UNLIMITED_QUEUE) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
group->max_events = UINT_MAX;
} else {
group->max_events = fanotify_max_queued_events;
}
- if (flags & FAN_UNLIMITED_MARKS) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
- }
-
if (flags & FAN_ENABLE_AUDIT) {
- fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
- goto out_destroy_group;
+ return -EPERM;
}
- fd = get_unused_fd_flags(f_flags);
- if (fd < 0)
- goto out_destroy_group;
-
- file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
- f_flags, FMODE_NONOTIFY);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto out_destroy_group;
- }
- fd_install(fd, file);
- return fd;
-
-out_destroy_group:
- fsnotify_destroy_group(group);
+ fd = FD_ADD(f_flags,
+ anon_inode_getfile_fmode("[fanotify]", &fanotify_fops,
+ group, f_flags, FMODE_NONOTIFY));
+ if (fd >= 0)
+ retain_and_null_ptr(group);
return fd;
}
@@ -1811,6 +1879,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
+ struct user_namespace *user_ns = NULL;
+ struct mnt_namespace *mntns;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
@@ -1904,13 +1974,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/*
- * An unprivileged user is not allowed to setup mount nor filesystem
- * marks. This also includes setting up such marks by a group that
- * was initialized by an unprivileged user.
+ * A user is allowed to setup sb/mount/mntns marks only if it is
+ * capable in the user ns where the group was created.
*/
- if ((!capable(CAP_SYS_ADMIN) ||
- FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
- mark_type != FAN_MARK_INODE)
+ if (mark_type != FAN_MARK_INODE &&
+ !ns_capable(group->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -1961,12 +2029,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
- if (mark_type == FAN_MARK_MOUNT)
- fsnotify_clear_vfsmount_marks_by_group(group);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- fsnotify_clear_sb_marks_by_group(group);
- else
- fsnotify_clear_inode_marks_by_group(group);
+ fsnotify_clear_marks_by_group(group, obj_type);
return 0;
}
@@ -1981,6 +2044,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto path_put_and_out;
}
+ /* you can only watch an inode if you have read permissions on it */
+ ret = path_permission(&path, MAY_READ);
+ if (ret)
+ goto path_put_and_out;
+
+ ret = security_path_notify(&path, mask, obj_type);
+ if (ret)
+ goto path_put_and_out;
+
if (fid_mode) {
ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
if (ret)
@@ -1993,18 +2065,34 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fsid = &__fsid;
}
- /* inode held in place by reference to path; group by fget on fd */
+ /*
+ * In addition to being capable in the user ns where group was created,
+ * the user also needs to be capable in the user ns associated with
+ * the filesystem or in the user ns associated with the mntns
+ * (when marking mntns).
+ */
if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt->mnt_sb;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
- obj = mnt_ns_from_dentry(path.dentry);
+ ret = -EINVAL;
+ mntns = mnt_ns_from_dentry(path.dentry);
+ if (!mntns)
+ goto path_put_and_out;
+ user_ns = mntns->user_ns;
+ obj = mntns;
}
+ ret = -EPERM;
+ if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ goto path_put_and_out;
+
ret = -EINVAL;
if (!obj)
goto path_put_and_out;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1161eabf11ee..0f731eddeb8b 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -17,6 +17,7 @@
#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
+#include "../internal.h"
#if defined(CONFIG_PROC_FS)
@@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f->handle_bytes >> 2;
+ if (!super_trylock_shared(inode->i_sb))
+ return;
+
ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size);
+ up_read(&inode->i_sb->s_umount);
+
if ((ret == FILEID_INVALID) || (ret < 0))
return;
@@ -78,7 +84,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(fsnotify_conn_inode(mark->connector));
if (inode) {
- seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
+ seq_printf(m, "inotify wd:%x ino:%llx sdev:%x mask:%x ignored_mask:0 ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
inotify_mark_user_mask(mark));
show_mark_fhandle(m, inode);
@@ -105,7 +111,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode = igrab(fsnotify_conn_inode(mark->connector));
if (!inode)
return;
- seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+ seq_printf(m, "fanotify ino:%llx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
inode->i_ino, inode->i_sb->s_dev,
mflags, mark->mask, mark->ignore_mask);
show_mark_fhandle(m, inode);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index e2b4f17a48bb..2dac70b99b0d 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -33,65 +33,6 @@ void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
fsnotify_clear_marks_by_mntns(mntns);
}
-/**
- * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-static void fsnotify_unmount_inodes(struct super_block *sb)
-{
- struct inode *inode, *iput_inode = NULL;
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We cannot __iget() an inode in state I_FREEING,
- * I_WILL_FREE, or I_NEW which is fine because by that point
- * the inode cannot have any associated watches.
- */
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- /*
- * If i_count is zero, the inode cannot have any watches and
- * doing an __iget/iput with SB_ACTIVE clear would actually
- * evict all inodes with zero i_count from icache which is
- * unnecessarily violent and may in fact be illegal to do.
- * However, we should have been called /after/ evict_inodes
- * removed all zero refcount inodes, in any case. Test to
- * be sure.
- */
- if (!atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
-
- iput(iput_inode);
-
- /* for each watch, send FS_UNMOUNT and then remove it */
- fsnotify_inode(inode, FS_UNMOUNT);
-
- fsnotify_inode_delete(inode);
-
- iput_inode = inode;
-
- cond_resched();
- spin_lock(&sb->s_inode_list_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- iput(iput_inode);
-}
-
void fsnotify_sb_delete(struct super_block *sb)
{
struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
@@ -100,7 +41,7 @@ void fsnotify_sb_delete(struct super_block *sb)
if (!sbinfo)
return;
- fsnotify_unmount_inodes(sb);
+ fsnotify_unmount_inodes(sbinfo);
fsnotify_clear_marks_by_sb(sb);
/* Wait for outstanding object references from connectors */
wait_var_event(fsnotify_sb_watched_objects(sb),
@@ -112,7 +53,10 @@ void fsnotify_sb_delete(struct super_block *sb)
void fsnotify_sb_free(struct super_block *sb)
{
- kfree(sb->s_fsnotify_info);
+ if (sb->s_fsnotify_info) {
+ WARN_ON_ONCE(!list_empty(&sb->s_fsnotify_info->inode_conn_list));
+ kfree(sb->s_fsnotify_info);
+ }
}
/*
@@ -132,7 +76,7 @@ void fsnotify_set_children_dentry_flags(struct inode *inode)
spin_lock(&inode->i_lock);
/* run all of the dentries associated with this inode. Since this is a
* directory, there damn well better only be one item on this list */
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ for_each_alias(alias, inode) {
struct dentry *child;
/* run all of the children of the original inode and fix their
@@ -199,8 +143,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
}
/* Are there any inode/mount/sb objects that watch for these events? */
-static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
- __u32 mask)
+static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+ __u32 mask)
{
__u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask |
READ_ONCE(inode->i_sb->s_fsnotify_mask);
@@ -270,8 +214,15 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
/*
* Include parent/name in notification either if some notification
* groups require parent info or the parent is interested in this event.
+ * The parent interest in ACCESS/MODIFY events does not apply to special
+ * files, where read/write are not on the filesystem of the parent and
+ * events can provide an undesirable side-channel for information
+ * exfiltration.
*/
- parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
+ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS &&
+ !(data_type == FSNOTIFY_EVENT_PATH &&
+ d_is_special(dentry) &&
+ (mask & (FS_ACCESS | FS_MODIFY)));
if (parent_needed || parent_interested) {
/* When notifying parent, child should be passed as data */
WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
@@ -437,7 +388,7 @@ static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector
return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}
-static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
+struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
struct hlist_node *node = NULL;
@@ -656,20 +607,20 @@ EXPORT_SYMBOL_GPL(fsnotify);
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
- * At open time we check fsnotify_sb_has_priority_watchers() and set the
- * FMODE_NONOTIFY_ mode bits accordignly.
+ * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm
+ * hook and set the FMODE_NONOTIFY_ mode bits accordignly.
* Later, fsnotify permission hooks do not check if there are permission event
* watches, but that there were permission event watches at open time.
*/
-void file_set_fsnotify_mode_from_watchers(struct file *file)
+int fsnotify_open_perm_and_set_mode(struct file *file)
{
struct dentry *dentry = file->f_path.dentry, *parent;
struct super_block *sb = dentry->d_sb;
- __u32 mnt_mask, p_mask;
+ __u32 mnt_mask, p_mask = 0;
/* Is it a file opened by fanotify? */
if (FMODE_FSNOTIFY_NONE(file->f_mode))
- return;
+ return 0;
/*
* Permission events is a super set of pre-content events, so if there
@@ -679,45 +630,64 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
if (likely(!fsnotify_sb_has_priority_watchers(sb,
FSNOTIFY_PRIO_CONTENT))) {
file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
- return;
+ return 0;
}
/*
- * If there are permission event watchers but no pre-content event
- * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ * OK, there are some permission event watchers. Check if anybody is
+ * watching for permission events on *this* file.
*/
- if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
- likely(!fsnotify_sb_has_priority_watchers(sb,
- FSNOTIFY_PRIO_PRE_CONTENT))) {
- file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
- return;
+ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
+ p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask,
+ ALL_FSNOTIFY_PERM_EVENTS);
+ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
+ parent = dget_parent(dentry);
+ p_mask |= fsnotify_inode_watches_children(d_inode(parent));
+ dput(parent);
}
/*
- * OK, there are some pre-content watchers. Check if anybody is
- * watching for pre-content events on *this* file.
+ * Legacy FAN_ACCESS_PERM events have very high performance overhead,
+ * so unlikely to be used in the wild. If they are used there will be
+ * no optimizations at all.
*/
- mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
- if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
- FSNOTIFY_PRE_CONTENT_EVENTS))) {
- /* Enable pre-content events */
+ if (unlikely(p_mask & FS_ACCESS_PERM)) {
+ /* Enable all permission and pre-content events */
file_set_fsnotify_mode(file, 0);
- return;
+ goto open_perm;
}
- /* Is parent watching for pre-content events on this file? */
- if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
- parent = dget_parent(dentry);
- p_mask = fsnotify_inode_watches_children(d_inode(parent));
- dput(parent);
- if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) {
- /* Enable pre-content events */
- file_set_fsnotify_mode(file, 0);
- return;
- }
+ /*
+ * Pre-content events are only supported on regular files.
+ * If there are pre-content event watchers and no permission access
+ * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ * That is the common case with HSM service.
+ */
+ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) {
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY |
+ FMODE_NONOTIFY_PERM);
+ goto open_perm;
}
- /* Nobody watching for pre-content events from this file */
- file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
+
+ /* Nobody watching permission and pre-content events on this file */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
+
+open_perm:
+ /*
+ * Send open perm events depending on object masks and regardless of
+ * FMODE_NONOTIFY_PERM.
+ */
+ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) {
+ int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
+
+ if (ret)
+ return ret;
+ }
+
+ if (p_mask & FS_OPEN_PERM)
+ return fsnotify_path(&file->f_path, FS_OPEN_PERM);
+
+ return 0;
}
#endif
@@ -751,8 +721,7 @@ static __init int fsnotify_init(void)
if (ret)
panic("initializing fsnotify_mark_srcu");
- fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
- SLAB_PANIC);
+ fsnotify_init_connector_caches();
return 0;
}
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5950c7a67f41..58c7bb25e571 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -77,6 +77,9 @@ extern struct srcu_struct fsnotify_mark_srcu;
extern int fsnotify_compare_groups(struct fsnotify_group *a,
struct fsnotify_group *b);
+/* Destroy all inode marks for given superblock */
+void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo);
+
/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
@@ -106,6 +109,6 @@ static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
*/
extern void fsnotify_set_children_dentry_flags(struct inode *inode);
-extern struct kmem_cache *fsnotify_mark_connector_cachep;
+void fsnotify_init_connector_caches(void);
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 18446b7b0d49..b56d1c1d9644 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -117,7 +117,7 @@ static struct fsnotify_group *__fsnotify_alloc_group(
{
struct fsnotify_group *group;
- group = kzalloc(sizeof(struct fsnotify_group), gfp);
+ group = kzalloc_obj(struct fsnotify_group, gfp);
if (!group)
return ERR_PTR(-ENOMEM);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index cd7d11b0eb08..7c326ec2e8a8 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -10,7 +10,7 @@
* Copyright 2006 Hewlett-Packard Development Company, L.P.
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
- * inotify was largely rewriten to make use of the fsnotify infrastructure
+ * inotify was largely rewritten to make use of the fsnotify infrastructure
*/
#include <linux/dcache.h> /* d_unlinked */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b372fb2c56bd..ed37491c1618 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -573,7 +573,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* update the inode with this new fsn_mark */
if (dropped || do_inode)
- fsnotify_recalc_mask(inode->i_fsnotify_marks);
+ fsnotify_recalc_mask(fsn_mark->connector);
}
@@ -621,6 +621,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
+ dec_inotify_watches(group->inotify_data.ucounts);
goto out_err;
}
@@ -660,7 +661,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
if (IS_ERR(group))
return group;
- oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
+ oevent = kmalloc_obj(struct inotify_event_info, GFP_KERNEL_ACCOUNT);
if (unlikely(!oevent)) {
fsnotify_destroy_group(group);
return ERR_PTR(-ENOMEM);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 798340db69d7..e256b420100d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -79,7 +79,8 @@
#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
struct srcu_struct fsnotify_mark_srcu;
-struct kmem_cache *fsnotify_mark_connector_cachep;
+static struct kmem_cache *fsnotify_mark_connector_cachep;
+static struct kmem_cache *fsnotify_inode_mark_connector_cachep;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
@@ -237,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
return inode;
}
-static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+/*
+ * Calculate mask of events for a list of marks.
+ *
+ * Return true if any of the attached marks want to hold an inode reference.
+ */
+static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
bool want_iref = false;
@@ -261,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
*/
WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
+ return want_iref;
+}
+
+/*
+ * Calculate mask of events for a list of marks after attach/modify mark
+ * and get an inode reference for the connector if needed.
+ *
+ * A concurrent add of evictable mark and detach of non-evictable mark can
+ * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
+ * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
+ * from fsnotify_put_mark().
+ */
+static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
+
+ (void) fsnotify_update_iref(conn, want_iref);
+}
+
+/*
+ * Calculate mask of events for a list of marks after detach mark
+ * and return the inode object if its reference is no longer needed.
+ */
+static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
+{
+ bool want_iref = __fsnotify_recalc_mask(conn);
+
return fsnotify_update_iref(conn, want_iref);
}
@@ -297,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
spin_lock(&conn->lock);
update_children = !fsnotify_conn_watches_children(conn);
- __fsnotify_recalc_mask(conn);
+ fsnotify_recalc_mask_set_iref(conn);
update_children &= fsnotify_conn_watches_children(conn);
spin_unlock(&conn->lock);
/*
@@ -323,10 +357,12 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
while (conn) {
free = conn;
conn = conn->destroy_next;
- kmem_cache_free(fsnotify_mark_connector_cachep, free);
+ kfree(free);
}
}
+static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn);
+
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
@@ -342,6 +378,7 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ fsnotify_untrack_connector(conn);
/* Unpin inode when detaching from connector */
if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
@@ -415,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
/* Update watched objects after detaching mark */
if (sb)
fsnotify_update_sb_watchers(sb, conn);
- objp = __fsnotify_recalc_mask(conn);
+ objp = fsnotify_recalc_mask_clear_iref(conn);
type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
@@ -428,7 +465,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn->destroy_next = connector_destroy_list;
connector_destroy_list = conn;
spin_unlock(&destroy_lock);
- queue_work(system_unbound_wq, &connector_reaper_work);
+ queue_work(system_dfl_wq, &connector_reaper_work);
}
/*
* Note that we didn't update flags telling whether inode cares about
@@ -439,7 +476,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
+ queue_delayed_work(system_dfl_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);
@@ -453,9 +490,6 @@ EXPORT_SYMBOL_GPL(fsnotify_put_mark);
*/
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
- if (!mark)
- return true;
-
if (refcount_inc_not_zero(&mark->refcnt)) {
spin_lock(&mark->lock);
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
@@ -496,15 +530,22 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
int type;
fsnotify_foreach_iter_type(type) {
+ struct fsnotify_mark *mark = iter_info->marks[type];
+
/* This can fail if mark is being removed */
- if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
- __release(&fsnotify_mark_srcu);
- goto fail;
+ while (mark && !fsnotify_get_mark_safe(mark)) {
+ if (mark->group == iter_info->current_group) {
+ __release(&fsnotify_mark_srcu);
+ goto fail;
+ }
+ /* This is a mark in an unrelated group, skip */
+ mark = fsnotify_next_mark(mark);
+ iter_info->marks[type] = mark;
}
}
/*
- * Now that both marks are pinned by refcount in the inode / vfsmount
+ * Now that all marks are pinned by refcount in the inode / vfsmount / etc
* lists, we can drop SRCU lock, and safely resume the list iteration
* once userspace returns.
*/
@@ -640,10 +681,12 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb)
struct fsnotify_sb_info *sbinfo;
/* sb info is freed on fsnotify_sb_delete() */
- sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
+ sbinfo = kzalloc_obj(*sbinfo);
if (!sbinfo)
return -ENOMEM;
+ INIT_LIST_HEAD(&sbinfo->inode_conn_list);
+ spin_lock_init(&sbinfo->list_lock);
/*
* cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
* will observe an initialized structure
@@ -655,20 +698,123 @@ static int fsnotify_attach_info_to_sb(struct super_block *sb)
return 0;
}
-static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- void *obj, unsigned int obj_type)
+struct fsnotify_inode_mark_connector {
+ struct fsnotify_mark_connector common;
+ struct list_head conns_list;
+};
+
+static struct inode *fsnotify_get_living_inode(struct fsnotify_sb_info *sbinfo)
{
- struct fsnotify_mark_connector *conn;
+ struct fsnotify_inode_mark_connector *iconn;
+ struct inode *inode;
+
+ spin_lock(&sbinfo->list_lock);
+ /* Find the first non-evicting inode */
+ list_for_each_entry(iconn, &sbinfo->inode_conn_list, conns_list) {
+ /* All connectors on the list are still attached to an inode */
+ inode = iconn->common.obj;
+ /*
+ * For connectors without FSNOTIFY_CONN_FLAG_HAS_IREF
+ * (evictable marks) corresponding inode may well have 0
+ * refcount and can be undergoing eviction. OTOH list_lock
+ * protects us from the connector getting detached and inode
+ * freed. So we can poke around the inode safely.
+ */
+ spin_lock(&inode->i_lock);
+ if (likely(
+ !(inode_state_read(inode) & (I_FREEING | I_WILL_FREE)))) {
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sbinfo->list_lock);
+ return inode;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&sbinfo->list_lock);
- conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
- if (!conn)
- return -ENOMEM;
+ return NULL;
+}
+
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting. Handle any watched inodes.
+ * @sbinfo: fsnotify info for superblock being unmounted.
+ *
+ * Walk all inode connectors for the superblock and free all associated marks.
+ */
+void fsnotify_unmount_inodes(struct fsnotify_sb_info *sbinfo)
+{
+ struct inode *inode;
+
+ while ((inode = fsnotify_get_living_inode(sbinfo))) {
+ fsnotify_inode(inode, FS_UNMOUNT);
+ fsnotify_clear_marks_by_inode(inode);
+ iput(inode);
+ cond_resched();
+ }
+}
+
+static void fsnotify_init_connector(struct fsnotify_mark_connector *conn,
+ void *obj, unsigned int obj_type)
+{
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
conn->flags = 0;
conn->prio = 0;
conn->type = obj_type;
conn->obj = obj;
+}
+
+static struct fsnotify_mark_connector *
+fsnotify_alloc_inode_connector(struct inode *inode)
+{
+ struct fsnotify_inode_mark_connector *iconn;
+ struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(inode->i_sb);
+
+ iconn = kmem_cache_alloc(fsnotify_inode_mark_connector_cachep,
+ GFP_KERNEL);
+ if (!iconn)
+ return NULL;
+
+ fsnotify_init_connector(&iconn->common, inode, FSNOTIFY_OBJ_TYPE_INODE);
+ spin_lock(&sbinfo->list_lock);
+ list_add(&iconn->conns_list, &sbinfo->inode_conn_list);
+ spin_unlock(&sbinfo->list_lock);
+
+ return &iconn->common;
+}
+
+static void fsnotify_untrack_connector(struct fsnotify_mark_connector *conn)
+{
+ struct fsnotify_inode_mark_connector *iconn;
+ struct fsnotify_sb_info *sbinfo;
+
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+ return;
+
+ iconn = container_of(conn, struct fsnotify_inode_mark_connector, common);
+ sbinfo = fsnotify_sb_info(fsnotify_conn_inode(conn)->i_sb);
+ spin_lock(&sbinfo->list_lock);
+ list_del(&iconn->conns_list);
+ spin_unlock(&sbinfo->list_lock);
+}
+
+static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
+ void *obj, unsigned int obj_type)
+{
+ struct fsnotify_mark_connector *conn;
+
+ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
+ struct inode *inode = obj;
+
+ conn = fsnotify_alloc_inode_connector(inode);
+ } else {
+ conn = kmem_cache_alloc(fsnotify_mark_connector_cachep,
+ GFP_KERNEL);
+ if (conn)
+ fsnotify_init_connector(conn, obj, obj_type);
+ }
+ if (!conn)
+ return -ENOMEM;
/*
* cmpxchg() provides the barrier so that readers of *connp can see
@@ -676,7 +822,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+ fsnotify_untrack_connector(conn);
+ kfree(conn);
}
return 0;
}
@@ -1007,3 +1154,12 @@ void fsnotify_wait_marks_destroyed(void)
flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
+
+__init void fsnotify_init_connector_caches(void)
+{
+ fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
+ SLAB_PANIC);
+ fsnotify_inode_mark_connector_cachep = KMEM_CACHE(
+ fsnotify_inode_mark_connector,
+ SLAB_PANIC);
+}