summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorJoel Becker <jlbec@evilplan.org>2011-05-25 21:51:55 -0700
committerJoel Becker <jlbec@evilplan.org>2011-05-25 21:51:55 -0700
commitece928df16494becd43f999aff9bd530182e7e81 (patch)
tree905042764ea5d8ab6eda63666406e19f607bcf4c /fs
parent3d1c1829ebe7e8bb48a997b39b4865abc9197e5e (diff)
parentdda54e76d7dba0532ebdd72e0b4f492a03f83225 (diff)
downloadlwn-ece928df16494becd43f999aff9bd530182e7e81.tar.gz
lwn-ece928df16494becd43f999aff9bd530182e7e81.zip
Merge branch 'move_extents' of git://oss.oracle.com/git/tye/linux-2.6 into ocfs2-merge-window
Conflicts: fs/ocfs2/ioctl.c
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/dlm/config.c9
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c182
-rw-r--r--fs/dlm/lock.h1
-rw-r--r--fs/dlm/lockspace.c6
-rw-r--r--fs/dlm/plock.c65
-rw-r--r--fs/dlm/user.c1
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext3/namei.c80
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/journal.c16
-rw-r--r--fs/jbd/transaction.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/ioctl.c468
-rw-r--r--fs/ocfs2/move_extents.c1153
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h68
-rw-r--r--fs/ocfs2/refcounttree.c58
-rw-r--r--fs/ocfs2/refcounttree.h11
-rw-r--r--fs/ubifs/budget.c104
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c167
-rw-r--r--fs/ubifs/debug.h178
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/file.c28
-rw-r--r--fs/ubifs/find.c10
-rw-r--r--fs/ubifs/gc.c71
-rw-r--r--fs/ubifs/io.c33
-rw-r--r--fs/ubifs/journal.c29
-rw-r--r--fs/ubifs/log.c28
-rw-r--r--fs/ubifs/lprops.c115
-rw-r--r--fs/ubifs/lpt_commit.c55
-rw-r--r--fs/ubifs/master.c8
-rw-r--r--fs/ubifs/misc.h17
-rw-r--r--fs/ubifs/orphan.c3
-rw-r--r--fs/ubifs/recovery.c354
-rw-r--r--fs/ubifs/replay.c468
-rw-r--r--fs/ubifs/sb.c153
-rw-r--r--fs/ubifs/super.c46
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/tnc_commit.c18
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h86
-rw-r--r--fs/ubifs/xattr.c8
47 files changed, 3158 insertions, 1047 deletions
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
int res;
char buf[16];
+ memset(&bprm, 0, sizeof(bprm));
+
/* Create the file name */
sprintf(buf, "/lib/lib%d.so", id);
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (!bprm.cred)
goto out;
+ /* We don't really care about recalculating credentials at this point
+ * as we're past the point of no return and are dealing with shared
+ * libraries.
+ */
+ bprm.cred_prepared = 1;
+
res = prepare_binprm(&bprm);
if (!IS_ERR_VALUE(res))
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
unsigned int cl_log_debug;
unsigned int cl_protocol;
unsigned int cl_timewarn_cs;
+ unsigned int cl_waitwarn_us;
};
enum {
@@ -114,6 +115,7 @@ enum {
CLUSTER_ATTR_LOG_DEBUG,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_TIMEWARN_CS,
+ CLUSTER_ATTR_WAITWARN_US,
};
struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
CLUSTER_ATTR(log_debug, 0);
CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+ [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
NULL,
};
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_protocol = dlm_config.ci_protocol;
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+ cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_LOG_DEBUG 0
#define DEFAULT_PROTOCOL 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US 0
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
.ci_scan_secs = DEFAULT_SCAN_SECS,
.ci_log_debug = DEFAULT_LOG_DEBUG,
.ci_protocol = DEFAULT_PROTOCOL,
- .ci_timewarn_cs = DEFAULT_TIMEWARN_CS
+ .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+ .ci_waitwarn_us = DEFAULT_WAITWARN_US
};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
int ci_log_debug;
int ci_protocol;
int ci_timewarn_cs;
+ int ci_waitwarn_us;
};
extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
+#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
@@ -245,6 +246,7 @@ struct dlm_lkb {
int8_t lkb_wait_type; /* type of reply waiting for */
int8_t lkb_wait_count;
+ int lkb_wait_nodeid; /* for debugging */
struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
struct list_head lkb_ownqueue; /* list of locks for a process */
struct list_head lkb_time_list;
ktime_t lkb_timestamp;
+ ktime_t lkb_wait_time;
unsigned long lkb_timeout_cs;
struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
return -1;
}
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+ int i;
+
+ for (i = 0; i < num_nodes; i++) {
+ if (!warned[i]) {
+ warned[i] = nodeid;
+ return 0;
+ }
+ if (warned[i] == nodeid)
+ return 1;
+ }
+ return 0;
+}
+
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb;
+ ktime_t zero = ktime_set(0, 0);
+ s64 us;
+ s64 debug_maxus = 0;
+ u32 debug_scanned = 0;
+ u32 debug_expired = 0;
+ int num_nodes = 0;
+ int *warned = NULL;
+
+ if (!dlm_config.ci_waitwarn_us)
+ return;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (ktime_equal(lkb->lkb_wait_time, zero))
+ continue;
+
+ debug_scanned++;
+
+ us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+ if (us < dlm_config.ci_waitwarn_us)
+ continue;
+
+ lkb->lkb_wait_time = zero;
+
+ debug_expired++;
+ if (us > debug_maxus)
+ debug_maxus = us;
+
+ if (!num_nodes) {
+ num_nodes = ls->ls_num_nodes;
+ warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int));
+ if (warned)
+ memset(warned, 0, num_nodes * sizeof(int));
+ }
+ if (!warned)
+ continue;
+ if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+ continue;
+
+ log_error(ls, "waitwarn %x %lld %d us check connection to "
+ "node %d", lkb->lkb_id, (long long)us,
+ dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ if (warned)
+ kfree(warned);
+
+ if (debug_expired)
+ log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+ debug_scanned, debug_expired,
+ dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
+ lkb->lkb_wait_time = ktime_get();
+ lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
- if (ms != &ls->ls_stub_ms)
+ if (ms->m_flags != DLM_IFL_STUB_MS)
mutex_lock(&ls->ls_waiters_mutex);
error = _remove_from_waiters(lkb, ms->m_type, ms);
- if (ms != &ls->ls_stub_ms)
+ if (ms->m_flags != DLM_IFL_STUB_MS)
mutex_unlock(&ls->ls_waiters_mutex);
return error;
}
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
+
+ if (!dlm_config.ci_waitwarn_us)
+ return;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (ktime_to_us(lkb->lkb_wait_time))
+ lkb->lkb_wait_time = ktime_get();
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
}
/* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
compatible with other granted locks */
-static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_demoted(struct dlm_lkb *lkb)
{
- if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
- log_print("munge_demoted %x invalid reply type %d",
- lkb->lkb_id, ms->m_type);
- return;
- }
-
if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
log_print("munge_demoted %x invalid modes gr %d rq %d",
lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
struct dlm_mhandle *mh;
int to_nodeid, error;
- error = add_to_waiters(lkb, mstype);
+ to_nodeid = r->res_nodeid;
+
+ error = add_to_waiters(lkb, mstype, to_nodeid);
if (error)
return error;
- to_nodeid = r->res_nodeid;
-
error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
if (error)
goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
/* down conversions go without a reply from the master */
if (!error && down_conversion(lkb)) {
remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+ r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
r->res_ls->ls_stub_ms.m_result = 0;
- r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
}
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
struct dlm_mhandle *mh;
int to_nodeid, error;
- error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
if (error)
return error;
- to_nodeid = dlm_dir_nodeid(r);
-
error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
if (error)
goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
+ if (ms->m_flags == DLM_IFL_STUB_MS)
+ return;
+
lkb->lkb_sbflags = ms->m_sbflags;
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
(ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
/* convert was queued on remote master */
receive_flags_reply(lkb, ms);
if (is_demoted(lkb))
- munge_demoted(lkb, ms);
+ munge_demoted(lkb);
del_lkb(r, lkb);
add_lkb(r, lkb, DLM_LKSTS_CONVERT);
add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
/* convert was granted on remote master */
receive_flags_reply(lkb, ms);
if (is_demoted(lkb))
- munge_demoted(lkb, ms);
+ munge_demoted(lkb);
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
dlm_put_lockspace(ls);
}
-static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms_stub)
{
if (middle_conversion(lkb)) {
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
- ls->ls_stub_ms.m_result = -EINPROGRESS;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_convert_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
+ ms_stub->m_result = -EINPROGRESS;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_convert_reply(lkb, ms_stub);
/* Same special case as in receive_rcom_lock_args() */
lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
+ struct dlm_message *ms_stub;
int wait_type, stub_unlock_result, stub_cancel_result;
+ ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message));
+ if (!ms_stub) {
+ log_error(ls, "dlm_recover_waiters_pre no mem");
+ return;
+ }
+
mutex_lock(&ls->ls_waiters_mutex);
list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
- log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
- lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+ /* exclude debug messages about unlocks because there can be so
+ many and they aren't very interesting */
+
+ if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+ log_debug(ls, "recover_waiter %x nodeid %d "
+ "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
+ lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+ }
/* all outstanding lookups, regardless of destination will be
resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
break;
case DLM_MSG_CONVERT:
- recover_convert_waiter(ls, lkb);
+ recover_convert_waiter(ls, lkb, ms_stub);
break;
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
- ls->ls_stub_ms.m_result = stub_unlock_result;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
+ ms_stub->m_result = stub_unlock_result;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_unlock_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
case DLM_MSG_CANCEL:
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
- ls->ls_stub_ms.m_result = stub_cancel_result;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
+ ms_stub->m_result = stub_cancel_result;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_cancel_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
schedule();
}
mutex_unlock(&ls->ls_waiters_mutex);
+ kfree(ms_stub);
}
static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
ou = is_overlap_unlock(lkb);
err = 0;
- log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
- lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+ log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
+ lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
/* At this point we assume that we won't get a reply to any
previous op or overlap op on this lock. First, do a big
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
static int dlm_scand(void *data)
{
struct dlm_ls *ls;
- int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
while (!kthread_should_stop()) {
ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
+ dlm_scan_waiters(ls);
dlm_unlock_recovery(ls);
} else {
ls->ls_scan_time += HZ;
}
- } else {
- schedule_timeout_interruptible(timeout_jiffies);
+ continue;
}
+ schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
}
return 0;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
wake_up(&send_wq);
}
+/* If a process was killed while waiting for the only plock on a file,
+ locks_remove_posix will not see any lock on the file so it won't
+ send an unlock-close to us to pass on to userspace to clean up the
+ abandoned waiter. So, we have to insert the unlock-close when the
+ lock call is interrupted. */
+
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+ struct file *file, struct file_lock *fl)
+{
+ struct plock_op *op;
+
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op)
+ return;
+
+ op->info.optype = DLM_PLOCK_OP_UNLOCK;
+ op->info.pid = fl->fl_pid;
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
+ op->info.start = 0;
+ op->info.end = OFFSET_MAX;
+ if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+ op->info.owner = (__u64) fl->fl_pid;
+ else
+ op->info.owner = (__u64)(long) fl->fl_owner;
+
+ op->info.flags |= DLM_PLOCK_FL_CLOSE;
+ send_op(op);
+}
+
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
int cmd, struct file_lock *fl)
{
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
- if (xop->callback == NULL)
- wait_event(recv_wq, (op->done != 0));
- else {
+ if (xop->callback == NULL) {
+ rv = wait_event_killable(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ log_debug(ls, "dlm_posix_lock: wait killed %llx",
+ (unsigned long long)number);
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ kfree(xop);
+ do_unlock_close(ls, number, file, fl);
+ goto out;
+ }
+ } else {
rv = FILE_LOCK_DEFERRED;
goto out;
}
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
else
op->info.owner = (__u64)(long) fl->fl_owner;
+ if (fl->fl_flags & FL_CLOSE) {
+ op->info.flags |= DLM_PLOCK_FL_CLOSE;
+ send_op(op);
+ rv = 0;
+ goto out;
+ }
+
send_op(op);
wait_event(recv_wq, (op->done != 0));
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
spin_lock(&ops_lock);
if (!list_empty(&send_list)) {
op = list_entry(send_list.next, struct plock_op, list);
- list_move(&op->list, &recv_list);
+ if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+ list_del(&op->list);
+ else
+ list_move(&op->list, &recv_list);
memcpy(&info, &op->info, sizeof(info));
}
spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
if (!op)
return -EAGAIN;
+ /* there is no need to get a reply from userspace for unlocks
+ that were generated by the vfs cleaning up for a close
+ (the process did not make an unlock call). */
+
+ if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+ kfree(op);
+
if (copy_to_user(u, &info, sizeof(info)))
return -EFAULT;
return sizeof(info);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
out_sig:
sigprocmask(SIG_SETMASK, &tmpsig, NULL);
- recalc_sigpending();
out_free:
kfree(kbuf);
return error;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
if (!sb_set_blocksize(sb, blocksize)) {
- ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+ ext2_msg(sb, KERN_ERR,
+ "error: bad blocksize %d", blocksize);
goto failed_sbi;
}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+ /*
+ * Mark buffers dirty here so that if do_split() fails we write a
+ * consistent set of buffers to disk.
+ */
+ ext3_journal_dirty_metadata(handle, frame->bh);
+ ext3_journal_dirty_metadata(handle, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ ext3_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
handle_t *handle;
struct inode * inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
dquot_initialize(dir);
+ if (l > EXT3_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks.
+ */
+ credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext3_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2211,21 +2237,45 @@ retry:
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT3_I(inode)->i_data)) {
+ if (l > EXT3_N_BLOCKS * 4) {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
/*
- * page_symlink() calls into ext3_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext3_write_begin() which acquires page
+ * lock which ranks below transaction start (and it can also
+ * wait for journal commit if we are running out of space). So
+ * we have to stop transaction now and restart it when symlink
+ * contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
*/
+ drop_nlink(inode);
+ err = ext3_orphan_add(handle, inode);
+ ext3_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+ * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext3_journal_start(dir,
+ EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ inc_nlink(inode);
+ err = ext3_orphan_del(handle, inode);
if (err) {
+ ext3_journal_stop(handle);
drop_nlink(inode);
- unlock_new_inode(inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext3_link (struct dentry * old_dentry,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
* all outstanding updates to complete.
*/
-#ifdef COMMIT_STATS
- spin_lock(&journal->j_list_lock);
- summarise_journal_usage(journal);
- spin_unlock(&journal->j_list_lock);
-#endif
-
/* Do we need to erase the effects of a prior journal_flush? */
if (journal->j_flags & JFS_FLUSHED) {
jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /* Wake up any transactions which were waiting for this
- IO to complete */
+ /*
+ * Wake up any transactions which were waiting for this
+ * IO to complete. The barrier must be here so that changes
+ * by journal_file_buffer() take effect before wake_up_bit()
+ * does the waitqueue check.
+ */
+ smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
int __log_start_commit(journal_t *journal, tid_t target)
{
/*
- * Are we already doing a recent enough commit?
+ * The only transaction we can possibly wait upon is the
+ * currently running transaction (if it exists). Otherwise,
+ * the target tid must be an old one.
*/
- if (!tid_geq(journal->j_commit_request, target)) {
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
* commit thread. We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
- }
+ } else if (!tid_geq(journal->j_commit_request, target))
+ /* This should never happen, but if it does, preserve
+ the evidence before kjournald goes into a loop and
+ increments j_commit_sequence beyond all recognition. */
+ WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+ journal->j_commit_request, journal->j_commit_sequence,
+ target, journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0);
return 0;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
*/
handle_t *journal_start(journal_t *journal, int nblocks)
{
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..29148a81c783 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* all outstanding updates to complete.
*/
-#ifdef COMMIT_STATS
- spin_lock(&journal->j_list_lock);
- summarise_journal_usage(journal);
- spin_unlock(&journal->j_list_lock);
-#endif
-
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
jbd_debug(3, "super block updated\n");
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
namei.o \
refcounttree.o \
reservations.o \
+ move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 312a28f433a4..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
#include "ioctl.h"
#include "resize.h"
#include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
#include <linux/ext2_fs.h>
@@ -35,31 +40,27 @@
* be -EFAULT. The error will be returned from the ioctl(2) call. It's
* just a best-effort to tell userspace that this request caused the error.
*/
-static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
struct ocfs2_info_request __user *req)
{
kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
(void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
}
-#define o2info_set_request_error(a, b) \
- __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
-
-static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags |= OCFS2_INFO_FL_FILLED;
}
-#define o2info_set_request_filled(a) \
- __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
-
-static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
{
req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
}
-#define o2info_clear_request_filled(a) \
- __o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+ return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
oib.ib_blocksize = inode->i_sb->s_blocksize;
- o2info_set_request_filled(oib);
+ o2info_set_request_filled(&oib.ib_req);
if (o2info_to_user(oib, req))
goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oib, req);
+ o2info_set_request_error(&oib.ib_req, req);
return status;
}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
oic.ic_clustersize = osb->s_clustersize;
- o2info_set_request_filled(oic);
+ o2info_set_request_filled(&oic.ic_req);
if (o2info_to_user(oic, req))
goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oic, req);
+ o2info_set_request_error(&oic.ic_req, req);
return status;
}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
oim.im_max_slots = osb->max_slots;
- o2info_set_request_filled(oim);
+ o2info_set_request_filled(&oim.im_req);
if (o2info_to_user(oim, req))
goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oim, req);
+ o2info_set_request_error(&oim.im_req, req);
return status;
}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
- o2info_set_request_filled(oil);
+ o2info_set_request_filled(&oil.il_req);
if (o2info_to_user(oil, req))
goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oil, req);
+ o2info_set_request_error(&oil.il_req, req);
return status;
}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
- o2info_set_request_filled(oiu);
+ o2info_set_request_filled(&oiu.iu_req);
if (o2info_to_user(oiu, req))
goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oiu, req);
+ o2info_set_request_error(&oiu.iu_req, req);
return status;
}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
oif.if_incompat_features = osb->s_feature_incompat;
oif.if_ro_compat_features = osb->s_feature_ro_compat;
- o2info_set_request_filled(oif);
+ o2info_set_request_filled(&oif.if_req);
if (o2info_to_user(oif, req))
goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oif, req);
+ o2info_set_request_error(&oif.if_req, req);
return status;
}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
oij.ij_journal_size = osb->journal->j_inode->i_size;
- o2info_set_request_filled(oij);
+ o2info_set_request_filled(&oij.ij_req);
if (o2info_to_user(oij, req))
goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oij, req);
+ o2info_set_request_error(&oij.ij_req, req);
+
+ return status;
+}
+
+int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi, u32 slot)
+{
+ int status = 0, unlock = 0;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *dinode_alloc = NULL;
+
+ if (inode_alloc)
+ mutex_lock(&inode_alloc->i_mutex);
+
+ if (o2info_coherent(&fi->ifi_req)) {
+ status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+ fi->ifi_stat[slot].lfi_total =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+ fi->ifi_stat[slot].lfi_free =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(inode_alloc, 0);
+
+ if (inode_alloc)
+ mutex_unlock(&inode_alloc->i_mutex);
+
+ brelse(bh);
+
+ return status;
+}
+
+int ocfs2_info_handle_freeinode(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u32 i;
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ struct ocfs2_info_freeinode *oifi = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *inode_alloc = NULL;
+
+ oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+ if (!oifi) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (o2info_from_user(*oifi, req))
+ goto bail;
+
+ oifi->ifi_slotnum = osb->max_slots;
+
+ for (i = 0; i < oifi->ifi_slotnum; i++) {
+ if (o2info_coherent(&oifi->ifi_req)) {
+ inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+ if (!inode_alloc) {
+ mlog(ML_ERROR, "unable to get alloc inode in "
+ "slot %u\n", i);
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+ if (status < 0)
+ goto bail;
+
+ iput(inode_alloc);
+ inode_alloc = NULL;
+ }
+
+ o2info_set_request_filled(&oifi->ifi_req);
+
+ if (o2info_to_user(*oifi, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oifi->ifi_req, req);
+
+ kfree(oifi);
+
+ return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+ unsigned int chunksize)
+{
+ int index;
+
+ index = __ilog2_u32(chunksize);
+ if (index >= OCFS2_INFO_MAX_HIST)
+ index = OCFS2_INFO_MAX_HIST - 1;
+
+ hist->fc_chunks[index]++;
+ hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+ unsigned int chunksize)
+{
+ if (chunksize > stats->ffs_max)
+ stats->ffs_max = chunksize;
+
+ if (chunksize < stats->ffs_min)
+ stats->ffs_min = chunksize;
+
+ stats->ffs_avg += chunksize;
+ stats->ffs_free_chunks_real++;
+}
+
+void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+ unsigned int chunksize)
+{
+ o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+ o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
+{
+ int status = 0, used;
+ u64 blkno;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_group_desc *bg = NULL;
+
+ unsigned int max_bits, num_clusters;
+ unsigned int offset = 0, cluster, chunk;
+ unsigned int chunk_free, last_chunksize = 0;
+
+ if (!le32_to_cpu(rec->c_free))
+ goto bail;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
+
+ if (o2info_coherent(&ffg->iff_req))
+ status = ocfs2_read_group_descriptor(gb_inode,
+ gb_dinode,
+ blkno, &bh);
+ else
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+ if (status < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # "
+ "%llu from device.", (unsigned long long)blkno);
+ status = -EIO;
+ goto bail;
+ }
+
+ bg = (struct ocfs2_group_desc *)bh->b_data;
+
+ if (!le16_to_cpu(bg->bg_free_bits_count))
+ continue;
+
+ max_bits = le16_to_cpu(bg->bg_bits);
+ offset = 0;
+
+ for (chunk = 0; chunk < chunks_in_group; chunk++) {
+ /*
+ * last chunk may be not an entire one.
+ */
+ if ((offset + ffg->iff_chunksize) > max_bits)
+ num_clusters = max_bits - offset;
+ else
+ num_clusters = ffg->iff_chunksize;
+
+ chunk_free = 0;
+ for (cluster = 0; cluster < num_clusters; cluster++) {
+ used = ocfs2_test_bit(offset,
+ (unsigned long *)bg->bg_bitmap);
+ /*
+ * - chunk_free counts free clusters in #N chunk.
+ * - last_chunksize records the size(in) clusters
+ * for the last real free chunk being counted.
+ */
+ if (!used) {
+ last_chunksize++;
+ chunk_free++;
+ }
+
+ if (used && last_chunksize) {
+ ocfs2_info_update_ffg(ffg,
+ last_chunksize);
+ last_chunksize = 0;
+ }
+
+ offset++;
+ }
+
+ if (chunk_free == ffg->iff_chunksize)
+ ffg->iff_ffs.ffs_free_chunks++;
+ }
+
+ /*
+ * need to update the info for last free chunk.
+ */
+ if (last_chunksize)
+ ocfs2_info_update_ffg(ffg, last_chunksize);
+
+ } while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+ brelse(bh);
+
+ return status;
+}
+
+int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
+{
+ u32 chunks_in_group;
+ int status = 0, unlock = 0, i;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_chain_list *cl = NULL;
+ struct ocfs2_chain_rec *rec = NULL;
+ struct ocfs2_dinode *gb_dinode = NULL;
+
+ if (gb_inode)
+ mutex_lock(&gb_inode->i_mutex);
+
+ if (o2info_coherent(&ffg->iff_req)) {
+ status = ocfs2_inode_lock(gb_inode, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+ cl = &(gb_dinode->id2.i_chain);
+
+ /*
+ * Chunksize(in) clusters from userspace should be
+ * less than clusters in a group.
+ */
+ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+ ffg->iff_ffs.ffs_min = ~0U;
+ ffg->iff_ffs.ffs_clusters =
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+ ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+ chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+ rec = &(cl->cl_recs[i]);
+ status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+ gb_dinode,
+ rec, ffg,
+ chunks_in_group);
+ if (status)
+ goto bail;
+ }
+
+ if (ffg->iff_ffs.ffs_free_chunks_real)
+ ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+ ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(gb_inode, 0);
+
+ if (gb_inode)
+ mutex_unlock(&gb_inode->i_mutex);
+
+ if (gb_inode)
+ iput(gb_inode);
+
+ brelse(bh);
+
+ return status;
+}
+
+int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+ struct ocfs2_info_freefrag *oiff;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *gb_inode = NULL;
+
+ oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+ if (!oiff) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (o2info_from_user(*oiff, req))
+ goto bail;
+ /*
+ * chunksize from userspace should be power of 2.
+ */
+ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+ (!oiff->iff_chunksize)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (o2info_coherent(&oiff->iff_req)) {
+ gb_inode = ocfs2_get_system_file_inode(osb, type,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+ OCFS2_INVALID_SLOT);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+ if (status < 0)
+ goto bail;
+
+ o2info_set_request_filled(&oiff->iff_req);
+
+ if (o2info_to_user(*oiff, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiff->iff_req, req);
+
+ kfree(oiff);
return status;
}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
if (o2info_from_user(oir, req))
goto bail;
- o2info_clear_request_filled(oir);
+ o2info_clear_request_filled(&oir);
if (o2info_to_user(oir, req))
goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
status = 0;
bail:
if (status)
- o2info_set_request_error(oir, req);
+ o2info_set_request_error(&oir, req);
return status;
}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
status = ocfs2_info_handle_journal_size(inode, req);
break;
+ case OCFS2_INFO_FREEINODE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+ status = ocfs2_info_handle_freeinode(inode, req);
+ break;
+ case OCFS2_INFO_FREEFRAG:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+ status = ocfs2_info_handle_freefrag(inode, req);
+ break;
default:
status = ocfs2_info_handle_unknown(inode, req);
break;
@@ -565,6 +975,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return 0;
}
+ case OCFS2_IOC_MOVE_EXT:
+ return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
default:
return -ENOTTY;
}
@@ -608,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
+ case OCFS2_IOC_MOVE_EXT:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "suballoc.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+ struct inode *inode;
+ struct file *file;
+ int auto_defrag;
+ int partial;
+ int credits;
+ u32 new_phys_cpos;
+ u32 clusters_moved;
+ u64 refcount_loc;
+ struct ocfs2_move_extents *range;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+ struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+ int ext_flags)
+{
+ int ret = 0, index;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec *rec, replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+ u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+ ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
+ p_cpos, new_p_cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+ new_p_cpos));
+
+ path = ocfs2_new_path_from_et(&context->et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ rec = &el->l_recs[index];
+
+ BUG_ON(ext_flags != rec->e_flags);
+ /*
+ * after moving/defraging to new location, the extent is not going
+ * to be refcounted anymore.
+ */
+ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ context->et.et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, &context->et, path, index,
+ &replace_rec, context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+ context->new_phys_cpos = new_p_cpos;
+
+ /*
+ * need I to append truncate log for old clusters?
+ */
+ if (old_blkno) {
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ old_blkno),
+ len, context->meta_ac,
+ &context->dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ old_blkno, len);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * lock allocators, and reserving appropriate number of bits for
+ * meta blocks and data clusters.
+ *
+ * in some cases, we don't need to reserve clusters, just let data_ac
+ * be NULL.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_move,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int extra_blocks,
+ int *credits)
+{
+ int ret, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
+ clusters_to_move + 2);
+
+ mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+ extra_blocks, clusters_to_move, *credits);
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Using one journal handle to guarantee the data consistency in case
+ * crash happens anywhere.
+ *
+ * XXX: defrag can end up with finishing partial extent as requested,
+ * due to not enough contiguous clusters can be found in allocator.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 new_phys_cpos, new_len;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ *len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+ &context->meta_ac,
+ &context->data_ac,
+ extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * should be using allocation reservation strategy there?
+ *
+ * if (context->data_ac)
+ * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ */
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
+ ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+ &new_phys_cpos, &new_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * allowing partial extent moving is kind of 'pros and cons', it makes
+ * whole defragmentation less likely to fail, on the contrary, the bad
+ * thing is it may make the fs even more fragmented after moving, let
+ * userspace make a good decision here.
+ */
+ if (new_len != *len) {
+ mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+ if (!partial) {
+ context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+ }
+
+ mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+ phys_cpos, new_phys_cpos);
+
+ ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+ new_phys_cpos, ext_flags);
+ if (ret)
+ mlog_errno(ret);
+
+ if (partial && (new_len != *len))
+ *len = new_len;
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (context->data_ac) {
+ ocfs2_free_alloc_context(context->data_ac);
+ context->data_ac = NULL;
+ }
+
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * find the victim alloc group, where #blkno fits.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+ u64 vict_blkno,
+ int type, int slot,
+ int *vict_bit,
+ struct buffer_head **ret_bh)
+{
+ int ret, i, blocks_per_unit = 1;
+ u64 blkno;
+ char namebuf[40];
+
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_chain_rec *rec;
+ struct ocfs2_dinode *ac_dinode;
+ struct ocfs2_group_desc *bg;
+
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+ strlen(namebuf), &blkno);
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+ cl = &(ac_dinode->id2.i_chain);
+ rec = &(cl->cl_recs[0]);
+
+ if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+ blocks_per_unit <<= (osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits);
+ /*
+ * 'vict_blkno' was out of the valid range.
+ */
+ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+ (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
+ blocks_per_unit))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+ rec = &(cl->cl_recs[i]);
+ if (!rec)
+ continue;
+
+ bg = NULL;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (gd_bh) {
+ brelse(gd_bh);
+ gd_bh = NULL;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+ le16_to_cpu(bg->bg_bits))) {
+
+ *ret_bh = gd_bh;
+ *vict_bit = (vict_blkno - blkno) /
+ blocks_per_unit;
+ mlog(0, "find the victim group: #%llu, "
+ "total_bits: %u, vict_bit: %u\n",
+ blkno, le16_to_cpu(bg->bg_bits),
+ *vict_bit);
+ goto out;
+ }
+
+ } while (le64_to_cpu(bg->bg_next_group));
+ }
+
+ ret = -EINVAL;
+out:
+ brelse(ac_bh);
+
+ /*
+ * caller has to release the gd_bh properly.
+ */
+ return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+ struct ocfs2_move_extents *range)
+{
+ int ret, goal_bit = 0;
+
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int c_to_b = 1 << (osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits);
+
+ /*
+ * validate goal sits within global_bitmap, and return the victim
+ * group desc
+ */
+ ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret)
+ goto out;
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ /*
+ * make goal become cluster aligned.
+ */
+ if (range->me_goal % c_to_b)
+ range->me_goal = range->me_goal / c_to_b * c_to_b;
+
+ /*
+ * moving goal is not allowd to start with a group desc blok(#0 blk)
+ * let's compromise to the latter cluster.
+ */
+ if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+ range->me_goal += c_to_b;
+
+ /*
+ * movement is not gonna cross two groups.
+ */
+ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+ range->me_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /*
+ * more exact validations/adjustments will be performed later during
+ * moving operation for each extent range.
+ */
+ mlog(0, "extents get ready to be moved to #%llu block\n",
+ range->me_goal);
+
+out:
+ brelse(gd_bh);
+
+ return ret;
+}
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+ int *goal_bit, u32 move_len, u32 max_hop,
+ u32 *phys_cpos)
+{
+ int i, used, last_free_bits = 0, base_bit = *goal_bit;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+ u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(gd->bg_blkno));
+
+ for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+ used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+ if (used) {
+ /*
+ * we even tried searching the free chunk by jumping
+ * a 'max_hop' distance, but still failed.
+ */
+ if ((i - base_bit) > max_hop) {
+ *phys_cpos = 0;
+ break;
+ }
+
+ if (last_free_bits)
+ last_free_bits = 0;
+
+ continue;
+ } else
+ last_free_bits++;
+
+ if (last_free_bits == move_len) {
+ *goal_bit = i;
+ *phys_cpos = base_cpos + i;
+ break;
+ }
+ }
+
+ mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ int ret;
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl =
+ (struct ocfs2_chain_list *) &di->id2.i_chain;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
+ ocfs2_journal_dirty(handle, di_bh);
+
+out:
+ return ret;
+}
+
+static inline int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits)
+{
+ int status;
+ void *bitmap = bg->bg_bitmap;
+ int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+ /* All callers get the descriptor via
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+ BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+ mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+ num_bits);
+
+ if (ocfs2_is_cluster_bitmap(alloc_inode))
+ journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ group_bh,
+ journal_type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
+ while (num_bits--)
+ ocfs2_set_bit(bit_off++, bitmap);
+
+ ocfs2_journal_dirty(handle, group_bh);
+
+bail:
+ return status;
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+ u32 len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct inode *gb_inode = NULL;
+ struct buffer_head *gb_bh = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *gd;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+ context->range->me_threshold);
+ u64 phys_blkno, new_phys_blkno;
+
+ phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+ &context->meta_ac,
+ NULL, extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * need to count 2 extra credits for global_bitmap inode and
+ * group descriptor.
+ */
+ credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+ /*
+ * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+ * logic, while we still need to lock the global_bitmap.
+ */
+ gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ mutex_lock(&gb_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_gb_mutex;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_tl_inode;
+ }
+
+ new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+ ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * probe the victim cluster group to find a proper
+ * region to fit wanted movement, it even will perfrom
+ * a best-effort attempt by compromising to a threshold
+ * around the goal.
+ */
+ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+ new_phys_cpos);
+ if (!new_phys_cpos) {
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+
+ ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+ *new_phys_cpos, ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+ goal_bit, len);
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+ brelse(gd_bh);
+
+out_unlock_tl_inode:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+ mutex_unlock(&gb_inode->i_mutex);
+ brelse(gb_bh);
+ iput(gb_inode);
+
+out:
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * Helper to calculate the defraging length in one run according to threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+ u32 threshold, int *skip)
+{
+ if ((*alloc_size + *len_defraged) < threshold) {
+ /*
+ * proceed defragmentation until we meet the thresh
+ */
+ *len_defraged += *alloc_size;
+ } else if (*len_defraged == 0) {
+ /*
+ * XXX: skip a large extent.
+ */
+ *skip = 1;
+ } else {
+ /*
+ * split this extent to coalesce with former pieces as
+ * to reach the threshold.
+ *
+ * we're done here with one cycle of defragmentation
+ * in a size of 'thresh', resetting 'len_defraged'
+ * forces a new defragmentation.
+ */
+ *alloc_size = threshold - *len_defraged;
+ *len_defraged = 0;
+ }
+}
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+ struct ocfs2_move_extents_context *context)
+{
+ int ret = 0, flags, do_defrag, skip = 0;
+ u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+ u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_move_extents *range = context->range;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if ((inode->i_size == 0) || (range->me_len == 0))
+ return 0;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+ ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+ /*
+ * TO-DO XXX:
+ *
+ * - xattr extents.
+ */
+
+ do_defrag = context->auto_defrag;
+
+ /*
+ * extents moving happens in unit of clusters, for the sake
+ * of simplicity, we may ignore two clusters where 'byte_start'
+ * and 'byte_start + len' were within.
+ */
+ move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+ len_to_move = (range->me_start + range->me_len) >>
+ osb->s_clustersize_bits;
+ if (len_to_move >= move_start)
+ len_to_move -= move_start;
+ else
+ len_to_move = 0;
+
+ if (do_defrag) {
+ defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+ if (defrag_thresh <= 1)
+ goto done;
+ } else
+ new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ range->me_goal);
+
+ mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+ "thresh: %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range->me_start,
+ (unsigned long long)range->me_len,
+ move_start, len_to_move, defrag_thresh);
+
+ cpos = move_start;
+ while (len_to_move) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+ &flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > len_to_move)
+ alloc_size = len_to_move;
+
+ /*
+ * XXX: how to deal with a hole:
+ *
+ * - skip the hole of course
+ * - force a new defragmentation
+ */
+ if (!phys_cpos) {
+ if (do_defrag)
+ len_defraged = 0;
+
+ goto next;
+ }
+
+ if (do_defrag) {
+ ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+ defrag_thresh, &skip);
+ /*
+ * skip large extents
+ */
+ if (skip) {
+ skip = 0;
+ goto next;
+ }
+
+ mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+ "alloc_size: %u, len_defraged: %u\n",
+ cpos, phys_cpos, alloc_size, len_defraged);
+
+ ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+ &alloc_size, flags);
+ } else {
+ ret = ocfs2_move_extent(context, cpos, phys_cpos,
+ &new_phys_cpos, alloc_size,
+ flags);
+
+ new_phys_cpos += alloc_size;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->clusters_moved += alloc_size;
+next:
+ cpos += alloc_size;
+ len_to_move -= alloc_size;
+ }
+
+done:
+ range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+ range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+ context->clusters_moved);
+ range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+ context->new_phys_cpos);
+
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &context->dealloc);
+
+ return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+ int status;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!inode)
+ return -ENOENT;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * This prevents concurrent writes from other nodes
+ */
+ status = ocfs2_rw_lock(inode, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_rw_unlock;
+ }
+
+ /*
+ * rememer ip_xattr_sem also needs to be held if necessary
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ status = __ocfs2_move_extents_range(di_bh, context);
+
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (status) {
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ /*
+ * We update ctime for these changes
+ */
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+ ocfs2_rw_unlock(inode, 1);
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+ int status;
+
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ocfs2_move_extents range;
+ struct ocfs2_move_extents_context *context = NULL;
+
+ status = mnt_want_write(filp->f_path.mnt);
+ if (status)
+ return status;
+
+ if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+ goto out;
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ status = -EPERM;
+ goto out;
+ }
+
+ context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+ if (!context) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->file = filp;
+
+ if (argp) {
+ if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
+ sizeof(range))) {
+ status = -EFAULT;
+ goto out;
+ }
+ } else {
+ status = -EINVAL;
+ goto out;
+ }
+
+ if (range.me_start > i_size_read(inode))
+ goto out;
+
+ if (range.me_start + range.me_len > i_size_read(inode))
+ range.me_len = i_size_read(inode) - range.me_start;
+
+ context->range = &range;
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+ context->auto_defrag = 1;
+ /*
+ * ok, the default theshold for the defragmentation
+ * is 1M, since our maximum clustersize was 1M also.
+ * any thought?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+ context->partial = 1;
+ } else {
+ /*
+ * first best-effort attempt to validate and adjust the goal
+ * (physical address in block), while it can't guarantee later
+ * operation can succeed all the time since global_bitmap may
+ * change a bit over time.
+ */
+
+ status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+ if (status)
+ goto out;
+ }
+
+ status = ocfs2_move_extents(context);
+ if (status)
+ mlog_errno(status);
+out:
+ /*
+ * movement/defragmentation may end up being partially completed,
+ * that's the reason why we need to return userspace the finished
+ * length and new_offset even if failure happens somewhere.
+ */
+ if (argp) {
+ if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
+ sizeof(range)))
+ status = -EFAULT;
+ }
+
+ kfree(context);
+
+ mnt_drop_write(filp->f_path.mnt);
+
+ return status;
+}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
__u64 ij_journal_size;
};
+struct ocfs2_info_freeinode {
+ struct ocfs2_info_request ifi_req;
+ struct ocfs2_info_local_freeinode {
+ __u64 lfi_total;
+ __u64 lfi_free;
+ } ifi_stat[OCFS2_MAX_SLOTS];
+ __u32 ifi_slotnum; /* out */
+ __u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST (32)
+
+struct ocfs2_info_freefrag {
+ struct ocfs2_info_request iff_req;
+ struct ocfs2_info_freefrag_stats { /* (out) */
+ struct ocfs2_info_free_chunk_list {
+ __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+ __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+ } ffs_fc_hist;
+ __u32 ffs_clusters;
+ __u32 ffs_free_clusters;
+ __u32 ffs_free_chunks;
+ __u32 ffs_free_chunks_real;
+ __u32 ffs_min; /* Minimum free chunksize in clusters */
+ __u32 ffs_max;
+ __u32 ffs_avg;
+ __u32 ffs_pad;
+ } iff_ffs;
+ __u32 iff_chunksize; /* chunksize in clusters(in) */
+ __u32 iff_pad;
+};
+
/* Codes for ocfs2_info_request */
enum ocfs2_info_type {
OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
OCFS2_INFO_UUID,
OCFS2_INFO_FS_FEATURES,
OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_FREEINODE,
+ OCFS2_INFO_FREEFRAG,
OCFS2_INFO_NUM_TYPES
};
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+struct ocfs2_move_extents {
+/* All values are in bytes */
+ /* in */
+ __u64 me_start; /* Virtual start in the file to move */
+ __u64 me_len; /* Length of the extents to be moved */
+ __u64 me_goal; /* Physical offset of the goal,
+ it's in block unit */
+ __u64 me_threshold; /* Maximum distance from goal or threshold
+ for auto defragmentation */
+ __u64 me_flags; /* Flags for the operation:
+ * - auto defragmentation.
+ * - refcount,xattr cases.
+ */
+ /* out */
+ __u64 me_moved_len; /* Moved/defraged length */
+ __u64 me_new_offset; /* Resulting physical location */
+ __u32 me_reserved[2]; /* Reserved for futhure */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
+ claim new clusters
+ as the goal place
+ for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
+ moving, is to make
+ movement less likely
+ to fail, may make fs
+ even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
+ completely gets done.
+ */
+
+#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
+
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
- struct ocfs2_cow_context *context,
+ struct file *file,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 old_cluster,
- u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct file *file,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
- struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_caching_info *ci = INODE_CACHE(inode);
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
- struct address_space *mapping = context->inode->i_mapping;
+ struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
* We only duplicate pages until we reach the page contains i_size - 1.
* So trim 'end' to i_size.
*/
- if (end > i_size_read(context->inode))
- end = i_size_read(context->inode);
+ if (end > i_size_read(inode))
+ end = i_size_read(inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
- if (PageReadahead(page) && context->file) {
+ if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
- &context->file->f_ra,
- context->file,
+ &file->f_ra, file,
page, page_index,
readahead_pages);
}
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
- ocfs2_map_and_dirty_page(context->inode,
- handle, from, to,
+ ocfs2_map_and_dirty_page(inode, handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
unlock:
@@ -3015,14 +3014,15 @@ unlock:
return ret;
}
-static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 old_cluster,
- u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct file *file,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
{
int ret = 0;
- struct super_block *sb = context->inode->i_sb;
- struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = context->cow_duplicate_clusters(handle, context, cpos,
- old, new, len);
+ ret = context->cow_duplicate_clusters(handle, context->file,
+ cpos, old, new, len);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3162,22 +3162,22 @@ out:
return ret;
}
-static int ocfs2_cow_sync_writeback(struct super_block *sb,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 num_clusters)
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters)
{
int ret = 0;
loff_t offset, end, map_end;
pgoff_t page_index;
struct page *page;
- if (ocfs2_should_order_data(context->inode))
+ if (ocfs2_should_order_data(inode))
return 0;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
- ret = filemap_fdatawrite_range(context->inode->i_mapping,
+ ret = filemap_fdatawrite_range(inode->i_mapping,
offset, end - 1);
if (ret < 0) {
mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
- page = find_or_create_page(context->inode->i_mapping,
+ page = find_or_create_page(inode->i_mapping,
page_index, GFP_NOFS);
BUG_ON(!page);
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
* in write-back mode.
*/
if (context->get_clusters == ocfs2_di_get_clusters) {
- ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+ ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
orig_num_clusters);
if (ret)
mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct file *file,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct file *file,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters);
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
long long liab;
spin_lock(&c->space_lock);
- liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+ liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
spin_unlock(&c->space_lock);
return liab;
}
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
int idx_lebs;
long long idx_size;
- idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+ idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
/* And make sure we have thrice the index size of space reserved */
idx_size += idx_size << 1;
/*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
* budgeted index space to the size of the current index, multiplies this by 3,
* and makes sure this does not exceed the amount of free LEBs.
*
- * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
* o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
* be large, because UBIFS does not do any index consolidation as long as
* there is free space. IOW, the index may take a lot of LEBs, but the LEBs
* will contain a lot of dirt.
- * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
- * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
*
* This function returns zero in case of success, and %-ENOSPC in case of
* failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
c->lst.taken_empty_lebs;
if (unlikely(rsvd_idx_lebs > lebs)) {
dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
- "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+ "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
rsvd_idx_lebs);
return -ENOSPC;
}
available = ubifs_calc_available(c, min_idx_lebs);
- outstanding = c->budg_data_growth + c->budg_dd_growth;
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
if (unlikely(available < outstanding)) {
dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
if (available - outstanding <= c->rp_size && !can_use_rp(c))
return -ENOSPC;
- c->min_idx_lebs = min_idx_lebs;
+ c->bi.min_idx_lebs = min_idx_lebs;
return 0;
}
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
{
int data_growth;
- data_growth = req->new_ino ? c->inode_budget : 0;
+ data_growth = req->new_ino ? c->bi.inode_budget : 0;
if (req->new_page)
- data_growth += c->page_budget;
+ data_growth += c->bi.page_budget;
if (req->new_dent)
- data_growth += c->dent_budget;
+ data_growth += c->bi.dent_budget;
data_growth += req->new_ino_d;
return data_growth;
}
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
{
int dd_growth;
- dd_growth = req->dirtied_page ? c->page_budget : 0;
+ dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
if (req->dirtied_ino)
- dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+ dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
if (req->mod_dent)
- dd_growth += c->dent_budget;
+ dd_growth += c->bi.dent_budget;
dd_growth += req->dirtied_ino_d;
return dd_growth;
}
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
again:
spin_lock(&c->space_lock);
- ubifs_assert(c->budg_idx_growth >= 0);
- ubifs_assert(c->budg_data_growth >= 0);
- ubifs_assert(c->budg_dd_growth >= 0);
+ ubifs_assert(c->bi.idx_growth >= 0);
+ ubifs_assert(c->bi.data_growth >= 0);
+ ubifs_assert(c->bi.dd_growth >= 0);
- if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+ if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
dbg_budg("no space");
spin_unlock(&c->space_lock);
return -ENOSPC;
}
- c->budg_idx_growth += idx_growth;
- c->budg_data_growth += data_growth;
- c->budg_dd_growth += dd_growth;
+ c->bi.idx_growth += idx_growth;
+ c->bi.data_growth += data_growth;
+ c->bi.dd_growth += dd_growth;
err = do_budget_space(c);
if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
}
/* Restore the old values */
- c->budg_idx_growth -= idx_growth;
- c->budg_data_growth -= data_growth;
- c->budg_dd_growth -= dd_growth;
+ c->bi.idx_growth -= idx_growth;
+ c->bi.data_growth -= data_growth;
+ c->bi.dd_growth -= dd_growth;
spin_unlock(&c->space_lock);
if (req->fast) {
@@ -506,9 +506,9 @@ again:
goto again;
}
dbg_budg("FS is full, -ENOSPC");
- c->nospace = 1;
+ c->bi.nospace = 1;
if (can_use_rp(c) || c->rp_size == 0)
- c->nospace_rp = 1;
+ c->bi.nospace_rp = 1;
smp_wmb();
} else
ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
* This function releases the space budgeted by 'ubifs_budget_space()'. Note,
* since the index changes (which were budgeted for in @req->idx_growth) will
* only be written to the media on commit, this function moves the index budget
- * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
- * zeroed by the commit operation.
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
+ * by the commit operation.
*/
void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
{
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
if (!req->data_growth && !req->dd_growth)
return;
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
spin_lock(&c->space_lock);
- c->budg_idx_growth -= req->idx_growth;
- c->budg_uncommitted_idx += req->idx_growth;
- c->budg_data_growth -= req->data_growth;
- c->budg_dd_growth -= req->dd_growth;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-
- ubifs_assert(c->budg_idx_growth >= 0);
- ubifs_assert(c->budg_data_growth >= 0);
- ubifs_assert(c->budg_dd_growth >= 0);
- ubifs_assert(c->min_idx_lebs < c->main_lebs);
- ubifs_assert(!(c->budg_idx_growth & 7));
- ubifs_assert(!(c->budg_data_growth & 7));
- ubifs_assert(!(c->budg_dd_growth & 7));
+ c->bi.idx_growth -= req->idx_growth;
+ c->bi.uncommitted_idx += req->idx_growth;
+ c->bi.data_growth -= req->data_growth;
+ c->bi.dd_growth -= req->dd_growth;
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ ubifs_assert(c->bi.idx_growth >= 0);
+ ubifs_assert(c->bi.data_growth >= 0);
+ ubifs_assert(c->bi.dd_growth >= 0);
+ ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
+ ubifs_assert(!(c->bi.idx_growth & 7));
+ ubifs_assert(!(c->bi.data_growth & 7));
+ ubifs_assert(!(c->bi.dd_growth & 7));
spin_unlock(&c->space_lock);
}
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
{
spin_lock(&c->space_lock);
/* Release the index growth reservation */
- c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
/* Release the data growth reservation */
- c->budg_data_growth -= c->page_budget;
+ c->bi.data_growth -= c->bi.page_budget;
/* Increase the dirty data growth reservation instead */
- c->budg_dd_growth += c->page_budget;
+ c->bi.dd_growth += c->bi.page_budget;
/* And re-calculate the indexing space reservation */
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
spin_unlock(&c->space_lock);
}
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
memset(&req, 0, sizeof(struct ubifs_budget_req));
/* The "no space" flags will be cleared because dd_growth is > 0 */
- req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+ req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
ubifs_release_budget(c, &req);
}
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
int rsvd_idx_lebs, lebs;
long long available, outstanding, free;
- ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
- outstanding = c->budg_data_growth + c->budg_dd_growth;
- available = ubifs_calc_available(c, c->min_idx_lebs);
+ ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
+ available = ubifs_calc_available(c, c->bi.min_idx_lebs);
/*
* When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
* Note, the calculations below are similar to what we have in
* 'do_budget_space()', so refer there for comments.
*/
- if (c->min_idx_lebs > c->lst.idx_lebs)
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
else
rsvd_idx_lebs = 0;
lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
c->mst_node->root_len = cpu_to_le32(zroot.len);
c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
- c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
+ c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz);
c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/math64.h>
-#include <linux/slab.h>
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
static char dbg_key_buf0[128];
static char dbg_key_buf1[128];
-unsigned int ubifs_msg_flags;
unsigned int ubifs_chk_flags;
unsigned int ubifs_tst_flags;
-module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
MODULE_PARM_DESC(debug_chks, "Debug check flags");
MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
printk(KERN_DEBUG "\t big_lpt %u\n",
!!(sup_flags & UBIFS_FLG_BIGLPT));
+ printk(KERN_DEBUG "\t space_fixup %u\n",
+ !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
printk(KERN_DEBUG "\tmin_io_size %u\n",
le32_to_cpu(sup->min_io_size));
printk(KERN_DEBUG "\tleb_size %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
spin_unlock(&dbg_lock);
}
-void dbg_dump_budg(struct ubifs_info *c)
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
{
int i;
struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
struct ubifs_gced_idx_leb *idx_gc;
long long available, outstanding, free;
- ubifs_assert(spin_is_locked(&c->space_lock));
+ spin_lock(&c->space_lock);
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
- "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
- c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
- printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
- "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
- c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
- c->freeable_cnt);
- printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
- "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
- c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+ printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
+ "total budget sum %lld\n", current->pid,
+ bi->data_growth + bi->dd_growth,
+ bi->data_growth + bi->dd_growth + bi->idx_growth);
+ printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
+ "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
+ bi->idx_growth);
+ printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
+ "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
+ bi->uncommitted_idx);
+ printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+ bi->page_budget, bi->inode_budget, bi->dent_budget);
+ printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+ bi->nospace, bi->nospace_rp);
+ printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+ c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+
+ if (bi != &c->bi)
+ /*
+ * If we are dumping saved budgeting data, do not print
+ * additional information which is about the current state, not
+ * the old one which corresponded to the saved budgeting data.
+ */
+ goto out_unlock;
+
+ printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+ c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
"clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
atomic_long_read(&c->dirty_zn_cnt),
atomic_long_read(&c->clean_zn_cnt));
- printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
- c->dark_wm, c->dead_wm, c->max_idx_node_sz);
printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
c->gc_lnum, c->ihead_lnum);
+
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
/* Print budgeting predictions */
- available = ubifs_calc_available(c, c->min_idx_lebs);
- outstanding = c->budg_data_growth + c->budg_dd_growth;
+ available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
free = ubifs_get_free_space_nolock(c);
printk(KERN_DEBUG "Budgeting predictions:\n");
printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
available, outstanding, free);
+out_unlock:
spin_unlock(&dbg_lock);
+ spin_unlock(&c->space_lock);
}
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
if (bud->lnum == lp->lnum) {
int head = 0;
for (i = 0; i < c->jhead_cnt; i++) {
- if (lp->lnum == c->jheads[i].wbuf.lnum) {
+ /*
+ * Note, if we are in R/O mode or in the middle
+ * of mounting/re-mounting, the write-buffers do
+ * not exist.
+ */
+ if (c->jheads &&
+ lp->lnum == c->jheads[i].wbuf.lnum) {
printk(KERN_CONT ", jhead %s",
dbg_jhead(i));
head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
spin_lock(&c->space_lock);
memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+ memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+ d->saved_idx_gc_cnt = c->idx_gc_cnt;
/*
* We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
out:
ubifs_msg("saved lprops statistics dump");
dbg_dump_lstats(&d->saved_lst);
- ubifs_get_lp_stats(c, &lst);
-
+ ubifs_msg("saved budgeting info dump");
+ dbg_dump_budg(c, &d->saved_bi);
+ ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
ubifs_msg("current lprops statistics dump");
+ ubifs_get_lp_stats(c, &lst);
dbg_dump_lstats(&lst);
-
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ ubifs_msg("current budgeting info dump");
+ dbg_dump_budg(c, &c->bi);
dump_stack();
return -EINVAL;
}
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
struct rb_node **p, *parent = NULL;
struct fsck_inode *fscki;
ino_t inum = key_inum_flash(c, &ino->key);
+ struct inode *inode;
+ struct ubifs_inode *ui;
p = &fsckd->inodes.rb_node;
while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
if (!fscki)
return ERR_PTR(-ENOMEM);
+ inode = ilookup(c->vfs_sb, inum);
+
fscki->inum = inum;
- fscki->nlink = le32_to_cpu(ino->nlink);
- fscki->size = le64_to_cpu(ino->size);
- fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
- fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
- fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
- fscki->mode = le32_to_cpu(ino->mode);
+ /*
+ * If the inode is present in the VFS inode cache, use it instead of
+ * the on-flash inode which might be out-of-date. E.g., the size might
+ * be out-of-date. If we do not do this, the following may happen, for
+ * example:
+ * 1. A power cut happens
+ * 2. We mount the file-system R/O, the replay process fixes up the
+ * inode size in the VFS cache, but on on-flash.
+ * 3. 'check_leaf()' fails because it hits a data node beyond inode
+ * size.
+ */
+ if (!inode) {
+ fscki->nlink = le32_to_cpu(ino->nlink);
+ fscki->size = le64_to_cpu(ino->size);
+ fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+ fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+ fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+ fscki->mode = le32_to_cpu(ino->mode);
+ } else {
+ ui = ubifs_inode(inode);
+ fscki->nlink = inode->i_nlink;
+ fscki->size = inode->i_size;
+ fscki->xattr_cnt = ui->xattr_cnt;
+ fscki->xattr_sz = ui->xattr_size;
+ fscki->xattr_nms = ui->xattr_names;
+ fscki->mode = inode->i_mode;
+ iput(inode);
+ }
+
if (S_ISDIR(fscki->mode)) {
fscki->calc_sz = UBIFS_INO_NODE_SZ;
fscki->calc_cnt = 2;
}
+
rb_link_node(&fscki->rb, parent, p);
rb_insert_color(&fscki->rb, &fsckd->inodes);
+
return fscki;
}
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
hashb = key_block(c, &sb->key);
if (hasha > hashb) {
- ubifs_err("larger hash %u goes before %u", hasha, hashb);
+ ubifs_err("larger hash %u goes before %u",
+ hasha, hashb);
goto error_dump;
}
}
@@ -2437,14 +2491,12 @@ error_dump:
return 0;
}
-static int invocation_cnt;
-
int dbg_force_in_the_gaps(void)
{
- if (!dbg_force_in_the_gaps_enabled)
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
return 0;
- /* Force in-the-gaps every 8th commit */
- return !((invocation_cnt++) & 0x7);
+
+ return !(random32() & 7);
}
/* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
int len, int check)
{
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
return ubi_leb_read(desc, lnum, buf, offset, len, check);
}
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
int err, failing;
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
failing = do_fail(desc, lnum, 1);
if (failing)
cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
if (err)
return err;
if (failing)
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
int err;
if (do_fail(desc, lnum, 1))
- return -EIO;
+ return -EROFS;
err = ubi_leb_change(desc, lnum, buf, len, dtype);
if (err)
return err;
if (do_fail(desc, lnum, 1))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_erase(desc, lnum);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_unmap(desc, lnum);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
{
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
return ubi_is_mapped(desc, lnum);
}
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_map(desc, lnum, dtype);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
static int open_debugfs_file(struct inode *inode, struct file *file)
{
file->private_data = inode->i_private;
- return 0;
+ return nonseekable_open(inode, file);
}
static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
if (file->f_path.dentry == d->dfs_dump_lprops)
dbg_dump_lprops(c);
- else if (file->f_path.dentry == d->dfs_dump_budg) {
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
- } else if (file->f_path.dentry == d->dfs_dump_tnc) {
+ else if (file->f_path.dentry == d->dfs_dump_budg)
+ dbg_dump_budg(c, &c->bi);
+ else if (file->f_path.dentry == d->dfs_dump_tnc) {
mutex_lock(&c->tnc_mutex);
dbg_dump_tnc(c);
mutex_unlock(&c->tnc_mutex);
} else
return -EINVAL;
- *ppos += count;
return count;
}
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
.open = open_debugfs_file,
.write = write_debugfs_file,
.owner = THIS_MODULE,
- .llseek = default_llseek,
+ .llseek = no_llseek,
};
/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
#ifdef CONFIG_UBIFS_FS_DEBUG
+#include <linux/random.h>
+
/**
* ubifs_debug_info - per-FS debugging information.
* @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
* @new_ihead_offs: used by debugging to check @c->ihead_offs
*
* @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
- * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ * @saved_bi: saved budgeting information
+ * @saved_free: saved amount of free space
+ * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
*
- * dfs_dir_name: name of debugfs directory containing this file-system's files
- * dfs_dir: direntry object of the file-system debugfs directory
- * dfs_dump_lprops: "dump lprops" debugfs knob
- * dfs_dump_budg: "dump budgeting information" debugfs knob
- * dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
*/
struct ubifs_debug_info {
struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
int new_ihead_offs;
struct ubifs_lp_stats saved_lst;
+ struct ubifs_budg_info saved_bi;
long long saved_free;
+ int saved_idx_gc_cnt;
char dfs_dir_name[100];
struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
} \
} while (0)
-#define dbg_dump_stack() do { \
- if (!dbg_failure_mode) \
- dump_stack(); \
-} while (0)
-
-/* Generic debugging messages */
-#define dbg_msg(fmt, ...) do { \
- spin_lock(&dbg_lock); \
- printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
- __func__, ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
-} while (0)
-
-#define dbg_do_msg(typ, fmt, ...) do { \
- if (ubifs_msg_flags & typ) \
- dbg_msg(fmt, ##__VA_ARGS__); \
-} while (0)
+#define dbg_dump_stack() dump_stack()
#define dbg_err(fmt, ...) do { \
spin_lock(&dbg_lock); \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
#define DBGKEY(key) dbg_key_str0(c, (key))
#define DBGKEY1(key) dbg_key_str1(c, (key))
-/* General messages */
-#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+#define ubifs_dbg_msg(type, fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+/* Just a debugging messages not related to any specific UBIFS subsystem */
+#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+/* General messages */
+#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
/* Additional journal messages */
-#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
-
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
/* Additional TNC messages */
-#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
-
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
/* Additional lprops messages */
-#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
-
+#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
/* Additional LEB find messages */
-#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
-
+#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
/* Additional mount messages */
-#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
-
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
/* Additional I/O messages */
-#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
-
+#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
/* Additional commit messages */
-#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
-
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
/* Additional budgeting messages */
-#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
-
+#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
/* Additional log messages */
-#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
-
+#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
/* Additional gc messages */
-#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
-
+#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
/* Additional scan messages */
-#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
-
+#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
/* Additional recovery messages */
-#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
-
-/*
- * Debugging message type flags.
- *
- * UBIFS_MSG_GEN: general messages
- * UBIFS_MSG_JNL: journal messages
- * UBIFS_MSG_MNT: mount messages
- * UBIFS_MSG_CMT: commit messages
- * UBIFS_MSG_FIND: LEB find messages
- * UBIFS_MSG_BUDG: budgeting messages
- * UBIFS_MSG_GC: garbage collection messages
- * UBIFS_MSG_TNC: TNC messages
- * UBIFS_MSG_LP: lprops messages
- * UBIFS_MSG_IO: I/O messages
- * UBIFS_MSG_LOG: log messages
- * UBIFS_MSG_SCAN: scan messages
- * UBIFS_MSG_RCVRY: recovery messages
- */
-enum {
- UBIFS_MSG_GEN = 0x1,
- UBIFS_MSG_JNL = 0x2,
- UBIFS_MSG_MNT = 0x4,
- UBIFS_MSG_CMT = 0x8,
- UBIFS_MSG_FIND = 0x10,
- UBIFS_MSG_BUDG = 0x20,
- UBIFS_MSG_GC = 0x40,
- UBIFS_MSG_TNC = 0x80,
- UBIFS_MSG_LP = 0x100,
- UBIFS_MSG_IO = 0x200,
- UBIFS_MSG_LOG = 0x400,
- UBIFS_MSG_SCAN = 0x800,
- UBIFS_MSG_RCVRY = 0x1000,
-};
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
/*
* Debugging check flags.
@@ -233,11 +186,9 @@ enum {
/*
* Special testing flags.
*
- * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
* UBIFS_TST_RCVRY: failure mode for recovery testing
*/
enum {
- UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
UBIFS_TST_RCVRY = 0x4,
};
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
int offs);
void dbg_dump_budget_req(const struct ubifs_budget_req *req);
void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
void dbg_dump_lprops(struct ubifs_info *c);
void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
/* Force the use of in-the-gaps method for testing */
-
-#define dbg_force_in_the_gaps_enabled \
- (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
-
+static inline int dbg_force_in_the_gaps_enabled(void)
+{
+ return ubifs_chk_flags & UBIFS_CHK_GEN;
+}
int dbg_force_in_the_gaps(void);
/* Failure mode for recovery testing */
-
#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
#ifndef UBIFS_DBG_PRESERVE_UBI
-
#define ubi_leb_read dbg_leb_read
#define ubi_leb_write dbg_leb_write
#define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
#define ubi_leb_unmap dbg_leb_unmap
#define ubi_is_mapped dbg_is_mapped
#define ubi_leb_map dbg_leb_map
-
#endif
int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
__func__, __LINE__, current->pid); \
} while (0)
-#define dbg_err(fmt, ...) do { \
- if (0) \
- ubifs_err(fmt, ##__VA_ARGS__); \
+#define dbg_err(fmt, ...) do { \
+ if (0) \
+ ubifs_err(fmt, ##__VA_ARGS__); \
} while (0)
-#define dbg_msg(fmt, ...) do { \
- if (0) \
- printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__); \
+#define ubifs_dbg_msg(fmt, ...) do { \
+ if (0) \
+ pr_debug(fmt "\n", ##__VA_ARGS__); \
} while (0)
#define dbg_dump_stack()
#define ubifs_assert_cmt_locked(c)
-#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
#define DBGKEY(key) ((char *)(key))
#define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
static inline void
dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
-static inline void dbg_dump_budg(struct ubifs_info *c) { return; }
+static inline void
+dbg_dump_budg(struct ubifs_info *c,
+ const struct ubifs_budg_info *bi) { return; }
static inline void dbg_dump_lprop(const struct ubifs_info *c,
const struct ubifs_lprops *lp) { return; }
static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
struct list_head *head) { return 0; }
static inline int dbg_force_in_the_gaps(void) { return 0; }
-#define dbg_force_in_the_gaps_enabled 0
-#define dbg_failure_mode 0
+#define dbg_force_in_the_gaps_enabled() 0
+#define dbg_failure_mode 0
static inline int dbg_debugfs_init(void) { return 0; }
static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return 0;
@@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
*/
static void release_existing_page_budget(struct ubifs_info *c)
{
- struct ubifs_budget_req req = { .dd_growth = c->page_budget};
+ struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
ubifs_release_budget(c, &req);
}
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
* the page locked, and it locks @ui_mutex. However, write-back does take inode
* @i_mutex, which means other VFS operations may be run on this inode at the
* same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
- * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
- * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
- * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size,
+ * then drops the truncated pages. And while dropping the pages, it takes the
+ * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
+ * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
+ * This means that @inode->i_size is changed while @ui_mutex is unlocked.
*
* XXX(truncate): with the new truncate sequence this is not true anymore,
* and the calls to truncate_setsize can be move around freely. They should
@@ -1189,7 +1189,7 @@ out_budg:
if (budgeted)
ubifs_release_budget(c, &req);
else {
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
dbg_gen("syncing inode %lu", inode->i_ino);
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (c->ro_mount)
+ /*
+ * For some really strange reasons VFS does not filter out
+ * 'fsync()' for R/O mounted file-systems as per 2.6.39.
+ */
return 0;
/*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
}
/*
- * mmap()d file has taken write protection fault and is being made
- * writable. UBIFS must ensure page is budgeted for.
+ * mmap()d file has taken write protection fault and is being made writable.
+ * UBIFS must ensure page is budgeted for.
*/
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int err;
- /* 'generic_file_mmap()' takes care of NOMMU case */
err = generic_file_mmap(file, vma);
if (err)
return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
* But if the index takes fewer LEBs than it is reserved for it,
* this function must avoid picking those reserved LEBs.
*/
- if (c->min_idx_lebs >= c->lst.idx_lebs) {
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
exclude_index = 1;
}
spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
pick_free = 0;
} else {
spin_lock(&c->space_lock);
- exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+ exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
spin_unlock(&c->space_lock);
}
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
/* Check if there are enough empty LEBs for commit */
spin_lock(&c->space_lock);
- if (c->min_idx_lebs > c->lst.idx_lebs)
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
else
rsvd_idx_lebs = 0;
lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
if (err)
return err;
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ return err;
+
err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
if (err)
return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
* This function compares data nodes @a and @b. Returns %1 if @a has greater
* inode or block number, and %-1 otherwise.
*/
-int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
* first and sorted by length in descending order. Directory entry nodes go
* after inode nodes and are sorted in ascending hash valuer order.
*/
-int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int nondata_nodes_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
ubifs_assert(c->gc_lnum != lnum);
ubifs_assert(wbuf->lnum != lnum);
+ if (lp->free + lp->dirty == c->leb_size) {
+ /* Special case - a free LEB */
+ dbg_gc("LEB %d is free, return it", lp->lnum);
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+
+ if (lp->free != c->leb_size) {
+ /*
+ * Write buffers must be sync'd before unmapping
+ * freeable LEBs, because one of them may contain data
+ * which obsoletes something in 'lp->pnum'.
+ */
+ err = gc_sync_wbufs(c);
+ if (err)
+ return err;
+ err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
+ 0, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ err = ubifs_leb_unmap(c, lp->lnum);
+ if (err)
+ return err;
+
+ if (c->gc_lnum == -1) {
+ c->gc_lnum = lnum;
+ return LEB_RETAINED;
+ }
+
+ return LEB_FREED;
+ }
+
/*
* We scan the entire LEB even though we only really need to scan up to
* (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
"(min. space %d)", lp.lnum, lp.free, lp.dirty,
lp.free + lp.dirty, min_space);
- if (lp.free + lp.dirty == c->leb_size) {
- /* An empty LEB was returned */
- dbg_gc("LEB %d is free, return it", lp.lnum);
- /*
- * ubifs_find_dirty_leb() doesn't return freeable index
- * LEBs.
- */
- ubifs_assert(!(lp.flags & LPROPS_INDEX));
- if (lp.free != c->leb_size) {
- /*
- * Write buffers must be sync'd before
- * unmapping freeable LEBs, because one of them
- * may contain data which obsoletes something
- * in 'lp.pnum'.
- */
- ret = gc_sync_wbufs(c);
- if (ret)
- goto out;
- ret = ubifs_change_one_lp(c, lp.lnum,
- c->leb_size, 0, 0, 0,
- 0);
- if (ret)
- goto out;
- }
- ret = ubifs_leb_unmap(c, lp.lnum);
- if (ret)
- goto out;
- ret = lp.lnum;
- break;
- }
-
space_before = c->leb_size - wbuf->offs - wbuf->used;
if (wbuf->lnum == -1)
space_before = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(!c->ro_media && !c->ro_mount);
if (c->leb_size - wbuf->offs >= c->max_write_size)
- ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
if (c->ro_error)
return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
* @dtype: data type
*
* This function targets the write-buffer to logical eraseblock @lnum:@offs.
- * The write-buffer is synchronized if it is not empty. Returns zero in case of
- * success and a negative error code in case of failure.
+ * The write-buffer has to be empty. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
ubifs_assert(offs >= 0 && offs <= c->leb_size);
ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
ubifs_assert(lnum != wbuf->lnum);
-
- if (wbuf->used > 0) {
- int err = ubifs_wbuf_sync_nolock(wbuf);
-
- if (err)
- return err;
- }
+ ubifs_assert(wbuf->used == 0);
spin_lock(&wbuf->lock);
wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
{
struct ubifs_info *c = wbuf->c;
- int err, written, n, aligned_len = ALIGN(len, 8), offs;
+ int err, written, n, aligned_len = ALIGN(len, 8);
dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
ubifs_assert(!c->ro_media && !c->ro_mount);
if (c->leb_size - wbuf->offs >= c->max_write_size)
- ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
goto exit;
}
- offs = wbuf->offs;
written = 0;
if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (err)
goto out;
- offs += wbuf->size;
+ wbuf->offs += wbuf->size;
len -= wbuf->avail;
aligned_len -= wbuf->avail;
written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (err)
goto out;
- offs += wbuf->size;
+ wbuf->offs += wbuf->size;
len -= wbuf->size;
aligned_len -= wbuf->size;
written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
n = aligned_len >> c->max_write_shift;
if (n) {
n <<= c->max_write_shift;
- dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
- err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
- wbuf->dtype);
+ dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
+ wbuf->offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
+ wbuf->offs, n, wbuf->dtype);
if (err)
goto out;
- offs += n;
+ wbuf->offs += n;
aligned_len -= n;
len -= n;
written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
memcpy(wbuf->buf, buf + written, len);
- wbuf->offs = offs;
if (c->leb_size - wbuf->offs >= c->max_write_size)
wbuf->size = c->max_write_size;
else
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
* LEB with some empty space.
*/
lnum = ubifs_find_free_space(c, len, &offs, squeeze);
- if (lnum >= 0) {
- /* Found an LEB, add it to the journal head */
- err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
- if (err)
- goto out_return;
- /* A new bud was successfully allocated and added to the log */
+ if (lnum >= 0)
goto out;
- }
err = lnum;
if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
return 0;
}
- err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
- if (err)
- goto out_return;
offs = 0;
out:
+ /*
+ * Make sure we synchronize the write-buffer before we add the new bud
+ * to the log. Otherwise we may have a power cut after the log
+ * reference node for the last bud (@lnum) is written but before the
+ * write-buffer data are written to the next-to-last bud
+ * (@wbuf->lnum). And the effect would be that the recovery would see
+ * that there is corruption in the next-to-last bud.
+ */
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ goto out_return;
+ err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+ if (err)
+ goto out_return;
err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
if (err)
goto out_unlock;
@@ -380,10 +385,8 @@ out:
if (err == -ENOSPC) {
/* This are some budgeting problems, print useful information */
down_write(&c->commit_sem);
- spin_lock(&c->space_lock);
dbg_dump_stack();
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
cmt_retries = dbg_check_lprops(c);
up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
}
/**
- * next_log_lnum - switch to the next log LEB.
- * @c: UBIFS file-system description object
- * @lnum: current log LEB
- */
-static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
-{
- lnum += 1;
- if (lnum > c->log_last)
- lnum = UBIFS_LOG_LNUM;
-
- return lnum;
-}
-
-/**
* empty_log_bytes - calculate amount of empty space in the log.
* @c: UBIFS file-system description object
*/
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
ref->jhead = cpu_to_le32(jhead);
if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
/* Switch to the next log LEB */
if (c->lhead_offs) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
c->lhead_offs += len;
if (c->lhead_offs == c->leb_size) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
}
mutex_lock(&c->log_mutex);
for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
- lnum = next_log_lnum(c, lnum)) {
+ lnum = ubifs_next_log_lnum(c, lnum)) {
dbg_log("unmap log LEB %d", lnum);
err = ubifs_leb_unmap(c, lnum);
if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
if (err)
return err;
- *lnum = next_log_lnum(c, *lnum);
+ *lnum = ubifs_next_log_lnum(c, *lnum);
*offs = 0;
}
memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
ubifs_scan_destroy(sleb);
if (lnum == c->lhead_lnum)
break;
- lnum = next_log_lnum(c, lnum);
+ lnum = ubifs_next_log_lnum(c, lnum);
}
if (offs) {
int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
/* Unmap remaining LEBs */
lnum = write_lnum;
do {
- lnum = next_log_lnum(c, lnum);
+ lnum = ubifs_next_log_lnum(c, lnum);
err = ubifs_leb_unmap(c, lnum);
if (err)
return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
}
/**
- * struct scan_check_data - data provided to scan callback function.
- * @lst: LEB properties statistics
- * @err: error code
- */
-struct scan_check_data {
- struct ubifs_lp_stats lst;
- int err;
-};
-
-/**
* scan_check_cb - scan callback.
* @c: the UBIFS file-system description object
* @lp: LEB properties to scan
* @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @lst: lprops statistics to update
*
* This function returns a code that indicates whether the scan should continue
* (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
*/
static int scan_check_cb(struct ubifs_info *c,
const struct ubifs_lprops *lp, int in_tree,
- struct scan_check_data *data)
+ struct ubifs_lp_stats *lst)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
- struct ubifs_lp_stats *lst = &data->lst;
int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
void *buf = NULL;
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (cat != (lp->flags & LPROPS_CAT_MASK)) {
ubifs_err("bad LEB category %d expected %d",
(lp->flags & LPROPS_CAT_MASK), cat);
- goto out;
+ return -EINVAL;
}
}
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
if (!found) {
ubifs_err("bad LPT list (category %d)", cat);
- goto out;
+ return -EINVAL;
}
}
}
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
lp != heap->arr[lp->hpos]) {
ubifs_err("bad LPT heap (category %d)", cat);
- goto out;
+ return -EINVAL;
}
}
buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
- if (!buf) {
- ubifs_err("cannot allocate memory to scan LEB %d", lnum);
- goto out;
+ if (!buf)
+ return -ENOMEM;
+
+ /*
+ * After an unclean unmount, empty and freeable LEBs
+ * may contain garbage - do not scan them.
+ */
+ if (lp->free == c->leb_size) {
+ lst->empty_lebs += 1;
+ lst->total_free += c->leb_size;
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+ if (lp->free + lp->dirty == c->leb_size &&
+ !(lp->flags & LPROPS_INDEX)) {
+ lst->total_free += lp->free;
+ lst->total_dirty += lp->dirty;
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
}
sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
- /*
- * After an unclean unmount, empty and freeable LEBs
- * may contain garbage.
- */
- if (lp->free == c->leb_size) {
- ubifs_err("scan errors were in empty LEB "
- "- continuing checking");
- lst->empty_lebs += 1;
- lst->total_free += c->leb_size;
- lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- ret = LPT_SCAN_CONTINUE;
- goto exit;
- }
-
- if (lp->free + lp->dirty == c->leb_size &&
- !(lp->flags & LPROPS_INDEX)) {
- ubifs_err("scan errors were in freeable LEB "
- "- continuing checking");
- lst->total_free += lp->free;
- lst->total_dirty += lp->dirty;
- lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- ret = LPT_SCAN_CONTINUE;
- goto exit;
+ ret = PTR_ERR(sleb);
+ if (ret == -EUCLEAN) {
+ dbg_dump_lprops(c);
+ dbg_dump_budg(c, &c->bi);
}
- data->err = PTR_ERR(sleb);
- ret = LPT_SCAN_STOP;
- goto exit;
+ goto out;
}
is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
}
ubifs_scan_destroy(sleb);
- ret = LPT_SCAN_CONTINUE;
-exit:
vfree(buf);
- return ret;
+ return LPT_SCAN_CONTINUE;
out_print:
ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
dbg_dump_leb(c, lnum);
out_destroy:
ubifs_scan_destroy(sleb);
+ ret = -EINVAL;
out:
vfree(buf);
- data->err = -EINVAL;
- return LPT_SCAN_STOP;
+ return ret;
}
/**
@@ -1278,8 +1260,7 @@ out:
int dbg_check_lprops(struct ubifs_info *c)
{
int i, err;
- struct scan_check_data data;
- struct ubifs_lp_stats *lst = &data.lst;
+ struct ubifs_lp_stats lst;
if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
return err;
}
- memset(lst, 0, sizeof(struct ubifs_lp_stats));
-
- data.err = 0;
+ memset(&lst, 0, sizeof(struct ubifs_lp_stats));
err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
(ubifs_lpt_scan_callback)scan_check_cb,
- &data);
+ &lst);
if (err && err != -ENOSPC)
goto out;
- if (data.err) {
- err = data.err;
- goto out;
- }
- if (lst->empty_lebs != c->lst.empty_lebs ||
- lst->idx_lebs != c->lst.idx_lebs ||
- lst->total_free != c->lst.total_free ||
- lst->total_dirty != c->lst.total_dirty ||
- lst->total_used != c->lst.total_used) {
+ if (lst.empty_lebs != c->lst.empty_lebs ||
+ lst.idx_lebs != c->lst.idx_lebs ||
+ lst.total_free != c->lst.total_free ||
+ lst.total_dirty != c->lst.total_dirty ||
+ lst.total_used != c->lst.total_used) {
ubifs_err("bad overall accounting");
ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
"total_free %lld, total_dirty %lld, total_used %lld",
- lst->empty_lebs, lst->idx_lebs, lst->total_free,
- lst->total_dirty, lst->total_used);
+ lst.empty_lebs, lst.idx_lebs, lst.total_free,
+ lst.total_dirty, lst.total_used);
ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
"total_free %lld, total_dirty %lld, total_used %lld",
c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
goto out;
}
- if (lst->total_dead != c->lst.total_dead ||
- lst->total_dark != c->lst.total_dark) {
+ if (lst.total_dead != c->lst.total_dead ||
+ lst.total_dark != c->lst.total_dark) {
ubifs_err("bad dead/dark space accounting");
ubifs_err("calculated: total_dead %lld, total_dark %lld",
- lst->total_dead, lst->total_dark);
+ lst.total_dead, lst.total_dark);
ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
c->lst.total_dead, c->lst.total_dark);
err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
#include <linux/slab.h>
#include "ubifs.h"
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_populate_lsave(struct ubifs_info *c);
+#else
+#define dbg_populate_lsave(c) 0
+#endif
+
/**
* first_dirty_cnode - find first dirty cnode.
* @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
if (nnode->nbranch[iip].lnum)
break;
}
- } while (iip >= UBIFS_LPT_FANOUT);
+ } while (iip >= UBIFS_LPT_FANOUT);
/* Go right */
nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
c->lpt_drty_flgs |= LSAVE_DIRTY;
ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
}
+
+ if (dbg_populate_lsave(c))
+ return;
+
list_for_each_entry(lprops, &c->empty_list, list) {
c->lsave[cnt++] = lprops->lnum;
if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
current->pid);
}
+/**
+ * dbg_populate_lsave - debugging version of 'populate_lsave()'
+ * @c: UBIFS file-system description object
+ *
+ * This is a debugging version for 'populate_lsave()' which populates lsave
+ * with random LEBs instead of useful LEBs, which is good for test coverage.
+ * Returns zero if lsave has not been populated (this debugging feature is
+ * disabled) an non-zero if lsave has been populated.
+ */
+static int dbg_populate_lsave(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ int i;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+ if (random32() & 3)
+ return 0;
+
+ for (i = 0; i < c->lsave_cnt; i++)
+ c->lsave[i] = c->main_first;
+
+ list_for_each_entry(lprops, &c->empty_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ list_for_each_entry(lprops, &c->freeable_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ list_for_each_entry(lprops, &c->frdi_idx_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+
+ heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+
+ return 1;
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
}
main_sz = (long long)c->main_lebs * c->leb_size;
- if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+ if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
err = 9;
goto out;
}
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
}
if (c->lst.total_dead + c->lst.total_dark +
- c->lst.total_used + c->old_idx_sz > main_sz) {
+ c->lst.total_used + c->bi.old_idx_sz > main_sz) {
err = 21;
goto out;
}
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
- c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
+ c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size);
c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
- c->calc_idx_sz = c->old_idx_sz;
+ c->calc_idx_sz = c->bi.old_idx_sz;
if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
c->no_orphs = 1;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
mutex_unlock(&c->lp_mutex);
}
+/**
+ * ubifs_next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * This helper function returns the log LEB number which goes next after LEB
+ * 'lnum'.
+ */
+static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+ lnum += 1;
+ if (lnum > c->log_last)
+ lnum = UBIFS_LOG_LNUM;
+
+ return lnum;
+}
+
#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) == -EUCLEAN)
- sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ sleb = ubifs_recover_leb(c, lnum, 0,
+ c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
}
/**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
*
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
*/
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
{
int dropped = 0;
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
kfree(snod);
sleb->nodes_cnt -= 1;
dropped = 1;
+ if (!grouped)
+ break;
}
return dropped;
}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf, int grouped)
{
- int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
- int empty_chkd = 0, start = offs;
+ int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
struct ubifs_scan_leb *sleb;
void *buf = sbuf + offs;
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
if (IS_ERR(sleb))
return sleb;
- if (sleb->ecc)
- need_clean = 1;
-
+ ubifs_assert(len >= 8);
while (len >= 8) {
- int ret;
-
dbg_scan("look at LEB %d:%d (%d bytes left)",
lnum, offs, len);
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* Scan quietly until there is an error from which we cannot
* recover
*/
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
-
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
if (ret == SCANNED_A_NODE) {
/* A valid node, and not a padding node */
struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
offs += node_len;
buf += node_len;
len -= node_len;
- continue;
- }
-
- if (ret > 0) {
+ } else if (ret > 0) {
/* Padding bytes or a valid padding node */
offs += ret;
buf += ret;
len -= ret;
- continue;
- }
-
- if (ret == SCANNED_EMPTY_SPACE) {
- if (!is_empty(buf, len)) {
- if (!is_last_write(c, buf, offs))
- break;
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
- empty_chkd = 1;
+ } else if (ret == SCANNED_EMPTY_SPACE ||
+ ret == SCANNED_GARBAGE ||
+ ret == SCANNED_A_BAD_PAD_NODE ||
+ ret == SCANNED_A_CORRUPT_NODE) {
+ dbg_rcvry("found corruption - %d", ret);
break;
- }
-
- if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
- if (is_last_write(c, buf, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- empty_chkd = 1;
- break;
- }
-
- if (ret == SCANNED_A_CORRUPT_NODE)
- if (no_more_nodes(c, buf, len, lnum, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- empty_chkd = 1;
- break;
- }
-
- if (quiet) {
- /* Redo the last scan but noisily */
- quiet = 0;
- continue;
- }
-
- switch (ret) {
- case SCANNED_GARBAGE:
- dbg_err("garbage");
- goto corrupted;
- case SCANNED_A_CORRUPT_NODE:
- case SCANNED_A_BAD_PAD_NODE:
- dbg_err("bad node");
- goto corrupted;
- default:
- dbg_err("unknown");
+ } else {
+ dbg_err("unexpected return value %d", ret);
err = -EINVAL;
goto error;
}
}
- if (!empty_chkd && !is_empty(buf, len)) {
- if (is_last_write(c, buf, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- } else {
+ if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
+ if (!is_last_write(c, buf, offs))
+ goto corrupted_rescan;
+ } else if (ret == SCANNED_A_CORRUPT_NODE) {
+ if (!no_more_nodes(c, buf, len, lnum, offs))
+ goto corrupted_rescan;
+ } else if (!is_empty(buf, len)) {
+ if (!is_last_write(c, buf, offs)) {
int corruption = first_non_ff(buf, len);
/*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
}
}
- /* Drop nodes from incomplete group */
- if (grouped && drop_incomplete_group(sleb, &offs)) {
- buf = sbuf + offs;
- len = c->leb_size - offs;
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
+ min_io_unit = round_down(offs, c->min_io_size);
+ if (grouped)
+ /*
+ * If nodes are grouped, always drop the incomplete group at
+ * the end.
+ */
+ drop_last_node(sleb, &offs, 1);
- if (offs % c->min_io_size) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
+ /*
+ * While we are in the middle of the same min. I/O unit keep dropping
+ * nodes. So basically, what we want is to make sure that the last min.
+ * I/O unit where we saw the corruption is dropped completely with all
+ * the uncorrupted node which may possibly sit there.
+ *
+ * In other words, let's name the min. I/O unit where the corruption
+ * starts B, and the previous min. I/O unit A. The below code tries to
+ * deal with a situation when half of B contains valid nodes or the end
+ * of a valid node, and the second half of B contains corrupted data or
+ * garbage. This means that UBIFS had been writing to B just before the
+ * power cut happened. I do not know how realistic is this scenario
+ * that half of the min. I/O unit had been written successfully and the
+ * other half not, but this is possible in our 'failure mode emulation'
+ * infrastructure at least.
+ *
+ * So what is the problem, why we need to drop those nodes? Whey can't
+ * we just clean-up the second half of B by putting a padding node
+ * there? We can, and this works fine with one exception which was
+ * reproduced with power cut emulation testing and happens extremely
+ * rarely. The description follows, but it is worth noting that that is
+ * only about the GC head, so we could do this trick only if the bud
+ * belongs to the GC head, but it does not seem to be worth an
+ * additional "if" statement.
+ *
+ * So, imagine the file-system is full, we run GC which is moving valid
+ * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+ * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+ * and will try to continue. Imagine that LEB X is currently the
+ * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+ * same as amount of free space in LEB X.
+ *
+ * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+ * are here trying to recover LEB Y which is the GC head LEB. We find
+ * the min. I/O unit B as described above. Then we clean-up LEB Y by
+ * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+ * fails, because it cannot find a dirty LEB which could be GC'd into
+ * LEB Y! Even LEB X does not match because the amount of valid nodes
+ * there does not fit the free space in LEB Y any more! And this is
+ * because of the padding node which we added to LEB Y. The
+ * user-visible effect of this which I once observed and analysed is
+ * that we cannot mount the file-system with -ENOSPC error.
+ *
+ * So obviously, to make sure that situation does not happen we should
+ * free min. I/O unit B in LEB Y completely and the last used min. I/O
+ * unit in LEB Y should be A. This is basically what the below code
+ * tries to do.
+ */
+ while (min_io_unit == round_down(offs, c->min_io_size) &&
+ min_io_unit != offs &&
+ drop_last_node(sleb, &offs, grouped));
+
+ buf = sbuf + offs;
+ len = c->leb_size - offs;
+ clean_buf(c, &buf, lnum, &offs, &len);
ubifs_end_scan(c, sleb, lnum, offs);
- if (need_clean) {
- err = fix_unclean_leb(c, sleb, start);
- if (err)
- goto error;
- }
+ err = fix_unclean_leb(c, sleb, start);
+ if (err)
+ goto error;
return sleb;
+corrupted_rescan:
+ /* Re-scan the corrupted data with verbose messages */
+ dbg_err("corruptio %d", ret);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
}
/**
+ * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
+ * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int grab_empty_leb(struct ubifs_info *c)
+{
+ int lnum, err;
+
+ /*
+ * Note, it is very important to first search for an empty LEB and then
+ * run the commit, not vice-versa. The reason is that there might be
+ * only one empty LEB at the moment, the one which has been the
+ * @c->gc_lnum just before the power cut happened. During the regular
+ * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
+ * one but GC can grab it. But at this moment this single empty LEB is
+ * not marked as taken, so if we run commit - what happens? Right, the
+ * commit will grab it and write the index there. Remember that the
+ * index always expands as long as there is free space, and it only
+ * starts consolidating when we run out of space.
+ *
+ * IOW, if we run commit now, we might not be able to find a free LEB
+ * after this.
+ */
+ lnum = ubifs_find_free_leb_for_idx(c);
+ if (lnum < 0) {
+ dbg_err("could not find an empty LEB");
+ dbg_dump_lprops(c);
+ dbg_dump_budg(c, &c->bi);
+ return lnum;
+ }
+
+ /* Reset the index flag */
+ err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_INDEX, 0);
+ if (err)
+ return err;
+
+ c->gc_lnum = lnum;
+ dbg_rcvry("found empty LEB %d, run commit", lnum);
+
+ return ubifs_run_commit(c);
+}
+
+/**
* ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
* @c: UBIFS file-system description object
*
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
{
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
struct ubifs_lprops lp;
- int lnum, err;
+ int err;
+
+ dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
c->gc_lnum = -1;
- if (wbuf->lnum == -1) {
- dbg_rcvry("no GC head LEB");
- goto find_free;
- }
- /*
- * See whether the used space in the dirtiest LEB fits in the GC head
- * LEB.
- */
- if (wbuf->offs == c->leb_size) {
- dbg_rcvry("no room in GC head LEB");
- goto find_free;
- }
+ if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
+ return grab_empty_leb(c);
+
err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
if (err) {
- /*
- * There are no dirty or empty LEBs subject to here being
- * enough for the index. Try to use
- * 'ubifs_find_free_leb_for_idx()', which will return any empty
- * LEBs (ignoring index requirements). If the index then
- * doesn't have enough LEBs the recovery commit will fail -
- * which is the same result anyway i.e. recovery fails. So
- * there is no problem ignoring index requirements and just
- * grabbing a free LEB since we have already established there
- * is not a dirty LEB we could have used instead.
- */
- if (err == -ENOSPC) {
- dbg_rcvry("could not find a dirty LEB");
- goto find_free;
- }
- return err;
- }
- ubifs_assert(!(lp.flags & LPROPS_INDEX));
- lnum = lp.lnum;
- if (lp.free + lp.dirty == c->leb_size) {
- /* An empty LEB was returned */
- if (lp.free != c->leb_size) {
- err = ubifs_change_one_lp(c, lnum, c->leb_size,
- 0, 0, 0, 0);
- if (err)
- return err;
- }
- err = ubifs_leb_unmap(c, lnum);
- if (err)
+ if (err != -ENOSPC)
return err;
- c->gc_lnum = lnum;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- /* Run the commit */
- dbg_rcvry("committing");
- return ubifs_run_commit(c);
- }
- /*
- * There was no empty LEB so the used space in the dirtiest LEB must fit
- * in the GC head LEB.
- */
- if (lp.free + lp.dirty < wbuf->offs) {
- dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
- lnum, wbuf->lnum, wbuf->offs);
- err = ubifs_return_leb(c, lnum);
- if (err)
- return err;
- goto find_free;
+
+ dbg_rcvry("could not find a dirty LEB");
+ return grab_empty_leb(c);
}
+
+ ubifs_assert(!(lp.flags & LPROPS_INDEX));
+ ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
+
/*
* We run the commit before garbage collection otherwise subsequent
* mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
err = ubifs_run_commit(c);
if (err)
return err;
- /*
- * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
- * - use locking to keep 'ubifs_assert()' happy.
- */
- dbg_rcvry("GC'ing LEB %d", lnum);
+
+ dbg_rcvry("GC'ing LEB %d", lp.lnum);
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
err = ubifs_garbage_collect_leb(c, &lp);
if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
err = -EINVAL;
return err;
}
- if (err != LEB_RETAINED) {
- dbg_err("GC returned %d", err);
+
+ ubifs_assert(err == LEB_RETAINED);
+ if (err != LEB_RETAINED)
return -EINVAL;
- }
+
err = ubifs_leb_unmap(c, c->gc_lnum);
if (err)
return err;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- return 0;
-find_free:
- /*
- * There is no GC head LEB or the free space in the GC head LEB is too
- * small, or there are not dirty LEBs. Allocate gc_lnum by calling
- * 'ubifs_find_free_leb_for_idx()' so GC is not run.
- */
- lnum = ubifs_find_free_leb_for_idx(c);
- if (lnum < 0) {
- dbg_err("could not find an empty LEB");
- return lnum;
- }
- /* And reset the index flag */
- err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
- LPROPS_INDEX, 0);
- if (err)
- return err;
- c->gc_lnum = lnum;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- /* Run the commit */
- dbg_rcvry("committing");
- return ubifs_run_commit(c);
+ dbg_rcvry("allocated LEB %d for GC", lp.lnum);
+ return 0;
}
/**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
if (err)
goto out;
- dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+ dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
(unsigned long)e->inum, lnum, offs, i_size, e->d_size);
return 0;
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
e->i_size = le64_to_cpu(ino->size);
}
}
+
if (e->exists && e->i_size < e->d_size) {
- if (!e->inode && c->ro_mount) {
+ if (c->ro_mount) {
/* Fix the inode size and pin it in memory */
struct inode *inode;
+ struct ubifs_inode *ui;
+
+ ubifs_assert(!e->inode);
inode = ubifs_iget(c->vfs_sb, e->inum);
if (IS_ERR(inode))
return PTR_ERR(inode);
+
+ ui = ubifs_inode(inode);
if (inode->i_size < e->d_size) {
dbg_rcvry("ino %lu size %lld -> %lld",
(unsigned long)e->inum,
- e->d_size, inode->i_size);
+ inode->i_size, e->d_size);
inode->i_size = e->d_size;
- ubifs_inode(inode)->ui_size = e->d_size;
+ ui->ui_size = e->d_size;
+ ui->synced_i_size = e->d_size;
e->inode = inode;
this = rb_next(this);
continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
iput(e->inode);
}
}
+
this = rb_next(this);
rb_erase(&e->rb, &c->size_tree);
kfree(e);
}
+
return 0;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
*/
#include "ubifs.h"
-
-/*
- * Replay flags.
- *
- * REPLAY_DELETION: node was deleted
- * REPLAY_REF: node is a reference node
- */
-enum {
- REPLAY_DELETION = 1,
- REPLAY_REF = 2,
-};
+#include <linux/list_sort.h>
/**
- * struct replay_entry - replay tree entry.
+ * struct replay_entry - replay list entry.
* @lnum: logical eraseblock number of the node
* @offs: node offset
* @len: node length
+ * @deletion: non-zero if this entry corresponds to a node deletion
* @sqnum: node sequence number
- * @flags: replay flags
- * @rb: links the replay tree
+ * @list: links the replay list
* @key: node key
* @nm: directory entry name
* @old_size: truncation old size
* @new_size: truncation new size
- * @free: amount of free space in a bud
- * @dirty: amount of dirty space in a bud from padding and deletion nodes
- * @jhead: journal head number of the bud
*
- * UBIFS journal replay must compare node sequence numbers, which means it must
- * build a tree of node information to insert into the TNC.
+ * The replay process first scans all buds and builds the replay list, then
+ * sorts the replay list in nodes sequence number order, and then inserts all
+ * the replay entries to the TNC.
*/
struct replay_entry {
int lnum;
int offs;
int len;
+ unsigned int deletion:1;
unsigned long long sqnum;
- int flags;
- struct rb_node rb;
+ struct list_head list;
union ubifs_key key;
union {
struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
loff_t old_size;
loff_t new_size;
};
- struct {
- int free;
- int dirty;
- int jhead;
- };
};
};
@@ -90,57 +73,64 @@ struct replay_entry {
* struct bud_entry - entry in the list of buds to replay.
* @list: next bud in the list
* @bud: bud description object
- * @free: free bytes in the bud
* @sqnum: reference node sequence number
+ * @free: free bytes in the bud
+ * @dirty: dirty bytes in the bud
*/
struct bud_entry {
struct list_head list;
struct ubifs_bud *bud;
- int free;
unsigned long long sqnum;
+ int free;
+ int dirty;
};
/**
* set_bud_lprops - set free and dirty space used by a bud.
* @c: UBIFS file-system description object
- * @r: replay entry of bud
+ * @b: bud entry which describes the bud
+ *
+ * This function makes sure the LEB properties of bud @b are set correctly
+ * after the replay. Returns zero in case of success and a negative error code
+ * in case of failure.
*/
-static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
{
const struct ubifs_lprops *lp;
int err = 0, dirty;
ubifs_get_lprops(c);
- lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+ lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
if (IS_ERR(lp)) {
err = PTR_ERR(lp);
goto out;
}
dirty = lp->dirty;
- if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+ if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
/*
* The LEB was added to the journal with a starting offset of
* zero which means the LEB must have been empty. The LEB
- * property values should be lp->free == c->leb_size and
- * lp->dirty == 0, but that is not the case. The reason is that
- * the LEB was garbage collected. The garbage collector resets
- * the free and dirty space without recording it anywhere except
- * lprops, so if there is not a commit then lprops does not have
- * that information next time the file system is mounted.
+ * property values should be @lp->free == @c->leb_size and
+ * @lp->dirty == 0, but that is not the case. The reason is that
+ * the LEB had been garbage collected before it became the bud,
+ * and there was not commit inbetween. The garbage collector
+ * resets the free and dirty space without recording it
+ * anywhere except lprops, so if there was no commit then
+ * lprops does not have that information.
*
* We do not need to adjust free space because the scan has told
* us the exact value which is recorded in the replay entry as
- * r->free.
+ * @b->free.
*
* However we do need to subtract from the dirty space the
* amount of space that the garbage collector reclaimed, which
* is the whole LEB minus the amount of space that was free.
*/
- dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
lp->free, lp->dirty);
- dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
lp->free, lp->dirty);
dirty -= c->leb_size - lp->free;
/*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
*/
if (dirty != 0)
dbg_msg("LEB %d lp: %d free %d dirty "
- "replay: %d free %d dirty", r->lnum, lp->free,
- lp->dirty, r->free, r->dirty);
+ "replay: %d free %d dirty", b->bud->lnum,
+ lp->free, lp->dirty, b->free, b->dirty);
}
- lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+ lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
lp->flags | LPROPS_TAKEN, 0);
if (IS_ERR(lp)) {
err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
}
/* Make sure the journal head points to the latest bud */
- err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
- c->leb_size - r->free, UBI_SHORTTERM);
+ err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
+ b->bud->lnum, c->leb_size - b->free,
+ UBI_SHORTTERM);
out:
ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
}
/**
+ * set_buds_lprops - set free and dirty space for all replayed buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets LEB properties for all replayed buds. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int set_buds_lprops(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+ int err;
+
+ list_for_each_entry(b, &c->replay_buds, list) {
+ err = set_bud_lprops(c, b);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
* trun_remove_range - apply a replay entry for a truncation to the TNC.
* @c: UBIFS file-system description object
* @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
*/
static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
- int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+ int err;
- dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
- r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+ dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
+ r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
/* Set c->replay_sqnum to help deal with dangling branches. */
c->replay_sqnum = r->sqnum;
- if (r->flags & REPLAY_REF)
- err = set_bud_lprops(c, r);
- else if (is_hash_key(c, &r->key)) {
- if (deletion)
+ if (is_hash_key(c, &r->key)) {
+ if (r->deletion)
err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
else
err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
r->len, &r->nm);
} else {
- if (deletion)
+ if (r->deletion)
switch (key_type(c, &r->key)) {
case UBIFS_INO_KEY:
{
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
return err;
if (c->need_recovery)
- err = ubifs_recover_size_accum(c, &r->key, deletion,
+ err = ubifs_recover_size_accum(c, &r->key, r->deletion,
r->new_size);
}
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
}
/**
- * destroy_replay_tree - destroy the replay.
- * @c: UBIFS file-system description object
+ * replay_entries_cmp - compare 2 replay entries.
+ * @priv: UBIFS file-system description object
+ * @a: first replay entry
+ * @a: second replay entry
*
- * Destroy the replay tree.
+ * This is a comparios function for 'list_sort()' which compares 2 replay
+ * entries @a and @b by comparing their sequence numer. Returns %1 if @a has
+ * greater sequence number and %-1 otherwise.
*/
-static void destroy_replay_tree(struct ubifs_info *c)
+static int replay_entries_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
{
- struct rb_node *this = c->replay_tree.rb_node;
- struct replay_entry *r;
-
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- r = rb_entry(this, struct replay_entry, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &r->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- if (is_hash_key(c, &r->key))
- kfree(r->nm.name);
- kfree(r);
- }
- c->replay_tree = RB_ROOT;
+ struct replay_entry *ra, *rb;
+
+ cond_resched();
+ if (a == b)
+ return 0;
+
+ ra = list_entry(a, struct replay_entry, list);
+ rb = list_entry(b, struct replay_entry, list);
+ ubifs_assert(ra->sqnum != rb->sqnum);
+ if (ra->sqnum > rb->sqnum)
+ return 1;
+ return -1;
}
/**
- * apply_replay_tree - apply the replay tree to the TNC.
+ * apply_replay_list - apply the replay list to the TNC.
* @c: UBIFS file-system description object
*
- * Apply the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * Apply all entries in the replay list to the TNC. Returns zero in case of
+ * success and a negative error code in case of failure.
*/
-static int apply_replay_tree(struct ubifs_info *c)
+static int apply_replay_list(struct ubifs_info *c)
{
- struct rb_node *this = rb_first(&c->replay_tree);
+ struct replay_entry *r;
+ int err;
- while (this) {
- struct replay_entry *r;
- int err;
+ list_sort(c, &c->replay_list, &replay_entries_cmp);
+ list_for_each_entry(r, &c->replay_list, list) {
cond_resched();
- r = rb_entry(this, struct replay_entry, rb);
err = apply_replay_entry(c, r);
if (err)
return err;
- this = rb_next(this);
}
+
return 0;
}
/**
- * insert_node - insert a node to the replay tree.
+ * destroy_replay_list - destroy the replay.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay list.
+ */
+static void destroy_replay_list(struct ubifs_info *c)
+{
+ struct replay_entry *r, *tmp;
+
+ list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
+ if (is_hash_key(c, &r->key))
+ kfree(r->nm.name);
+ list_del(&r->list);
+ kfree(r);
+ }
+}
+
+/**
+ * insert_node - insert a node to the replay list
* @c: UBIFS file-system description object
* @lnum: node logical eraseblock number
* @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
* @old_size: truncation old size
* @new_size: truncation new size
*
- * This function inserts a scanned non-direntry node to the replay tree. The
- * replay tree is an RB-tree containing @struct replay_entry elements which are
- * indexed by the sequence number. The replay tree is applied at the very end
- * of the replay process. Since the tree is sorted in sequence number order,
- * the older modifications are applied first. This function returns zero in
- * case of success and a negative error code in case of failure.
+ * This function inserts a scanned non-direntry node to the replay list. The
+ * replay list contains @struct replay_entry elements, and we sort this list in
+ * sequence number order before applying it. The replay list is applied at the
+ * very end of the replay process. Since the list is sorted in sequence number
+ * order, the older modifications are applied first. This function returns zero
+ * in case of success and a negative error code in case of failure.
*/
static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
union ubifs_key *key, unsigned long long sqnum,
int deletion, int *used, loff_t old_size,
loff_t new_size)
{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
struct replay_entry *r;
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- } else if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay");
- return -EINVAL;
- }
-
r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
if (!r)
return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ r->deletion = !!deletion;
r->sqnum = sqnum;
- r->flags = (deletion ? REPLAY_DELETION : 0);
+ key_copy(c, key, &r->key);
r->old_size = old_size;
r->new_size = new_size;
- key_copy(c, key, &r->key);
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
+ list_add_tail(&r->list, &c->replay_list);
return 0;
}
/**
- * insert_dent - insert a directory entry node into the replay tree.
+ * insert_dent - insert a directory entry node into the replay list.
* @c: UBIFS file-system description object
* @lnum: node logical eraseblock number
* @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* @deletion: non-zero if this is a deletion
* @used: number of bytes in use in a LEB
*
- * This function inserts a scanned directory entry node to the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- *
- * This function is also used for extended attribute entries because they are
- * implemented as directory entry nodes.
+ * This function inserts a scanned directory entry node or an extended
+ * attribute entry to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
union ubifs_key *key, const char *name, int nlen,
unsigned long long sqnum, int deletion, int *used)
{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
struct replay_entry *r;
char *nbuf;
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- }
- if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay");
- return -EINVAL;
- }
-
r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
if (!r)
return -ENOMEM;
+
nbuf = kmalloc(nlen + 1, GFP_KERNEL);
if (!nbuf) {
kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ r->deletion = !!deletion;
r->sqnum = sqnum;
+ key_copy(c, key, &r->key);
r->nm.len = nlen;
memcpy(nbuf, name, nlen);
nbuf[nlen] = '\0';
r->nm.name = nbuf;
- r->flags = (deletion ? REPLAY_DELETION : 0);
- key_copy(c, key, &r->key);
- ubifs_assert(!*p);
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
+ list_add_tail(&r->list, &c->replay_list);
return 0;
}
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
}
/**
+ * is_last_bud - check if the bud is the last in the journal head.
+ * @c: UBIFS file-system description object
+ * @bud: bud description object
+ *
+ * This function checks if bud @bud is the last bud in its journal head. This
+ * information is then used by 'replay_bud()' to decide whether the bud can
+ * have corruptions or not. Indeed, only last buds can be corrupted by power
+ * cuts. Returns %1 if this is the last bud, and %0 if not.
+ */
+static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+ struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+ struct ubifs_bud *next;
+ uint32_t data;
+ int err;
+
+ if (list_is_last(&bud->list, &jh->buds_list))
+ return 1;
+
+ /*
+ * The following is a quirk to make sure we work correctly with UBIFS
+ * images used with older UBIFS.
+ *
+ * Normally, the last bud will be the last in the journal head's list
+ * of bud. However, there is one exception if the UBIFS image belongs
+ * to older UBIFS. This is fairly unlikely: one would need to use old
+ * UBIFS, then have a power cut exactly at the right point, and then
+ * try to mount this image with new UBIFS.
+ *
+ * The exception is: it is possible to have 2 buds A and B, A goes
+ * before B, and B is the last, bud B is contains no data, and bud A is
+ * corrupted at the end. The reason is that in older versions when the
+ * journal code switched the next bud (from A to B), it first added a
+ * log reference node for the new bud (B), and only after this it
+ * synchronized the write-buffer of current bud (A). But later this was
+ * changed and UBIFS started to always synchronize the write-buffer of
+ * the bud (A) before writing the log reference for the new bud (B).
+ *
+ * But because older UBIFS always synchronized A's write-buffer before
+ * writing to B, we can recognize this exceptional situation but
+ * checking the contents of bud B - if it is empty, then A can be
+ * treated as the last and we can recover it.
+ *
+ * TODO: remove this piece of code in a couple of years (today it is
+ * 16.05.2011).
+ */
+ next = list_entry(bud->list.next, struct ubifs_bud, list);
+ if (!list_is_last(&next->list, &jh->buds_list))
+ return 0;
+
+ err = ubi_read(c->ubi, next->lnum, (char *)&data,
+ next->start, 4);
+ if (err)
+ return 0;
+
+ return data == 0xFFFFFFFF;
+}
+
+/**
* replay_bud - replay a bud logical eraseblock.
* @c: UBIFS file-system description object
- * @lnum: bud logical eraseblock number to replay
- * @offs: bud start offset
- * @jhead: journal head to which this bud belongs
- * @free: amount of free space in the bud is returned here
- * @dirty: amount of dirty space from padding and deletion nodes is returned
- * here
+ * @b: bud entry which describes the bud
*
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function replays bud @bud, recovers it if needed, and adds all nodes
+ * from this bud to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
-static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
- int *free, int *dirty)
+static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
{
- int err = 0, used = 0;
+ int is_last = is_last_bud(c, b->bud);
+ int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
- struct ubifs_bud *bud;
- dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
- if (c->need_recovery)
- sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+ dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
+ lnum, b->bud->jhead, offs, is_last);
+
+ if (c->need_recovery && is_last)
+ /*
+ * Recover only last LEBs in the journal heads, because power
+ * cuts may cause corruptions only in these LEBs, because only
+ * these LEBs could possibly be written to at the power cut
+ * time.
+ */
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+ b->bud->jhead != GCHD);
else
sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
goto out;
}
- bud = ubifs_search_bud(c, lnum);
- if (!bud)
- BUG();
-
+ ubifs_assert(ubifs_search_bud(c, lnum));
ubifs_assert(sleb->endpt - offs >= used);
ubifs_assert(sleb->endpt % c->min_io_size == 0);
- *dirty = sleb->endpt - offs - used;
- *free = c->leb_size - sleb->endpt;
+ b->dirty = sleb->endpt - offs - used;
+ b->free = c->leb_size - sleb->endpt;
+ dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
out:
ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
}
/**
- * insert_ref_node - insert a reference node to the replay tree.
- * @c: UBIFS file-system description object
- * @lnum: node logical eraseblock number
- * @offs: node offset
- * @sqnum: sequence number
- * @free: amount of free space in bud
- * @dirty: amount of dirty space from padding and deletion nodes
- * @jhead: journal head number for the bud
- *
- * This function inserts a reference node to the replay tree and returns zero
- * in case of success or a negative error code in case of failure.
- */
-static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
- unsigned long long sqnum, int free, int dirty,
- int jhead)
-{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
- struct replay_entry *r;
-
- dbg_mnt("add ref LEB %d:%d", lnum, offs);
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- } else if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay tree");
- return -EINVAL;
- }
-
- r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->lnum = lnum;
- r->offs = offs;
- r->sqnum = sqnum;
- r->flags = REPLAY_REF;
- r->free = free;
- r->dirty = dirty;
- r->jhead = jhead;
-
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
- return 0;
-}
-
-/**
* replay_buds - replay all buds.
* @c: UBIFS file-system description object
*
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
static int replay_buds(struct ubifs_info *c)
{
struct bud_entry *b;
- int err, uninitialized_var(free), uninitialized_var(dirty);
+ int err;
+ unsigned long long prev_sqnum = 0;
list_for_each_entry(b, &c->replay_buds, list) {
- err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
- &free, &dirty);
- if (err)
- return err;
- err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
- free, dirty, b->bud->jhead);
+ err = replay_bud(c, b);
if (err)
return err;
+
+ ubifs_assert(b->sqnum > prev_sqnum);
+ prev_sqnum = b->sqnum;
}
return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
if (err)
goto out;
- err = apply_replay_tree(c);
+ err = apply_replay_list(c);
+ if (err)
+ goto out;
+
+ err = set_buds_lprops(c);
if (err)
goto out;
/*
- * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
- * to roughly estimate index growth. Things like @c->min_idx_lebs
+ * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
+ * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
* depend on it. This means we have to initialize it to make sure
* budgeting works properly.
*/
- c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
- c->budg_uncommitted_idx *= c->max_idx_node_sz;
+ c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+ c->bi.uncommitted_idx *= c->max_idx_node_sz;
ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
(unsigned long)c->highest_inum);
out:
- destroy_replay_tree(c);
+ destroy_replay_list(c);
destroy_bud_list(c);
c->replaying = 0;
return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
* @c: UBIFS file-system description object
*
* This function returns a pointer to the superblock node or a negative error
- * code.
+ * code. Note, the user of this function is responsible of kfree()'ing the
+ * returned superblock buffer.
*/
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
{
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
memcpy(&c->uuid, &sup->uuid, 16);
c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+ c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
/* Automatically increase file system size to the maximum size */
c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
kfree(sup);
return err;
}
+
+/**
+ * fixup_leb - fixup/unmap an LEB containing free space.
+ * @c: UBIFS file-system description object
+ * @lnum: the LEB number to fix up
+ * @len: number of used bytes in LEB (starting at offset 0)
+ *
+ * This function reads the contents of the given LEB number @lnum, then fixes
+ * it up, so that empty min. I/O units in the end of LEB are actually erased on
+ * flash (rather than being just all-0xff real data). If the LEB is completely
+ * empty, it is simply unmapped.
+ */
+static int fixup_leb(struct ubifs_info *c, int lnum, int len)
+{
+ int err;
+
+ ubifs_assert(len >= 0);
+ ubifs_assert(len % c->min_io_size == 0);
+ ubifs_assert(len < c->leb_size);
+
+ if (len == 0) {
+ dbg_mnt("unmap empty LEB %d", lnum);
+ return ubi_leb_unmap(c->ubi, lnum);
+ }
+
+ dbg_mnt("fixup LEB %d, data len %d", lnum, len);
+ err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
+ if (err)
+ return err;
+
+ return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+}
+
+/**
+ * fixup_free_space - find & remap all LEBs containing free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks through all LEBs in the filesystem and fiexes up those
+ * containing free/empty space.
+ */
+static int fixup_free_space(struct ubifs_info *c)
+{
+ int lnum, err = 0;
+ struct ubifs_lprops *lprops;
+
+ ubifs_get_lprops(c);
+
+ /* Fixup LEBs in the master area */
+ for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
+ err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
+ if (err)
+ goto out;
+ }
+
+ /* Unmap unused log LEBs */
+ lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+ while (lnum != c->ltail_lnum) {
+ err = fixup_leb(c, lnum, 0);
+ if (err)
+ goto out;
+ lnum = ubifs_next_log_lnum(c, lnum);
+ }
+
+ /* Fixup the current log head */
+ err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+ if (err)
+ goto out;
+
+ /* Fixup LEBs in the LPT area */
+ for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+ int free = c->ltab[lnum - c->lpt_first].free;
+
+ if (free > 0) {
+ err = fixup_leb(c, lnum, c->leb_size - free);
+ if (err)
+ goto out;
+ }
+ }
+
+ /* Unmap LEBs in the orphans area */
+ for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+ err = fixup_leb(c, lnum, 0);
+ if (err)
+ goto out;
+ }
+
+ /* Fixup LEBs in the main area */
+ for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+ lprops = ubifs_lpt_lookup(c, lnum);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ if (lprops->free > 0) {
+ err = fixup_leb(c, lnum, c->leb_size - lprops->free);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_fixup_free_space - find & fix all LEBs with free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function fixes up LEBs containing free space on first mount, if the
+ * appropriate flag was set when the FS was created. Each LEB with one or more
+ * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure
+ * the free space is actually erased. E.g., this is necessary for some NAND
+ * chips, since the free space may have been programmed like real "0xff" data
+ * (generating a non-0xff ECC), causing future writes to the not-really-erased
+ * NAND pages to behave badly. After the space is fixed up, the superblock flag
+ * is cleared, so that this is skipped for all future mounts.
+ */
+int ubifs_fixup_free_space(struct ubifs_info *c)
+{
+ int err;
+ struct ubifs_sb_node *sup;
+
+ ubifs_assert(c->space_fixup);
+ ubifs_assert(!c->ro_mount);
+
+ ubifs_msg("start fixing up free space");
+
+ err = fixup_free_space(c);
+ if (err)
+ return err;
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ /* Free-space fixup is no longer required */
+ c->space_fixup = 0;
+ sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
+
+ err = ubifs_write_sb_node(c, sup);
+ kfree(sup);
+ if (err)
+ return err;
+
+ ubifs_msg("free space fixup complete");
+ return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
ubifs_release_dirty_inode_budget(c, ui);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
* be compressed and direntries are of the maximum size.
*
* Note, data, which may be stored in inodes is budgeted separately, so
- * it is not included into 'c->inode_budget'.
+ * it is not included into 'c->bi.inode_budget'.
*/
- c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
- c->inode_budget = UBIFS_INO_NODE_SZ;
- c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+ c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+ c->bi.inode_budget = UBIFS_INO_NODE_SZ;
+ c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
/*
* When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
{
long long tmp64;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
c->report_rp_size = ubifs_reported_space(c, c->rp_size);
/*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
{
ubifs_assert(c->dark_wm > 0);
if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
- ubifs_err("insufficient free space to mount in read/write mode");
- dbg_dump_budg(c);
+ ubifs_err("insufficient free space to mount in R/W mode");
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
return -ENOSPC;
}
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_lpt;
- err = dbg_check_idx_size(c, c->old_idx_sz);
+ err = dbg_check_idx_size(c, c->bi.old_idx_sz);
if (err)
goto out_lpt;
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_journal;
/* Calculate 'min_idx_lebs' after journal replay */
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
} else
ubifs_assert(c->lst.taken_empty_lebs > 0);
+ if (!c->ro_mount && c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out_infos;
+ }
+
err = dbg_check_filesystem(c);
if (err)
goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
c->main_lebs, c->main_first, c->leb_cnt - 1);
dbg_msg("index LEBs: %d", c->lst.idx_lebs);
dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
- c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+ c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
+ c->bi.old_idx_sz >> 20);
dbg_msg("key hash type: %d", c->key_hash_type);
dbg_msg("tree fanout: %d", c->fanout);
dbg_msg("reserved GC LEB: %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
- UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+ UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
dbg_msg("dead watermark: %d", c->dead_wm);
dbg_msg("dark watermark: %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
}
sup->leb_cnt = cpu_to_le32(c->leb_cnt);
err = ubifs_write_sb_node(c, sup);
+ kfree(sup);
if (err)
goto out;
}
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
*/
err = dbg_check_space_info(c);
}
+
+ if (c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out;
+ }
+
mutex_unlock(&c->umount_mutex);
return err;
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
* to write them back because of I/O errors.
*/
if (!c->ro_error) {
- ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
- ubifs_assert(c->budg_idx_growth == 0);
- ubifs_assert(c->budg_dd_growth == 0);
- ubifs_assert(c->budg_data_growth == 0);
+ ubifs_assert(c->bi.idx_growth == 0);
+ ubifs_assert(c->bi.dd_growth == 0);
+ ubifs_assert(c->bi.data_growth == 0);
}
/*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
if (err) {
/* Ensure the znode is dirtied */
if (znode->cnext || !ubifs_zn_dirty(znode)) {
- znode = dirty_cow_bottom_up(c, znode);
- if (IS_ERR(znode)) {
- err = PTR_ERR(znode);
- goto out_unlock;
- }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
}
err = tnc_delete(c, znode, n);
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
c->gap_lebs = NULL;
return err;
}
- if (!dbg_force_in_the_gaps_enabled) {
+ if (dbg_force_in_the_gaps_enabled()) {
/*
* Do not print scary warnings if the debugging
* option which forces in-the-gaps is enabled.
*/
- ubifs_err("out of space");
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ ubifs_warn("out of space");
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
}
/* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
spin_lock(&c->space_lock);
/*
* Although we have not finished committing yet, update size of the
- * committed index ('c->old_idx_sz') and zero out the index growth
+ * committed index ('c->bi.old_idx_sz') and zero out the index growth
* budget. It is OK to do this now, because we've reserved all the
* space which is needed to commit the index, and it is save for the
* budgeting subsystem to assume the index is already committed,
* even though it is not.
*/
- ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
- c->old_idx_sz = c->calc_idx_sz;
- c->budg_uncommitted_idx = 0;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+ c->bi.old_idx_sz = c->calc_idx_sz;
+ c->bi.uncommitted_idx = 0;
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
spin_unlock(&c->space_lock);
mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
* Superblock flags.
*
* UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
*/
enum {
UBIFS_FLG_BIGLPT = 0x02,
+ UBIFS_FLG_SPACE_FIXUP = 0x04,
};
/**
@@ -434,7 +436,7 @@ struct ubifs_ch {
__u8 node_type;
__u8 group_type;
__u8 padding[2];
-} __attribute__ ((packed));
+} __packed;
/**
* union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
union ubifs_dev_desc {
__le32 new;
__le64 huge;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
__le16 compr_type;
__u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
__u8 data[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
__le16 nlen;
__u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
__u8 name[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
__le16 compr_type;
__u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
__u8 data[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
__u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
__le64 old_size;
__le64 new_size;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
struct ubifs_pad_node {
struct ubifs_ch ch;
__le32 pad_len;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
__u8 uuid[16];
__le32 ro_compat_version;
__u8 padding2[3968];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
__le32 idx_lebs;
__le32 leb_cnt;
__u8 padding[344];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
__le32 offs;
__le32 jhead;
__u8 padding[28];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
__le32 offs;
__le32 len;
__u8 key[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
__le16 child_cnt;
__le16 level;
__u8 branches[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
struct ubifs_cs_node {
struct ubifs_ch ch;
__le64 cmt_no;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
struct ubifs_ch ch;
__le64 cmt_no;
__le64 inos[];
-} __attribute__ ((packed));
+} __packed;
#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
* The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
* @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
* make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
- * with 'ubifs_writepage()' (see file.c). All the other inode fields are
- * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
+ * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
+ * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
* could consider to rework locking and base it on "shadow" fields.
*/
struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
unsigned int compr_type:2;
};
+/**
+ * struct ubifs_budg_info - UBIFS budgeting information.
+ * @idx_growth: amount of bytes budgeted for index growth
+ * @data_growth: amount of bytes budgeted for cached data
+ * @dd_growth: amount of bytes budgeted for cached data that will make
+ * other data dirty
+ * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but
+ * which still have to be taken into account because the index
+ * has not been committed so far
+ * @old_idx_sz: size of index on flash
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ * optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ * pool is full
+ * @page_budget: budget for a page (constant, nenver changed after mount)
+ * @inode_budget: budget for an inode (constant, nenver changed after mount)
+ * @dent_budget: budget for a directory entry (constant, nenver changed after
+ * mount)
+ */
+struct ubifs_budg_info {
+ long long idx_growth;
+ long long data_growth;
+ long long dd_growth;
+ long long uncommitted_idx;
+ unsigned long long old_idx_sz;
+ int min_idx_lebs;
+ unsigned int nospace:1;
+ unsigned int nospace_rp:1;
+ int page_budget;
+ int inode_budget;
+ int dent_budget;
+};
+
struct ubifs_debug_info;
/**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
* @cmt_wq: wait queue to sleep on if the log is full and a commit is running
*
* @big_lpt: flag that LPT is too big to write whole during commit
+ * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
* @no_chk_data_crc: do not check CRCs when reading data nodes (except during
* recovery)
* @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
* @dirty_zn_cnt: number of dirty znodes
* @clean_zn_cnt: number of clean znodes
*
- * @budg_idx_growth: amount of bytes budgeted for index growth
- * @budg_data_growth: amount of bytes budgeted for cached data
- * @budg_dd_growth: amount of bytes budgeted for cached data that will make
- * other data dirty
- * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
- * but which still have to be taken into account because
- * the index has not been committed so far
- * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
- * @nospace, and @nospace_rp;
- * @min_idx_lebs: minimum number of LEBs required for the index
- * @old_idx_sz: size of index on flash
+ * @space_lock: protects @bi and @lst
+ * @lst: lprops statistics
+ * @bi: budgeting information
* @calc_idx_sz: temporary variable which is used to calculate new index size
* (contains accurate new index size at end of TNC commit start)
- * @lst: lprops statistics
- * @nospace: non-zero if the file-system does not have flash space (used as
- * optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- * pool is full
- *
- * @page_budget: budget for a page
- * @inode_budget: budget for an inode
- * @dent_budget: budget for a directory entry
*
* @ref_node_alsz: size of the LEB reference node aligned to the min. flash
- * I/O unit
+ * I/O unit
* @mst_node_alsz: master node aligned size
* @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
* @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
* @replaying: %1 during journal replay
* @mounting: %1 while mounting
* @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
- * @replay_tree: temporary tree used during journal replay
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
* @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
wait_queue_head_t cmt_wq;
unsigned int big_lpt:1;
+ unsigned int space_fixup:1;
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
atomic_long_t dirty_zn_cnt;
atomic_long_t clean_zn_cnt;
- long long budg_idx_growth;
- long long budg_data_growth;
- long long budg_dd_growth;
- long long budg_uncommitted_idx;
spinlock_t space_lock;
- int min_idx_lebs;
- unsigned long long old_idx_sz;
- unsigned long long calc_idx_sz;
struct ubifs_lp_stats lst;
- unsigned int nospace:1;
- unsigned int nospace_rp:1;
-
- int page_budget;
- int inode_budget;
- int dent_budget;
+ struct ubifs_budg_info bi;
+ unsigned long long calc_idx_sz;
int ref_node_alsz;
int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
unsigned int replaying:1;
unsigned int mounting:1;
unsigned int remounting_rw:1;
- struct rb_root replay_tree;
struct list_head replay_list;
struct list_head replay_buds;
unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
int ubifs_read_superblock(struct ubifs_info *c);
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+int ubifs_fixup_free_space(struct ubifs_info *c);
/* replay.c */
int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
SECURITY_XATTR,
};
-static const struct inode_operations none_inode_operations;
-static const struct file_operations none_file_operations;
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
/**
* create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
/* Re-define all operations to be "nothing" */
inode->i_mapping->a_ops = &empty_aops;
- inode->i_op = &none_inode_operations;
- inode->i_fop = &none_file_operations;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
ui = ubifs_inode(inode);