aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/jbd2/commit.c14
-rw-r--r--fs/jbd2/transaction.c58
-rw-r--r--include/linux/jbd2.h15
3 files changed, 73 insertions, 14 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6393fd0..f22d182 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int flags;
int err;
unsigned long long blocknr;
+ ktime_t start_time;
+ u64 commit_time;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
@@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
+ start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
@@ -995,6 +998,17 @@ restart_loop:
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in the commit time
+ */
+ if (likely(journal->j_average_commit_time))
+ journal->j_average_commit_time = (commit_time +
+ journal->j_average_commit_time*3) / 4;
+ else
+ journal->j_average_commit_time = commit_time;
spin_unlock(&journal->j_state_lock);
if (journal->j_commit_callback)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805..13dcbc9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/hrtimer.h>
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
+ transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
@@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
- int old_handle_count, err;
+ int err;
pid_t pid;
J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle)
/*
* Implement synchronous transaction batching. If the handle
* was synchronous, don't force a commit immediately. Let's
- * yield and let another thread piggyback onto this transaction.
- * Keep doing that while new threads continue to arrive.
- * It doesn't cost much - we're about to run a commit and sleep
- * on IO anyway. Speeds up many-threaded, many-dir operations
- * by 30x or more...
+ * yield and let another thread piggyback onto this
+ * transaction. Keep doing that while new threads continue to
+ * arrive. It doesn't cost much - we're about to run a commit
+ * and sleep on IO anyway. Speeds up many-threaded, many-dir
+ * operations by 30x or more...
+ *
+ * We try and optimize the sleep time against what the
+ * underlying disk can do, instead of having a static sleep
+ * time. This is useful for the case where our storage is so
+ * fast that it is more optimal to go ahead and force a flush
+ * and wait for the transaction to be committed than it is to
+ * wait for an arbitrary amount of time for new writers to
+ * join the transaction. We achieve this by measuring how
+ * long it takes to commit a transaction, and compare it with
+ * how long this transaction has been running, and if run time
+ * < commit time then we sleep for the delta and commit. This
+ * greatly helps super fast disks that would see slowdowns as
+ * more threads started doing fsyncs.
*
- * But don't do this if this process was the most recent one to
- * perform a synchronous write. We do this to detect the case where a
- * single process is doing a stream of sync writes. No point in waiting
- * for joiners in that case.
+ * But don't do this if this process was the most recent one
+ * to perform a synchronous write. We do this to detect the
+ * case where a single process is doing a stream of sync
+ * writes. No point in waiting for joiners in that case.
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ u64 commit_time, trans_time;
+
journal->j_last_sync_writer = pid;
- do {
- old_handle_count = transaction->t_handle_count;
- schedule_timeout_uninterruptible(1);
- } while (old_handle_count != transaction->t_handle_count);
+
+ spin_lock(&journal->j_state_lock);
+ commit_time = journal->j_average_commit_time;
+ spin_unlock(&journal->j_state_lock);
+
+ trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+ transaction->t_start_time));
+
+ commit_time = min_t(u64, commit_time,
+ 1000*jiffies_to_usecs(1));
+
+ if (trans_time < commit_time) {
+ ktime_t expires = ktime_add_ns(ktime_get(),
+ commit_time);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
}
current->journal_info = NULL;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f366457..ab8cef1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -638,6 +638,11 @@ struct transaction_s
unsigned long t_expires;
/*
+ * When this transaction started, in nanoseconds [no locking]
+ */
+ ktime_t t_start_time;
+
+ /*
* How many handles used this transaction? [t_handle_lock]
*/
int t_handle_count;
@@ -939,8 +944,18 @@ struct journal_s
struct buffer_head **j_wbuf;
int j_wbufsize;
+ /*
+ * this is the pid of hte last person to run a synchronous operation
+ * through the journal
+ */
pid_t j_last_sync_writer;
+ /*
+ * the average amount of time in nanoseconds it takes to commit a
+ * transaction to disk. [j_state_lock]
+ */
+ u64 j_average_commit_time;
+
/* This function is called when a transaction is closed */
void (*j_commit_callback)(journal_t *,
transaction_t *);