diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.20.3-ext3cow.patch b/pkgs/os-specific/linux/kernel/linux-2.6.20.3-ext3cow.patch deleted file mode 100644 index 22704f1a1db7..000000000000 --- a/pkgs/os-specific/linux/kernel/linux-2.6.20.3-ext3cow.patch +++ /dev/null @@ -1,18494 +0,0 @@ -diff -ruN linux-2.6.20.3/fs/ext3cow/acl.c linux-2.6.20.3-ext3cow/fs/ext3cow/acl.c ---- linux-2.6.20.3/fs/ext3cow/acl.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/acl.c 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,551 @@ -+/* -+ * linux/fs/ext3cow/acl.c -+ * -+ * Copyright (C) 2001-2003 Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+#include "acl.h" -+ -+/* -+ * Convert from filesystem to in-memory representation. -+ */ -+static struct posix_acl * -+ext3cow_acl_from_disk(const void *value, size_t size) -+{ -+ const char *end = (char *)value + size; -+ int n, count; -+ struct posix_acl *acl; -+ -+ if (!value) -+ return NULL; -+ if (size < sizeof(ext3cow_acl_header)) -+ return ERR_PTR(-EINVAL); -+ if (((ext3cow_acl_header *)value)->a_version != -+ cpu_to_le32(EXT3COW_ACL_VERSION)) -+ return ERR_PTR(-EINVAL); -+ value = (char *)value + sizeof(ext3cow_acl_header); -+ count = ext3cow_acl_count(size); -+ if (count < 0) -+ return ERR_PTR(-EINVAL); -+ if (count == 0) -+ return NULL; -+ acl = posix_acl_alloc(count, GFP_KERNEL); -+ if (!acl) -+ return ERR_PTR(-ENOMEM); -+ for (n=0; n < count; n++) { -+ ext3cow_acl_entry *entry = -+ (ext3cow_acl_entry *)value; -+ if ((char *)value + sizeof(ext3cow_acl_entry_short) > end) -+ goto fail; -+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); -+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); -+ switch(acl->a_entries[n].e_tag) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ value = (char *)value + -+ sizeof(ext3cow_acl_entry_short); -+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID; -+ break; -+ -+ case ACL_USER: -+ case ACL_GROUP: -+ value = (char *)value + sizeof(ext3cow_acl_entry); -+ if ((char *)value > end) -+ goto fail; -+ acl->a_entries[n].e_id = -+ le32_to_cpu(entry->e_id); -+ break; -+ -+ default: -+ goto fail; -+ } -+ } -+ if (value != end) -+ goto fail; -+ return acl; -+ -+fail: -+ posix_acl_release(acl); -+ return ERR_PTR(-EINVAL); -+} -+ -+/* -+ * Convert from in-memory to filesystem representation. 
-+ */ -+static void * -+ext3cow_acl_to_disk(const struct posix_acl *acl, size_t *size) -+{ -+ ext3cow_acl_header *ext_acl; -+ char *e; -+ size_t n; -+ -+ *size = ext3cow_acl_size(acl->a_count); -+ ext_acl = kmalloc(sizeof(ext3cow_acl_header) + acl->a_count * -+ sizeof(ext3cow_acl_entry), GFP_KERNEL); -+ if (!ext_acl) -+ return ERR_PTR(-ENOMEM); -+ ext_acl->a_version = cpu_to_le32(EXT3COW_ACL_VERSION); -+ e = (char *)ext_acl + sizeof(ext3cow_acl_header); -+ for (n=0; n < acl->a_count; n++) { -+ ext3cow_acl_entry *entry = (ext3cow_acl_entry *)e; -+ entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); -+ entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); -+ switch(acl->a_entries[n].e_tag) { -+ case ACL_USER: -+ case ACL_GROUP: -+ entry->e_id = -+ cpu_to_le32(acl->a_entries[n].e_id); -+ e += sizeof(ext3cow_acl_entry); -+ break; -+ -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ e += sizeof(ext3cow_acl_entry_short); -+ break; -+ -+ default: -+ goto fail; -+ } -+ } -+ return (char *)ext_acl; -+ -+fail: -+ kfree(ext_acl); -+ return ERR_PTR(-EINVAL); -+} -+ -+static inline struct posix_acl * -+ext3cow_iget_acl(struct inode *inode, struct posix_acl **i_acl) -+{ -+ struct posix_acl *acl = EXT3COW_ACL_NOT_CACHED; -+ -+ spin_lock(&inode->i_lock); -+ if (*i_acl != EXT3COW_ACL_NOT_CACHED) -+ acl = posix_acl_dup(*i_acl); -+ spin_unlock(&inode->i_lock); -+ -+ return acl; -+} -+ -+static inline void -+ext3cow_iset_acl(struct inode *inode, struct posix_acl **i_acl, -+ struct posix_acl *acl) -+{ -+ spin_lock(&inode->i_lock); -+ if (*i_acl != EXT3COW_ACL_NOT_CACHED) -+ posix_acl_release(*i_acl); -+ *i_acl = posix_acl_dup(acl); -+ spin_unlock(&inode->i_lock); -+} -+ -+/* -+ * Inode operation get_posix_acl(). -+ * -+ * inode->i_mutex: don't care -+ */ -+static struct posix_acl * -+ext3cow_get_acl(struct inode *inode, int type) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ int name_index; -+ char *value = NULL; -+ struct posix_acl *acl; -+ int retval; -+ -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return NULL; -+ -+ switch(type) { -+ case ACL_TYPE_ACCESS: -+ acl = ext3cow_iget_acl(inode, &ei->i_acl); -+ if (acl != EXT3COW_ACL_NOT_CACHED) -+ return acl; -+ name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS; -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ acl = ext3cow_iget_acl(inode, &ei->i_default_acl); -+ if (acl != EXT3COW_ACL_NOT_CACHED) -+ return acl; -+ name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ break; -+ -+ default: -+ return ERR_PTR(-EINVAL); -+ } -+ retval = ext3cow_xattr_get(inode, name_index, "", NULL, 0); -+ if (retval > 0) { -+ value = kmalloc(retval, GFP_KERNEL); -+ if (!value) -+ return ERR_PTR(-ENOMEM); -+ retval = ext3cow_xattr_get(inode, name_index, "", value, retval); -+ } -+ if (retval > 0) -+ acl = ext3cow_acl_from_disk(value, retval); -+ else if (retval == -ENODATA || retval == -ENOSYS) -+ acl = NULL; -+ else -+ acl = ERR_PTR(retval); -+ kfree(value); -+ -+ if (!IS_ERR(acl)) { -+ switch(type) { -+ case ACL_TYPE_ACCESS: -+ ext3cow_iset_acl(inode, &ei->i_acl, acl); -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ ext3cow_iset_acl(inode, &ei->i_default_acl, acl); -+ break; -+ } -+ } -+ return acl; -+} -+ -+/* -+ * Set the access or default ACL of an inode. 
-+ * -+ * inode->i_mutex: down unless called from ext3cow_new_inode -+ */ -+static int -+ext3cow_set_acl(handle_t *handle, struct inode *inode, int type, -+ struct posix_acl *acl) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ int name_index; -+ void *value = NULL; -+ size_t size = 0; -+ int error; -+ -+ if (S_ISLNK(inode->i_mode)) -+ return -EOPNOTSUPP; -+ -+ switch(type) { -+ case ACL_TYPE_ACCESS: -+ name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS; -+ if (acl) { -+ mode_t mode = inode->i_mode; -+ error = posix_acl_equiv_mode(acl, &mode); -+ if (error < 0) -+ return error; -+ else { -+ inode->i_mode = mode; -+ ext3cow_mark_inode_dirty(handle, inode); -+ if (error == 0) -+ acl = NULL; -+ } -+ } -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ name_index = EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ if (!S_ISDIR(inode->i_mode)) -+ return acl ? -EACCES : 0; -+ break; -+ -+ default: -+ return -EINVAL; -+ } -+ if (acl) { -+ value = ext3cow_acl_to_disk(acl, &size); -+ if (IS_ERR(value)) -+ return (int)PTR_ERR(value); -+ } -+ -+ error = ext3cow_xattr_set_handle(handle, inode, name_index, "", -+ value, size, 0); -+ -+ kfree(value); -+ if (!error) { -+ switch(type) { -+ case ACL_TYPE_ACCESS: -+ ext3cow_iset_acl(inode, &ei->i_acl, acl); -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ ext3cow_iset_acl(inode, &ei->i_default_acl, acl); -+ break; -+ } -+ } -+ return error; -+} -+ -+static int -+ext3cow_check_acl(struct inode *inode, int mask) -+{ -+ struct posix_acl *acl = ext3cow_get_acl(inode, ACL_TYPE_ACCESS); -+ -+ if (IS_ERR(acl)) -+ return PTR_ERR(acl); -+ if (acl) { -+ int error = posix_acl_permission(inode, acl, mask); -+ posix_acl_release(acl); -+ return error; -+ } -+ -+ return -EAGAIN; -+} -+ -+int -+ext3cow_permission(struct inode *inode, int mask, struct nameidata *nd) -+{ -+ return generic_permission(inode, mask, ext3cow_check_acl); -+} -+ -+/* -+ * Initialize the ACLs of a new inode. Called from ext3cow_new_inode. -+ * -+ * dir->i_mutex: down -+ * inode->i_mutex: up (access to inode is still exclusive) -+ */ -+int -+ext3cow_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -+{ -+ struct posix_acl *acl = NULL; -+ int error = 0; -+ -+ if (!S_ISLNK(inode->i_mode)) { -+ if (test_opt(dir->i_sb, POSIX_ACL)) { -+ acl = ext3cow_get_acl(dir, ACL_TYPE_DEFAULT); -+ if (IS_ERR(acl)) -+ return PTR_ERR(acl); -+ } -+ if (!acl) -+ inode->i_mode &= ~current->fs->umask; -+ } -+ if (test_opt(inode->i_sb, POSIX_ACL) && acl) { -+ struct posix_acl *clone; -+ mode_t mode; -+ -+ if (S_ISDIR(inode->i_mode)) { -+ error = ext3cow_set_acl(handle, inode, -+ ACL_TYPE_DEFAULT, acl); -+ if (error) -+ goto cleanup; -+ } -+ clone = posix_acl_clone(acl, GFP_KERNEL); -+ error = -ENOMEM; -+ if (!clone) -+ goto cleanup; -+ -+ mode = inode->i_mode; -+ error = posix_acl_create_masq(clone, &mode); -+ if (error >= 0) { -+ inode->i_mode = mode; -+ if (error > 0) { -+ /* This is an extended ACL */ -+ error = ext3cow_set_acl(handle, inode, -+ ACL_TYPE_ACCESS, clone); -+ } -+ } -+ posix_acl_release(clone); -+ } -+cleanup: -+ posix_acl_release(acl); -+ return error; -+} -+ -+/* -+ * Does chmod for an inode that may have an Access Control List. The -+ * inode->i_mode field must be updated to the desired value by the caller -+ * before calling this function. -+ * Returns 0 on success, or a negative error number. 
-+ * -+ * We change the ACL rather than storing some ACL entries in the file -+ * mode permission bits (which would be more efficient), because that -+ * would break once additional permissions (like ACL_APPEND, ACL_DELETE -+ * for directories) are added. There are no more bits available in the -+ * file mode. -+ * -+ * inode->i_mutex: down -+ */ -+int -+ext3cow_acl_chmod(struct inode *inode) -+{ -+ struct posix_acl *acl, *clone; -+ int error; -+ -+ if (S_ISLNK(inode->i_mode)) -+ return -EOPNOTSUPP; -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return 0; -+ acl = ext3cow_get_acl(inode, ACL_TYPE_ACCESS); -+ if (IS_ERR(acl) || !acl) -+ return PTR_ERR(acl); -+ clone = posix_acl_clone(acl, GFP_KERNEL); -+ posix_acl_release(acl); -+ if (!clone) -+ return -ENOMEM; -+ error = posix_acl_chmod_masq(clone, inode->i_mode); -+ if (!error) { -+ handle_t *handle; -+ int retries = 0; -+ -+ retry: -+ handle = ext3cow_journal_start(inode, -+ EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ ext3cow_std_error(inode->i_sb, error); -+ goto out; -+ } -+ error = ext3cow_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); -+ ext3cow_journal_stop(handle); -+ if (error == -ENOSPC && -+ ext3cow_should_retry_alloc(inode->i_sb, &retries)) -+ goto retry; -+ } -+out: -+ posix_acl_release(clone); -+ return error; -+} -+ -+/* -+ * Extended attribute handlers -+ */ -+static size_t -+ext3cow_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, -+ const char *name, size_t name_len) -+{ -+ const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); -+ -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return 0; -+ if (list && size <= list_len) -+ memcpy(list, POSIX_ACL_XATTR_ACCESS, size); -+ return size; -+} -+ -+static size_t -+ext3cow_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, -+ const char *name, size_t name_len) -+{ -+ const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); -+ -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return 0; -+ if (list && size <= list_len) -+ memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); -+ return size; -+} -+ -+static int -+ext3cow_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return -EOPNOTSUPP; -+ -+ acl = ext3cow_get_acl(inode, type); -+ if (IS_ERR(acl)) -+ return PTR_ERR(acl); -+ if (acl == NULL) -+ return -ENODATA; -+ error = posix_acl_to_xattr(acl, buffer, size); -+ posix_acl_release(acl); -+ -+ return error; -+} -+ -+static int -+ext3cow_xattr_get_acl_access(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ if (strcmp(name, "") != 0) -+ return -EINVAL; -+ return ext3cow_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -+} -+ -+static int -+ext3cow_xattr_get_acl_default(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ if (strcmp(name, "") != 0) -+ return -EINVAL; -+ return ext3cow_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -+} -+ -+static int -+ext3cow_xattr_set_acl(struct inode *inode, int type, const void *value, -+ size_t size) -+{ -+ handle_t *handle; -+ struct posix_acl *acl; -+ int error, retries = 0; -+ -+ if (!test_opt(inode->i_sb, POSIX_ACL)) -+ return -EOPNOTSUPP; -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ -+ if (value) { -+ acl = posix_acl_from_xattr(value, size); -+ if (IS_ERR(acl)) -+ return PTR_ERR(acl); -+ else if (acl) { -+ error = posix_acl_valid(acl); -+ if (error) -+ goto 
release_and_out; -+ } -+ } else -+ acl = NULL; -+ -+retry: -+ handle = ext3cow_journal_start(inode, EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ error = ext3cow_set_acl(handle, inode, type, acl); -+ ext3cow_journal_stop(handle); -+ if (error == -ENOSPC && ext3cow_should_retry_alloc(inode->i_sb, &retries)) -+ goto retry; -+ -+release_and_out: -+ posix_acl_release(acl); -+ return error; -+} -+ -+static int -+ext3cow_xattr_set_acl_access(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ if (strcmp(name, "") != 0) -+ return -EINVAL; -+ return ext3cow_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -+} -+ -+static int -+ext3cow_xattr_set_acl_default(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ if (strcmp(name, "") != 0) -+ return -EINVAL; -+ return ext3cow_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -+} -+ -+struct xattr_handler ext3cow_xattr_acl_access_handler = { -+ .prefix = POSIX_ACL_XATTR_ACCESS, -+ .list = ext3cow_xattr_list_acl_access, -+ .get = ext3cow_xattr_get_acl_access, -+ .set = ext3cow_xattr_set_acl_access, -+}; -+ -+struct xattr_handler ext3cow_xattr_acl_default_handler = { -+ .prefix = POSIX_ACL_XATTR_DEFAULT, -+ .list = ext3cow_xattr_list_acl_default, -+ .get = ext3cow_xattr_get_acl_default, -+ .set = ext3cow_xattr_set_acl_default, -+}; -diff -ruN linux-2.6.20.3/fs/ext3cow/acl.h linux-2.6.20.3-ext3cow/fs/ext3cow/acl.h ---- linux-2.6.20.3/fs/ext3cow/acl.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/acl.h 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,81 @@ -+/* -+ File: fs/ext3cow/acl.h -+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+ -+#define EXT3COW_ACL_VERSION 0x0001 -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+ __le32 e_id; -+} ext3cow_acl_entry; -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+} ext3cow_acl_entry_short; -+ -+typedef struct { -+ __le32 a_version; -+} ext3cow_acl_header; -+ -+static inline size_t ext3cow_acl_size(int count) -+{ -+ if (count <= 4) { -+ return sizeof(ext3cow_acl_header) + -+ count * sizeof(ext3cow_acl_entry_short); -+ } else { -+ return sizeof(ext3cow_acl_header) + -+ 4 * sizeof(ext3cow_acl_entry_short) + -+ (count - 4) * sizeof(ext3cow_acl_entry); -+ } -+} -+ -+static inline int ext3cow_acl_count(size_t size) -+{ -+ ssize_t s; -+ size -= sizeof(ext3cow_acl_header); -+ s = size - 4 * sizeof(ext3cow_acl_entry_short); -+ if (s < 0) { -+ if (size % sizeof(ext3cow_acl_entry_short)) -+ return -1; -+ return size / sizeof(ext3cow_acl_entry_short); -+ } else { -+ if (s % sizeof(ext3cow_acl_entry)) -+ return -1; -+ return s / sizeof(ext3cow_acl_entry) + 4; -+ } -+} -+ -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ -+/* Value for inode->u.ext3cow_i.i_acl and inode->u.ext3cow_i.i_default_acl -+ if the ACL has not been cached */ -+#define EXT3COW_ACL_NOT_CACHED ((void *)-1) -+ -+/* acl.c */ -+extern int ext3cow_permission (struct inode *, int, struct nameidata *); -+extern int ext3cow_acl_chmod (struct inode *); -+extern int ext3cow_init_acl (handle_t *, struct inode *, struct inode *); -+ -+#else /* CONFIG_EXT3COW_FS_POSIX_ACL */ -+#include -+#define ext3cow_permission NULL -+ -+static inline int -+ext3cow_acl_chmod(struct inode *inode) -+{ -+ return 0; -+} -+ -+static inline int -+ext3cow_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -+{ -+ return 0; -+} -+#endif /* CONFIG_EXT3COW_FS_POSIX_ACL */ -+ -diff -ruN 
linux-2.6.20.3/fs/ext3cow/balloc.c linux-2.6.20.3-ext3cow/fs/ext3cow/balloc.c ---- linux-2.6.20.3/fs/ext3cow/balloc.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/balloc.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,1823 @@ -+/* -+ * linux/fs/ext3cow/balloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * balloc.c contains the blocks allocation and deallocation routines -+ */ -+ -+/* -+ * The free blocks are managed by bitmaps. A file system contains several -+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. -+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. The descriptors are loaded in memory -+ * when a file system is mounted (see ext3cow_read_super). -+ */ -+ -+ -+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -+ -+/** -+ * ext3cow_get_group_desc() -- load group descriptor from disk -+ * @sb: super block -+ * @block_group: given block group -+ * @bh: pointer to the buffer head to store the block -+ * group descriptor -+ */ -+struct ext3cow_group_desc * ext3cow_get_group_desc(struct super_block * sb, -+ unsigned int block_group, -+ struct buffer_head ** bh) -+{ -+ unsigned long group_desc; -+ unsigned long offset; -+ struct ext3cow_group_desc * desc; -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ -+ if (block_group >= sbi->s_groups_count) { -+ ext3cow_error (sb, "ext3cow_get_group_desc", -+ "block_group >= groups_count - " -+ "block_group = %d, groups_count = %lu", -+ block_group, sbi->s_groups_count); -+ -+ return NULL; -+ } -+ smp_rmb(); -+ -+ group_desc = block_group >> EXT3COW_DESC_PER_BLOCK_BITS(sb); -+ offset = block_group & (EXT3COW_DESC_PER_BLOCK(sb) - 1); -+ if (!sbi->s_group_desc[group_desc]) { -+ ext3cow_error (sb, "ext3cow_get_group_desc", -+ "Group descriptor not loaded - " -+ "block_group = %d, group_desc = %lu, desc = %lu", -+ block_group, group_desc, offset); -+ return NULL; -+ } -+ -+ desc = (struct ext3cow_group_desc *) sbi->s_group_desc[group_desc]->b_data; -+ if (bh) -+ *bh = sbi->s_group_desc[group_desc]; -+ return desc + offset; -+} -+ -+/** -+ * read_block_bitmap() -+ * @sb: super block -+ * @block_group: given block group -+ * -+ * Read the bitmap for a given block_group, reading into the specified -+ * slot in the superblock's bitmap cache. -+ * -+ * Return buffer_head on success or NULL in case of failure. 
-+ */ -+static struct buffer_head * -+read_block_bitmap(struct super_block *sb, unsigned int block_group) -+{ -+ struct ext3cow_group_desc * desc; -+ struct buffer_head * bh = NULL; -+ -+ desc = ext3cow_get_group_desc (sb, block_group, NULL); -+ if (!desc) -+ goto error_out; -+ bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap)); -+ if (!bh) -+ ext3cow_error (sb, "read_block_bitmap", -+ "Cannot read block bitmap - " -+ "block_group = %d, block_bitmap = %u", -+ block_group, le32_to_cpu(desc->bg_block_bitmap)); -+error_out: -+ return bh; -+} -+/* -+ * The reservation window structure operations -+ * -------------------------------------------- -+ * Operations include: -+ * dump, find, add, remove, is_empty, find_next_reservable_window, etc. -+ * -+ * We use a red-black tree to represent per-filesystem reservation -+ * windows. -+ * -+ */ -+ -+/** -+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map -+ * @rb_root: root of per-filesystem reservation rb tree -+ * @verbose: verbose mode -+ * @fn: function which wishes to dump the reservation map -+ * -+ * If verbose is turned on, it will print the whole block reservation -+ * windows(start, end). Otherwise, it will only print out the "bad" windows, -+ * those windows that overlap with their immediate neighbors. -+ */ -+#if 1 -+static void __rsv_window_dump(struct rb_root *root, int verbose, -+ const char *fn) -+{ -+ struct rb_node *n; -+ struct ext3cow_reserve_window_node *rsv, *prev; -+ int bad; -+ -+restart: -+ n = rb_first(root); -+ bad = 0; -+ prev = NULL; -+ -+ printk("Block Allocation Reservation Windows Map (%s):\n", fn); -+ while (n) { -+ rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); -+ if (verbose) -+ printk("reservation window 0x%p " -+ "start: %lu, end: %lu\n", -+ rsv, rsv->rsv_start, rsv->rsv_end); -+ if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { -+ printk("Bad reservation %p (start >= end)\n", -+ rsv); -+ bad = 1; -+ } -+ if (prev && prev->rsv_end >= rsv->rsv_start) { -+ printk("Bad reservation %p (prev->end >= start)\n", -+ rsv); -+ bad = 1; -+ } -+ if (bad) { -+ if (!verbose) { -+ printk("Restarting reservation walk in verbose mode\n"); -+ verbose = 1; -+ goto restart; -+ } -+ } -+ n = rb_next(n); -+ prev = rsv; -+ } -+ printk("Window map complete.\n"); -+ if (bad) -+ BUG(); -+} -+#define rsv_window_dump(root, verbose) \ -+ __rsv_window_dump((root), (verbose), __FUNCTION__) -+#else -+#define rsv_window_dump(root, verbose) do {} while (0) -+#endif -+ -+/** -+ * goal_in_my_reservation() -+ * @rsv: inode's reservation window -+ * @grp_goal: given goal block relative to the allocation block group -+ * @group: the current allocation block group -+ * @sb: filesystem super block -+ * -+ * Test if the given goal block (group relative) is within the file's -+ * own block reservation window range. -+ * -+ * If the reservation window is outside the goal allocation group, return 0; -+ * grp_goal (given goal block) could be -1, which means no specific -+ * goal block. In this case, always return 1. 
-+ * If the goal block is within the reservation window, return 1; -+ * otherwise, return 0; -+ */ -+static int -+goal_in_my_reservation(struct ext3cow_reserve_window *rsv, ext3cow_grpblk_t grp_goal, -+ unsigned int group, struct super_block * sb) -+{ -+ ext3cow_fsblk_t group_first_block, group_last_block; -+ -+ group_first_block = ext3cow_group_first_block_no(sb, group); -+ group_last_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); -+ -+ if ((rsv->_rsv_start > group_last_block) || -+ (rsv->_rsv_end < group_first_block)) -+ return 0; -+ if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) -+ || (grp_goal + group_first_block > rsv->_rsv_end))) -+ return 0; -+ return 1; -+} -+ -+/** -+ * search_reserve_window() -+ * @rb_root: root of reservation tree -+ * @goal: target allocation block -+ * -+ * Find the reserved window which includes the goal, or the previous one -+ * if the goal is not in any window. -+ * Returns NULL if there are no windows or if all windows start after the goal. -+ */ -+static struct ext3cow_reserve_window_node * -+search_reserve_window(struct rb_root *root, ext3cow_fsblk_t goal) -+{ -+ struct rb_node *n = root->rb_node; -+ struct ext3cow_reserve_window_node *rsv; -+ -+ if (!n) -+ return NULL; -+ -+ do { -+ rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); -+ -+ if (goal < rsv->rsv_start) -+ n = n->rb_left; -+ else if (goal > rsv->rsv_end) -+ n = n->rb_right; -+ else -+ return rsv; -+ } while (n); -+ /* -+ * We've fallen off the end of the tree: the goal wasn't inside -+ * any particular node. OK, the previous node must be to one -+ * side of the interval containing the goal. If it's the RHS, -+ * we need to back up one. -+ */ -+ if (rsv->rsv_start > goal) { -+ n = rb_prev(&rsv->rsv_node); -+ rsv = rb_entry(n, struct ext3cow_reserve_window_node, rsv_node); -+ } -+ return rsv; -+} -+ -+/** -+ * ext3cow_rsv_window_add() -- Insert a window to the block reservation rb tree. -+ * @sb: super block -+ * @rsv: reservation window to add -+ * -+ * Must be called with rsv_lock hold. -+ */ -+void ext3cow_rsv_window_add(struct super_block *sb, -+ struct ext3cow_reserve_window_node *rsv) -+{ -+ struct rb_root *root = &EXT3COW_SB(sb)->s_rsv_window_root; -+ struct rb_node *node = &rsv->rsv_node; -+ ext3cow_fsblk_t start = rsv->rsv_start; -+ -+ struct rb_node ** p = &root->rb_node; -+ struct rb_node * parent = NULL; -+ struct ext3cow_reserve_window_node *this; -+ -+ while (*p) -+ { -+ parent = *p; -+ this = rb_entry(parent, struct ext3cow_reserve_window_node, rsv_node); -+ -+ if (start < this->rsv_start) -+ p = &(*p)->rb_left; -+ else if (start > this->rsv_end) -+ p = &(*p)->rb_right; -+ else { -+ rsv_window_dump(root, 1); -+ BUG(); -+ } -+ } -+ -+ rb_link_node(node, parent, p); -+ rb_insert_color(node, root); -+} -+ -+/** -+ * ext3cow_rsv_window_remove() -- unlink a window from the reservation rb tree -+ * @sb: super block -+ * @rsv: reservation window to remove -+ * -+ * Mark the block reservation window as not allocated, and unlink it -+ * from the filesystem reservation window rb tree. Must be called with -+ * rsv_lock hold. -+ */ -+static void rsv_window_remove(struct super_block *sb, -+ struct ext3cow_reserve_window_node *rsv) -+{ -+ rsv->rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ rsv->rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ rsv->rsv_alloc_hit = 0; -+ rb_erase(&rsv->rsv_node, &EXT3COW_SB(sb)->s_rsv_window_root); -+} -+ -+/* -+ * rsv_is_empty() -- Check if the reservation window is allocated. 
-+ * @rsv: given reservation window to check -+ * -+ * returns 1 if the end block is EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED. -+ */ -+static inline int rsv_is_empty(struct ext3cow_reserve_window *rsv) -+{ -+ /* a valid reservation end block could not be 0 */ -+ return rsv->_rsv_end == EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+} -+ -+/** -+ * ext3cow_init_block_alloc_info() -+ * @inode: file inode structure -+ * -+ * Allocate and initialize the reservation window structure, and -+ * link the window to the ext3cow inode structure at last -+ * -+ * The reservation window structure is only dynamically allocated -+ * and linked to ext3cow inode the first time the open file -+ * needs a new block. So, before every ext3cow_new_block(s) call, for -+ * regular files, we should check whether the reservation window -+ * structure exists or not. In the latter case, this function is called. -+ * Fail to do so will result in block reservation being turned off for that -+ * open file. -+ * -+ * This function is called from ext3cow_get_blocks_handle(), also called -+ * when setting the reservation window size through ioctl before the file -+ * is open for write (needs block allocation). -+ * -+ * Needs truncate_mutex protection prior to call this function. -+ */ -+void ext3cow_init_block_alloc_info(struct inode *inode) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ struct ext3cow_block_alloc_info *block_i = ei->i_block_alloc_info; -+ struct super_block *sb = inode->i_sb; -+ -+ block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -+ if (block_i) { -+ struct ext3cow_reserve_window_node *rsv = &block_i->rsv_window_node; -+ -+ rsv->rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ rsv->rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ -+ /* -+ * if filesystem is mounted with NORESERVATION, the goal -+ * reservation window size is set to zero to indicate -+ * block reservation is off -+ */ -+ if (!test_opt(sb, RESERVATION)) -+ rsv->rsv_goal_size = 0; -+ else -+ rsv->rsv_goal_size = EXT3COW_DEFAULT_RESERVE_BLOCKS; -+ rsv->rsv_alloc_hit = 0; -+ block_i->last_alloc_logical_block = 0; -+ block_i->last_alloc_physical_block = 0; -+ } -+ ei->i_block_alloc_info = block_i; -+} -+ -+/** -+ * ext3cow_discard_reservation() -+ * @inode: inode -+ * -+ * Discard(free) block reservation window on last file close, or truncate -+ * or at last iput(). -+ * -+ * It is being called in three cases: -+ * ext3cow_release_file(): last writer close the file -+ * ext3cow_clear_inode(): last iput(), when nobody link to this file. -+ * ext3cow_truncate(): when the block indirect map is about to change. 
-+ * -+ */ -+void ext3cow_discard_reservation(struct inode *inode) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ struct ext3cow_block_alloc_info *block_i = ei->i_block_alloc_info; -+ struct ext3cow_reserve_window_node *rsv; -+ spinlock_t *rsv_lock = &EXT3COW_SB(inode->i_sb)->s_rsv_window_lock; -+ -+ if (!block_i) -+ return; -+ -+ rsv = &block_i->rsv_window_node; -+ if (!rsv_is_empty(&rsv->rsv_window)) { -+ spin_lock(rsv_lock); -+ if (!rsv_is_empty(&rsv->rsv_window)) -+ rsv_window_remove(inode->i_sb, rsv); -+ spin_unlock(rsv_lock); -+ } -+} -+ -+/** -+ * ext3cow_free_blocks_sb() -- Free given blocks and update quota -+ * @handle: handle to this transaction -+ * @sb: super block -+ * @block: start physcial block to free -+ * @count: number of blocks to free -+ * @pdquot_freed_blocks: pointer to quota -+ */ -+void ext3cow_free_blocks_sb(handle_t *handle, struct super_block *sb, -+ ext3cow_fsblk_t block, unsigned long count, -+ unsigned long *pdquot_freed_blocks) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct buffer_head *gd_bh; -+ unsigned long block_group; -+ ext3cow_grpblk_t bit; -+ unsigned long i; -+ unsigned long overflow; -+ struct ext3cow_group_desc * desc; -+ struct ext3cow_super_block * es; -+ struct ext3cow_sb_info *sbi; -+ int err = 0, ret; -+ ext3cow_grpblk_t group_freed; -+ -+ *pdquot_freed_blocks = 0; -+ sbi = EXT3COW_SB(sb); -+ es = sbi->s_es; -+ if (block < le32_to_cpu(es->s_first_data_block) || -+ block + count < block || -+ block + count > le32_to_cpu(es->s_blocks_count)) { -+ ext3cow_error (sb, "ext3cow_free_blocks", -+ "Freeing blocks not in datazone - " -+ "block = "E3FSBLK", count = %lu", block, count); -+ goto error_return; -+ } -+ -+ //TODO: Remove: -+ printk(KERN_INFO "freeing block(s) %lu-%lu\n", block, block + count - 1); -+ ext3cow_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1); -+ -+do_more: -+ overflow = 0; -+ block_group = (block - le32_to_cpu(es->s_first_data_block)) / -+ EXT3COW_BLOCKS_PER_GROUP(sb); -+ bit = (block - le32_to_cpu(es->s_first_data_block)) % -+ EXT3COW_BLOCKS_PER_GROUP(sb); -+ /* -+ * Check to see if we are freeing blocks across a group -+ * boundary. -+ */ -+ if (bit + count > EXT3COW_BLOCKS_PER_GROUP(sb)) { -+ overflow = bit + count - EXT3COW_BLOCKS_PER_GROUP(sb); -+ count -= overflow; -+ } -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ desc = ext3cow_get_group_desc (sb, block_group, &gd_bh); -+ if (!desc) -+ goto error_return; -+ -+ if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) || -+ in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) || -+ in_range (block, le32_to_cpu(desc->bg_inode_table), -+ sbi->s_itb_per_group) || -+ in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table), -+ sbi->s_itb_per_group)) -+ ext3cow_error (sb, "ext3cow_free_blocks", -+ "Freeing blocks in system zones - " -+ "Block = "E3FSBLK", count = %lu", -+ block, count); -+ -+ /* -+ * We are about to start releasing blocks in the bitmap, -+ * so we need undo access. -+ */ -+ /* @@@ check errors */ -+ BUFFER_TRACE(bitmap_bh, "getting undo access"); -+ err = ext3cow_journal_get_undo_access(handle, bitmap_bh); -+ if (err) -+ goto error_return; -+ -+ /* -+ * We are about to modify some metadata. 
Call the journal APIs -+ * to unshare ->b_data if a currently-committing transaction is -+ * using it -+ */ -+ BUFFER_TRACE(gd_bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, gd_bh); -+ if (err) -+ goto error_return; -+ -+ jbd_lock_bh_state(bitmap_bh); -+ -+ for (i = 0, group_freed = 0; i < count; i++) { -+ /* -+ * An HJ special. This is expensive... -+ */ -+#ifdef CONFIG_JBD_DEBUG -+ jbd_unlock_bh_state(bitmap_bh); -+ { -+ struct buffer_head *debug_bh; -+ debug_bh = sb_find_get_block(sb, block + i); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "Deleted!"); -+ if (!bh2jh(bitmap_bh)->b_committed_data) -+ BUFFER_TRACE(debug_bh, -+ "No commited data in bitmap"); -+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); -+ __brelse(debug_bh); -+ } -+ } -+ jbd_lock_bh_state(bitmap_bh); -+#endif -+ if (need_resched()) { -+ jbd_unlock_bh_state(bitmap_bh); -+ cond_resched(); -+ jbd_lock_bh_state(bitmap_bh); -+ } -+ /* @@@ This prevents newly-allocated data from being -+ * freed and then reallocated within the same -+ * transaction. -+ * -+ * Ideally we would want to allow that to happen, but to -+ * do so requires making journal_forget() capable of -+ * revoking the queued write of a data block, which -+ * implies blocking on the journal lock. *forget() -+ * cannot block due to truncate races. -+ * -+ * Eventually we can fix this by making journal_forget() -+ * return a status indicating whether or not it was able -+ * to revoke the buffer. On successful revoke, it is -+ * safe not to set the allocation bit in the committed -+ * bitmap, because we know that there is no outstanding -+ * activity on the buffer any more and so it is safe to -+ * reallocate it. -+ */ -+ BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); -+ J_ASSERT_BH(bitmap_bh, -+ bh2jh(bitmap_bh)->b_committed_data != NULL); -+ ext3cow_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, -+ bh2jh(bitmap_bh)->b_committed_data); -+ -+ /* -+ * We clear the bit in the bitmap after setting the committed -+ * data bit, because this is the reverse order to that which -+ * the allocator uses. 
-+ */ -+ BUFFER_TRACE(bitmap_bh, "clear bit"); -+ if (!ext3cow_clear_bit_atomic(sb_bgl_lock(sbi, block_group), -+ bit + i, bitmap_bh->b_data)) { -+ jbd_unlock_bh_state(bitmap_bh); -+ ext3cow_error(sb, __FUNCTION__, -+ "bit already cleared for block "E3FSBLK, -+ block + i); -+ jbd_lock_bh_state(bitmap_bh); -+ BUFFER_TRACE(bitmap_bh, "bit already cleared"); -+ } else { -+ group_freed++; -+ } -+ } -+ jbd_unlock_bh_state(bitmap_bh); -+ -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ desc->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + -+ group_freed); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, count); -+ -+ /* We dirtied the bitmap block */ -+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); -+ err = ext3cow_journal_dirty_metadata(handle, bitmap_bh); -+ -+ /* And the group descriptor block */ -+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); -+ ret = ext3cow_journal_dirty_metadata(handle, gd_bh); -+ if (!err) err = ret; -+ *pdquot_freed_blocks += group_freed; -+ -+ if (overflow && !err) { -+ block += count; -+ count = overflow; -+ goto do_more; -+ } -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3cow_std_error(sb, err); -+ return; -+} -+ -+/** -+ * ext3cow_free_blocks() -- Free given blocks and update quota -+ * @handle: handle for this transaction -+ * @inode: inode -+ * @block: start physical block to free -+ * @count: number of blocks to count -+ */ -+void ext3cow_free_blocks(handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t block, unsigned long count) -+{ -+ struct super_block * sb; -+ unsigned long dquot_freed_blocks; -+ -+ sb = inode->i_sb; -+ if (!sb) { -+ printk ("ext3cow_free_blocks: nonexistent device"); -+ return; -+ } -+ ext3cow_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -+ if (dquot_freed_blocks) -+ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -+ return; -+} -+ -+/** -+ * ext3cow_test_allocatable() -+ * @nr: given allocation block group -+ * @bh: bufferhead contains the bitmap of the given block group -+ * -+ * For ext3cow allocations, we must not reuse any blocks which are -+ * allocated in the bitmap buffer's "last committed data" copy. This -+ * prevents deletes from freeing up the page for reuse until we have -+ * committed the delete transaction. -+ * -+ * If we didn't do this, then deleting something and reallocating it as -+ * data would allow the old block to be overwritten before the -+ * transaction committed (because we force data to disk before commit). -+ * This would lead to corruption if we crashed between overwriting the -+ * data and committing the delete. -+ * -+ * @@@ We may want to make this allocation behaviour conditional on -+ * data-writes at some point, and disable it for metadata allocations or -+ * sync-data inodes. 
-+ */ -+static int ext3cow_test_allocatable(ext3cow_grpblk_t nr, struct buffer_head *bh) -+{ -+ int ret; -+ struct journal_head *jh = bh2jh(bh); -+ -+ if (ext3cow_test_bit(nr, bh->b_data)) -+ return 0; -+ -+ jbd_lock_bh_state(bh); -+ if (!jh->b_committed_data) -+ ret = 1; -+ else -+ ret = !ext3cow_test_bit(nr, jh->b_committed_data); -+ jbd_unlock_bh_state(bh); -+ return ret; -+} -+ -+/** -+ * bitmap_search_next_usable_block() -+ * @start: the starting block (group relative) of the search -+ * @bh: bufferhead contains the block group bitmap -+ * @maxblocks: the ending block (group relative) of the reservation -+ * -+ * The bitmap search --- search forward alternately through the actual -+ * bitmap on disk and the last-committed copy in journal, until we find a -+ * bit free in both bitmaps. -+ */ -+static ext3cow_grpblk_t -+bitmap_search_next_usable_block(ext3cow_grpblk_t start, struct buffer_head *bh, -+ ext3cow_grpblk_t maxblocks) -+{ -+ ext3cow_grpblk_t next; -+ struct journal_head *jh = bh2jh(bh); -+ -+ while (start < maxblocks) { -+ next = ext3cow_find_next_zero_bit(bh->b_data, maxblocks, start); -+ if (next >= maxblocks) -+ return -1; -+ if (ext3cow_test_allocatable(next, bh)) -+ return next; -+ jbd_lock_bh_state(bh); -+ if (jh->b_committed_data) -+ start = ext3cow_find_next_zero_bit(jh->b_committed_data, -+ maxblocks, next); -+ jbd_unlock_bh_state(bh); -+ } -+ return -1; -+} -+ -+/** -+ * find_next_usable_block() -+ * @start: the starting block (group relative) to find next -+ * allocatable block in bitmap. -+ * @bh: bufferhead contains the block group bitmap -+ * @maxblocks: the ending block (group relative) for the search -+ * -+ * Find an allocatable block in a bitmap. We honor both the bitmap and -+ * its last-committed copy (if that exists), and perform the "most -+ * appropriate allocation" algorithm of looking for a free block near -+ * the initial goal; then for a free byte somewhere in the bitmap; then -+ * for any free bit in the bitmap. -+ */ -+static ext3cow_grpblk_t -+find_next_usable_block(ext3cow_grpblk_t start, struct buffer_head *bh, -+ ext3cow_grpblk_t maxblocks) -+{ -+ ext3cow_grpblk_t here, next; -+ char *p, *r; -+ -+ if (start > 0) { -+ /* -+ * The goal was occupied; search forward for a free -+ * block within the next XX blocks. -+ * -+ * end_goal is more or less random, but it has to be -+ * less than EXT3COW_BLOCKS_PER_GROUP. Aligning up to the -+ * next 64-bit boundary is simple.. -+ */ -+ ext3cow_grpblk_t end_goal = (start + 63) & ~63; -+ if (end_goal > maxblocks) -+ end_goal = maxblocks; -+ here = ext3cow_find_next_zero_bit(bh->b_data, end_goal, start); -+ if (here < end_goal && ext3cow_test_allocatable(here, bh)) -+ return here; -+ ext3cow_debug("Bit not found near goal\n"); -+ } -+ -+ here = start; -+ if (here < 0) -+ here = 0; -+ -+ p = ((char *)bh->b_data) + (here >> 3); -+ r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); -+ next = (r - ((char *)bh->b_data)) << 3; -+ -+ if (next < maxblocks && next >= start && ext3cow_test_allocatable(next, bh)) -+ return next; -+ -+ /* -+ * The bitmap search --- search forward alternately through the actual -+ * bitmap and the last-committed copy until we find a bit free in -+ * both -+ */ -+ here = bitmap_search_next_usable_block(here, bh, maxblocks); -+ return here; -+} -+ -+/** -+ * claim_block() -+ * @block: the free block (group relative) to allocate -+ * @bh: the bufferhead containts the block group bitmap -+ * -+ * We think we can allocate this block in this bitmap. Try to set the bit. 
-+ * If that succeeds then check that nobody has allocated and then freed the -+ * block since we saw that is was not marked in b_committed_data. If it _was_ -+ * allocated and freed then clear the bit in the bitmap again and return -+ * zero (failure). -+ */ -+static inline int -+claim_block(spinlock_t *lock, ext3cow_grpblk_t block, struct buffer_head *bh) -+{ -+ struct journal_head *jh = bh2jh(bh); -+ int ret; -+ -+ if (ext3cow_set_bit_atomic(lock, block, bh->b_data)) -+ return 0; -+ jbd_lock_bh_state(bh); -+ if (jh->b_committed_data && ext3cow_test_bit(block,jh->b_committed_data)) { -+ ext3cow_clear_bit_atomic(lock, block, bh->b_data); -+ ret = 0; -+ } else { -+ ret = 1; -+ } -+ jbd_unlock_bh_state(bh); -+ return ret; -+} -+ -+/** -+ * ext3cow_try_to_allocate() -+ * @sb: superblock -+ * @handle: handle to this transaction -+ * @group: given allocation block group -+ * @bitmap_bh: bufferhead holds the block bitmap -+ * @grp_goal: given target block within the group -+ * @count: target number of blocks to allocate -+ * @my_rsv: reservation window -+ * -+ * Attempt to allocate blocks within a give range. Set the range of allocation -+ * first, then find the first free bit(s) from the bitmap (within the range), -+ * and at last, allocate the blocks by claiming the found free bit as allocated. -+ * -+ * To set the range of this allocation: -+ * if there is a reservation window, only try to allocate block(s) from the -+ * file's own reservation window; -+ * Otherwise, the allocation range starts from the give goal block, ends at -+ * the block group's last block. -+ * -+ * If we failed to allocate the desired block then we may end up crossing to a -+ * new bitmap. In that case we must release write access to the old one via -+ * ext3cow_journal_release_buffer(), else we'll run out of credits. 
-+ */ -+static ext3cow_grpblk_t -+ext3cow_try_to_allocate(struct super_block *sb, handle_t *handle, int group, -+ struct buffer_head *bitmap_bh, ext3cow_grpblk_t grp_goal, -+ unsigned long *count, struct ext3cow_reserve_window *my_rsv) -+{ -+ ext3cow_fsblk_t group_first_block; -+ ext3cow_grpblk_t start, end; -+ unsigned long num = 0; -+ -+ /* we do allocation within the reservation window if we have a window */ -+ if (my_rsv) { -+ group_first_block = ext3cow_group_first_block_no(sb, group); -+ if (my_rsv->_rsv_start >= group_first_block) -+ start = my_rsv->_rsv_start - group_first_block; -+ else -+ /* reservation window cross group boundary */ -+ start = 0; -+ end = my_rsv->_rsv_end - group_first_block + 1; -+ if (end > EXT3COW_BLOCKS_PER_GROUP(sb)) -+ /* reservation window crosses group boundary */ -+ end = EXT3COW_BLOCKS_PER_GROUP(sb); -+ if ((start <= grp_goal) && (grp_goal < end)) -+ start = grp_goal; -+ else -+ grp_goal = -1; -+ } else { -+ if (grp_goal > 0) -+ start = grp_goal; -+ else -+ start = 0; -+ end = EXT3COW_BLOCKS_PER_GROUP(sb); -+ } -+ -+ BUG_ON(start > EXT3COW_BLOCKS_PER_GROUP(sb)); -+ -+repeat: -+ if (grp_goal < 0 || !ext3cow_test_allocatable(grp_goal, bitmap_bh)) { -+ grp_goal = find_next_usable_block(start, bitmap_bh, end); -+ if (grp_goal < 0) -+ goto fail_access; -+ if (!my_rsv) { -+ int i; -+ -+ for (i = 0; i < 7 && grp_goal > start && -+ ext3cow_test_allocatable(grp_goal - 1, -+ bitmap_bh); -+ i++, grp_goal--) -+ ; -+ } -+ } -+ start = grp_goal; -+ -+ if (!claim_block(sb_bgl_lock(EXT3COW_SB(sb), group), -+ grp_goal, bitmap_bh)) { -+ /* -+ * The block was allocated by another thread, or it was -+ * allocated and then freed by another thread -+ */ -+ start++; -+ grp_goal++; -+ if (start >= end) -+ goto fail_access; -+ goto repeat; -+ } -+ num++; -+ grp_goal++; -+ while (num < *count && grp_goal < end -+ && ext3cow_test_allocatable(grp_goal, bitmap_bh) -+ && claim_block(sb_bgl_lock(EXT3COW_SB(sb), group), -+ grp_goal, bitmap_bh)) { -+ num++; -+ grp_goal++; -+ } -+ *count = num; -+ return grp_goal - num; -+fail_access: -+ *count = num; -+ return -1; -+} -+ -+/** -+ * find_next_reservable_window(): -+ * find a reservable space within the given range. -+ * It does not allocate the reservation window for now: -+ * alloc_new_reservation() will do the work later. -+ * -+ * @search_head: the head of the searching list; -+ * This is not necessarily the list head of the whole filesystem -+ * -+ * We have both head and start_block to assist the search -+ * for the reservable space. The list starts from head, -+ * but we will shift to the place where start_block is, -+ * then start from there, when looking for a reservable space. -+ * -+ * @size: the target new reservation window size -+ * -+ * @group_first_block: the first block we consider to start -+ * the real search from -+ * -+ * @last_block: -+ * the maximum block number that our goal reservable space -+ * could start from. This is normally the last block in this -+ * group. The search will end when we found the start of next -+ * possible reservable space is out of this boundary. -+ * This could handle the cross boundary reservation window -+ * request. -+ * -+ * basically we search from the given range, rather than the whole -+ * reservation double linked list, (start_block, last_block) -+ * to find a free region that is of my size and has not -+ * been reserved. 
-+ * -+ */ -+static int find_next_reservable_window( -+ struct ext3cow_reserve_window_node *search_head, -+ struct ext3cow_reserve_window_node *my_rsv, -+ struct super_block * sb, -+ ext3cow_fsblk_t start_block, -+ ext3cow_fsblk_t last_block) -+{ -+ struct rb_node *next; -+ struct ext3cow_reserve_window_node *rsv, *prev; -+ ext3cow_fsblk_t cur; -+ int size = my_rsv->rsv_goal_size; -+ -+ /* TODO: make the start of the reservation window byte-aligned */ -+ /* cur = *start_block & ~7;*/ -+ cur = start_block; -+ rsv = search_head; -+ if (!rsv) -+ return -1; -+ -+ while (1) { -+ if (cur <= rsv->rsv_end) -+ cur = rsv->rsv_end + 1; -+ -+ /* TODO? -+ * in the case we could not find a reservable space -+ * that is what is expected, during the re-search, we could -+ * remember what's the largest reservable space we could have -+ * and return that one. -+ * -+ * For now it will fail if we could not find the reservable -+ * space with expected-size (or more)... -+ */ -+ if (cur > last_block) -+ return -1; /* fail */ -+ -+ prev = rsv; -+ next = rb_next(&rsv->rsv_node); -+ rsv = rb_entry(next,struct ext3cow_reserve_window_node,rsv_node); -+ -+ /* -+ * Reached the last reservation, we can just append to the -+ * previous one. -+ */ -+ if (!next) -+ break; -+ -+ if (cur + size <= rsv->rsv_start) { -+ /* -+ * Found a reserveable space big enough. We could -+ * have a reservation across the group boundary here -+ */ -+ break; -+ } -+ } -+ /* -+ * we come here either : -+ * when we reach the end of the whole list, -+ * and there is empty reservable space after last entry in the list. -+ * append it to the end of the list. -+ * -+ * or we found one reservable space in the middle of the list, -+ * return the reservation window that we could append to. -+ * succeed. -+ */ -+ -+ if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) -+ rsv_window_remove(sb, my_rsv); -+ -+ /* -+ * Let's book the whole avaliable window for now. We will check the -+ * disk bitmap later and then, if there are free blocks then we adjust -+ * the window size if it's larger than requested. -+ * Otherwise, we will remove this node from the tree next time -+ * call find_next_reservable_window. -+ */ -+ my_rsv->rsv_start = cur; -+ my_rsv->rsv_end = cur + size - 1; -+ my_rsv->rsv_alloc_hit = 0; -+ -+ if (prev != my_rsv) -+ ext3cow_rsv_window_add(sb, my_rsv); -+ -+ return 0; -+} -+ -+/** -+ * alloc_new_reservation()--allocate a new reservation window -+ * -+ * To make a new reservation, we search part of the filesystem -+ * reservation list (the list that inside the group). We try to -+ * allocate a new reservation window near the allocation goal, -+ * or the beginning of the group, if there is no goal. -+ * -+ * We first find a reservable space after the goal, then from -+ * there, we check the bitmap for the first free block after -+ * it. If there is no free block until the end of group, then the -+ * whole group is full, we failed. Otherwise, check if the free -+ * block is inside the expected reservable space, if so, we -+ * succeed. -+ * If the first free block is outside the reservable space, then -+ * start from the first free block, we search for next available -+ * space, and go on. -+ * -+ * on succeed, a new reservation will be found and inserted into the list -+ * It contains at least one free block, and it does not overlap with other -+ * reservation windows. -+ * -+ * failed: we failed to find a reservation window in this group -+ * -+ * @rsv: the reservation -+ * -+ * @grp_goal: The goal (group-relative). 
It is where the search for a -+ * free reservable space should start from. -+ * if we have a grp_goal(grp_goal >0 ), then start from there, -+ * no grp_goal(grp_goal = -1), we start from the first block -+ * of the group. -+ * -+ * @sb: the super block -+ * @group: the group we are trying to allocate in -+ * @bitmap_bh: the block group block bitmap -+ * -+ */ -+static int alloc_new_reservation(struct ext3cow_reserve_window_node *my_rsv, -+ ext3cow_grpblk_t grp_goal, struct super_block *sb, -+ unsigned int group, struct buffer_head *bitmap_bh) -+{ -+ struct ext3cow_reserve_window_node *search_head; -+ ext3cow_fsblk_t group_first_block, group_end_block, start_block; -+ ext3cow_grpblk_t first_free_block; -+ struct rb_root *fs_rsv_root = &EXT3COW_SB(sb)->s_rsv_window_root; -+ unsigned long size; -+ int ret; -+ spinlock_t *rsv_lock = &EXT3COW_SB(sb)->s_rsv_window_lock; -+ -+ group_first_block = ext3cow_group_first_block_no(sb, group); -+ group_end_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); -+ -+ if (grp_goal < 0) -+ start_block = group_first_block; -+ else -+ start_block = grp_goal + group_first_block; -+ -+ size = my_rsv->rsv_goal_size; -+ -+ if (!rsv_is_empty(&my_rsv->rsv_window)) { -+ /* -+ * if the old reservation is cross group boundary -+ * and if the goal is inside the old reservation window, -+ * we will come here when we just failed to allocate from -+ * the first part of the window. We still have another part -+ * that belongs to the next group. In this case, there is no -+ * point to discard our window and try to allocate a new one -+ * in this group(which will fail). we should -+ * keep the reservation window, just simply move on. -+ * -+ * Maybe we could shift the start block of the reservation -+ * window to the first block of next group. -+ */ -+ -+ if ((my_rsv->rsv_start <= group_end_block) && -+ (my_rsv->rsv_end > group_end_block) && -+ (start_block >= my_rsv->rsv_start)) -+ return -1; -+ -+ if ((my_rsv->rsv_alloc_hit > -+ (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { -+ /* -+ * if the previously allocation hit ratio is -+ * greater than 1/2, then we double the size of -+ * the reservation window the next time, -+ * otherwise we keep the same size window -+ */ -+ size = size * 2; -+ if (size > EXT3COW_MAX_RESERVE_BLOCKS) -+ size = EXT3COW_MAX_RESERVE_BLOCKS; -+ my_rsv->rsv_goal_size= size; -+ } -+ } -+ -+ spin_lock(rsv_lock); -+ /* -+ * shift the search start to the window near the goal block -+ */ -+ search_head = search_reserve_window(fs_rsv_root, start_block); -+ -+ /* -+ * find_next_reservable_window() simply finds a reservable window -+ * inside the given range(start_block, group_end_block). -+ * -+ * To make sure the reservation window has a free bit inside it, we -+ * need to check the bitmap after we found a reservable window. -+ */ -+retry: -+ ret = find_next_reservable_window(search_head, my_rsv, sb, -+ start_block, group_end_block); -+ -+ if (ret == -1) { -+ if (!rsv_is_empty(&my_rsv->rsv_window)) -+ rsv_window_remove(sb, my_rsv); -+ spin_unlock(rsv_lock); -+ return -1; -+ } -+ -+ /* -+ * On success, find_next_reservable_window() returns the -+ * reservation window where there is a reservable space after it. -+ * Before we reserve this reservable space, we need -+ * to make sure there is at least a free block inside this region. -+ * -+ * searching the first free bit on the block bitmap and copy of -+ * last committed bitmap alternatively, until we found a allocatable -+ * block. 
Search start from the start block of the reservable space -+ * we just found. -+ */ -+ spin_unlock(rsv_lock); -+ first_free_block = bitmap_search_next_usable_block( -+ my_rsv->rsv_start - group_first_block, -+ bitmap_bh, group_end_block - group_first_block + 1); -+ -+ if (first_free_block < 0) { -+ /* -+ * no free block left on the bitmap, no point -+ * to reserve the space. return failed. -+ */ -+ spin_lock(rsv_lock); -+ if (!rsv_is_empty(&my_rsv->rsv_window)) -+ rsv_window_remove(sb, my_rsv); -+ spin_unlock(rsv_lock); -+ return -1; /* failed */ -+ } -+ -+ start_block = first_free_block + group_first_block; -+ /* -+ * check if the first free block is within the -+ * free space we just reserved -+ */ -+ if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) -+ return 0; /* success */ -+ /* -+ * if the first free bit we found is out of the reservable space -+ * continue search for next reservable space, -+ * start from where the free block is, -+ * we also shift the list head to where we stopped last time -+ */ -+ search_head = my_rsv; -+ spin_lock(rsv_lock); -+ goto retry; -+} -+ -+/** -+ * try_to_extend_reservation() -+ * @my_rsv: given reservation window -+ * @sb: super block -+ * @size: the delta to extend -+ * -+ * Attempt to expand the reservation window large enough to have -+ * required number of free blocks -+ * -+ * Since ext3cow_try_to_allocate() will always allocate blocks within -+ * the reservation window range, if the window size is too small, -+ * multiple blocks allocation has to stop at the end of the reservation -+ * window. To make this more efficient, given the total number of -+ * blocks needed and the current size of the window, we try to -+ * expand the reservation window size if necessary on a best-effort -+ * basis before ext3cow_new_blocks() tries to allocate blocks, -+ */ -+static void try_to_extend_reservation(struct ext3cow_reserve_window_node *my_rsv, -+ struct super_block *sb, int size) -+{ -+ struct ext3cow_reserve_window_node *next_rsv; -+ struct rb_node *next; -+ spinlock_t *rsv_lock = &EXT3COW_SB(sb)->s_rsv_window_lock; -+ -+ if (!spin_trylock(rsv_lock)) -+ return; -+ -+ next = rb_next(&my_rsv->rsv_node); -+ -+ if (!next) -+ my_rsv->rsv_end += size; -+ else { -+ next_rsv = rb_entry(next, struct ext3cow_reserve_window_node, rsv_node); -+ -+ if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) -+ my_rsv->rsv_end += size; -+ else -+ my_rsv->rsv_end = next_rsv->rsv_start - 1; -+ } -+ spin_unlock(rsv_lock); -+} -+ -+/** -+ * ext3cow_try_to_allocate_with_rsv() -+ * @sb: superblock -+ * @handle: handle to this transaction -+ * @group: given allocation block group -+ * @bitmap_bh: bufferhead holds the block bitmap -+ * @grp_goal: given target block within the group -+ * @count: target number of blocks to allocate -+ * @my_rsv: reservation window -+ * @errp: pointer to store the error code -+ * -+ * This is the main function used to allocate a new block and its reservation -+ * window. -+ * -+ * Each time when a new block allocation is need, first try to allocate from -+ * its own reservation. If it does not have a reservation window, instead of -+ * looking for a free bit on bitmap first, then look up the reservation list to -+ * see if it is inside somebody else's reservation window, we try to allocate a -+ * reservation window for it starting from the goal first. Then do the block -+ * allocation within the reservation window. 
-+ * -+ * This will avoid keeping on searching the reservation list again and -+ * again when somebody is looking for a free block (without -+ * reservation), and there are lots of free blocks, but they are all -+ * being reserved. -+ * -+ * We use a red-black tree for the per-filesystem reservation list. -+ * -+ */ -+static ext3cow_grpblk_t -+ext3cow_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, -+ unsigned int group, struct buffer_head *bitmap_bh, -+ ext3cow_grpblk_t grp_goal, -+ struct ext3cow_reserve_window_node * my_rsv, -+ unsigned long *count, int *errp) -+{ -+ ext3cow_fsblk_t group_first_block, group_last_block; -+ ext3cow_grpblk_t ret = 0; -+ int fatal; -+ unsigned long num = *count; -+ -+ *errp = 0; -+ -+ /* -+ * Make sure we use undo access for the bitmap, because it is critical -+ * that we do the frozen_data COW on bitmap buffers in all cases even -+ * if the buffer is in BJ_Forget state in the committing transaction. -+ */ -+ BUFFER_TRACE(bitmap_bh, "get undo access for new block"); -+ fatal = ext3cow_journal_get_undo_access(handle, bitmap_bh); -+ if (fatal) { -+ *errp = fatal; -+ return -1; -+ } -+ -+ /* -+ * we don't deal with reservation when -+ * filesystem is mounted without reservation -+ * or the file is not a regular file -+ * or last attempt to allocate a block with reservation turned on failed -+ */ -+ if (my_rsv == NULL ) { -+ ret = ext3cow_try_to_allocate(sb, handle, group, bitmap_bh, -+ grp_goal, count, NULL); -+ goto out; -+ } -+ /* -+ * grp_goal is a group relative block number (if there is a goal) -+ * 0 <= grp_goal < EXT3COW_BLOCKS_PER_GROUP(sb) -+ * first block is a filesystem wide block number -+ * first block is the block number of the first block in this group -+ */ -+ group_first_block = ext3cow_group_first_block_no(sb, group); -+ group_last_block = group_first_block + (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); -+ -+ /* -+ * Basically we will allocate a new block from inode's reservation -+ * window. -+ * -+ * We need to allocate a new reservation window, if: -+ * a) inode does not have a reservation window; or -+ * b) last attempt to allocate a block from existing reservation -+ * failed; or -+ * c) we come here with a goal and with a reservation window -+ * -+ * We do not need to allocate a new reservation window if we come here -+ * at the beginning with a goal and the goal is inside the window, or -+ * we don't have a goal but already have a reservation window. -+ * then we could go to allocate from the reservation window directly. 
-+ */ -+ while (1) { -+ if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || -+ !goal_in_my_reservation(&my_rsv->rsv_window, -+ grp_goal, group, sb)) { -+ if (my_rsv->rsv_goal_size < *count) -+ my_rsv->rsv_goal_size = *count; -+ ret = alloc_new_reservation(my_rsv, grp_goal, sb, -+ group, bitmap_bh); -+ if (ret < 0) -+ break; /* failed */ -+ -+ if (!goal_in_my_reservation(&my_rsv->rsv_window, -+ grp_goal, group, sb)) -+ grp_goal = -1; -+ } else if (grp_goal >= 0) { -+ int curr = my_rsv->rsv_end - -+ (grp_goal + group_first_block) + 1; -+ -+ if (curr < *count) -+ try_to_extend_reservation(my_rsv, sb, -+ *count - curr); -+ } -+ -+ if ((my_rsv->rsv_start > group_last_block) || -+ (my_rsv->rsv_end < group_first_block)) { -+ rsv_window_dump(&EXT3COW_SB(sb)->s_rsv_window_root, 1); -+ BUG(); -+ } -+ ret = ext3cow_try_to_allocate(sb, handle, group, bitmap_bh, -+ grp_goal, &num, &my_rsv->rsv_window); -+ if (ret >= 0) { -+ my_rsv->rsv_alloc_hit += num; -+ *count = num; -+ break; /* succeed */ -+ } -+ num = *count; -+ } -+out: -+ if (ret >= 0) { -+ BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " -+ "bitmap block"); -+ fatal = ext3cow_journal_dirty_metadata(handle, bitmap_bh); -+ if (fatal) { -+ *errp = fatal; -+ return -1; -+ } -+ return ret; -+ } -+ -+ BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); -+ ext3cow_journal_release_buffer(handle, bitmap_bh); -+ return ret; -+} -+ -+/** -+ * ext3cow_has_free_blocks() -+ * @sbi: in-core super block structure. -+ * -+ * Check if filesystem has at least 1 free block available for allocation. -+ */ -+static int ext3cow_has_free_blocks(struct ext3cow_sb_info *sbi) -+{ -+ ext3cow_fsblk_t free_blocks, root_blocks; -+ -+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); -+ if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && -+ sbi->s_resuid != current->fsuid && -+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { -+ return 0; -+ } -+ return 1; -+} -+ -+/** -+ * ext3cow_should_retry_alloc() -+ * @sb: super block -+ * @retries number of attemps has been made -+ * -+ * ext3cow_should_retry_alloc() is called when ENOSPC is returned, and if -+ * it is profitable to retry the operation, this function will wait -+ * for the current or commiting transaction to complete, and then -+ * return TRUE. -+ * -+ * if the total number of retries exceed three times, return FALSE. -+ */ -+int ext3cow_should_retry_alloc(struct super_block *sb, int *retries) -+{ -+ if (!ext3cow_has_free_blocks(EXT3COW_SB(sb)) || (*retries)++ > 3) -+ return 0; -+ -+ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); -+ -+ return journal_force_commit_nested(EXT3COW_SB(sb)->s_journal); -+} -+ -+/** -+ * ext3cow_new_blocks() -- core block(s) allocation function -+ * @handle: handle to this transaction -+ * @inode: file inode -+ * @goal: given target block(filesystem wide) -+ * @count: target number of blocks to allocate -+ * @errp: error code -+ * -+ * ext3cow_new_blocks uses a goal block to assist allocation. It tries to -+ * allocate block(s) from the block group contains the goal block first. If that -+ * fails, it will try to allocate block(s) from other block groups without -+ * any specific goal block. 
-+ * -+ */ -+ext3cow_fsblk_t ext3cow_new_blocks(handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t goal, unsigned long *count, int *errp) -+{ -+ struct buffer_head *bitmap_bh = NULL; -+ struct buffer_head *gdp_bh; -+ int group_no; -+ int goal_group; -+ ext3cow_grpblk_t grp_target_blk; /* blockgroup relative goal block */ -+ ext3cow_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ -+ ext3cow_fsblk_t ret_block; /* filesyetem-wide allocated block */ -+ int bgi; /* blockgroup iteration index */ -+ int fatal = 0, err; -+ int performed_allocation = 0; -+ ext3cow_grpblk_t free_blocks; /* number of free blocks in a group */ -+ struct super_block *sb; -+ struct ext3cow_group_desc *gdp; -+ struct ext3cow_super_block *es; -+ struct ext3cow_sb_info *sbi; -+ struct ext3cow_reserve_window_node *my_rsv = NULL; -+ struct ext3cow_block_alloc_info *block_i; -+ unsigned short windowsz = 0; -+#ifdef EXT3COWFS_DEBUG -+ static int goal_hits, goal_attempts; -+#endif -+ unsigned long ngroups; -+ unsigned long num = *count; -+ -+ *errp = -ENOSPC; -+ sb = inode->i_sb; -+ if (!sb) { -+ printk("ext3cow_new_block: nonexistent device"); -+ return 0; -+ } -+ -+ /* -+ * Check quota for allocation of this block. -+ */ -+ if (DQUOT_ALLOC_BLOCK(inode, num)) { -+ *errp = -EDQUOT; -+ return 0; -+ } -+ -+ sbi = EXT3COW_SB(sb); -+ es = EXT3COW_SB(sb)->s_es; -+ ext3cow_debug("goal=%lu.\n", goal); -+ /* -+ * Allocate a block from reservation only when -+ * filesystem is mounted with reservation(default,-o reservation), and -+ * it's a regular file, and -+ * the desired window size is greater than 0 (One could use ioctl -+ * command EXT3COW_IOC_SETRSVSZ to set the window size to 0 to turn off -+ * reservation on that particular file) -+ */ -+ block_i = EXT3COW_I(inode)->i_block_alloc_info; -+ if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) -+ my_rsv = &block_i->rsv_window_node; -+ -+ if (!ext3cow_has_free_blocks(sbi)) { -+ *errp = -ENOSPC; -+ goto out; -+ } -+ -+ /* -+ * First, test whether the goal block is free. -+ */ -+ if (goal < le32_to_cpu(es->s_first_data_block) || -+ goal >= le32_to_cpu(es->s_blocks_count)) -+ goal = le32_to_cpu(es->s_first_data_block); -+ group_no = (goal - le32_to_cpu(es->s_first_data_block)) / -+ EXT3COW_BLOCKS_PER_GROUP(sb); -+ goal_group = group_no; -+retry_alloc: -+ gdp = ext3cow_get_group_desc(sb, group_no, &gdp_bh); -+ if (!gdp) -+ goto io_error; -+ -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ /* -+ * if there is not enough free blocks to make a new resevation -+ * turn off reservation for this allocation -+ */ -+ if (my_rsv && (free_blocks < windowsz) -+ && (rsv_is_empty(&my_rsv->rsv_window))) -+ my_rsv = NULL; -+ -+ if (free_blocks > 0) { -+ grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % -+ EXT3COW_BLOCKS_PER_GROUP(sb)); -+ bitmap_bh = read_block_bitmap(sb, group_no); -+ if (!bitmap_bh) -+ goto io_error; -+ grp_alloc_blk = ext3cow_try_to_allocate_with_rsv(sb, handle, -+ group_no, bitmap_bh, grp_target_blk, -+ my_rsv, &num, &fatal); -+ if (fatal) -+ goto out; -+ if (grp_alloc_blk >= 0) -+ goto allocated; -+ } -+ -+ ngroups = EXT3COW_SB(sb)->s_groups_count; -+ smp_rmb(); -+ -+ /* -+ * Now search the rest of the groups. We assume that -+ * i and gdp correctly point to the last group visited. 
-+ */ -+ for (bgi = 0; bgi < ngroups; bgi++) { -+ group_no++; -+ if (group_no >= ngroups) -+ group_no = 0; -+ gdp = ext3cow_get_group_desc(sb, group_no, &gdp_bh); -+ if (!gdp) -+ goto io_error; -+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -+ /* -+ * skip this group if the number of -+ * free blocks is less than half of the reservation -+ * window size. -+ */ -+ if (free_blocks <= (windowsz/2)) -+ continue; -+ -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, group_no); -+ if (!bitmap_bh) -+ goto io_error; -+ /* -+ * try to allocate block(s) from this group, without a goal(-1). -+ */ -+ grp_alloc_blk = ext3cow_try_to_allocate_with_rsv(sb, handle, -+ group_no, bitmap_bh, -1, my_rsv, -+ &num, &fatal); -+ if (fatal) -+ goto out; -+ if (grp_alloc_blk >= 0) -+ goto allocated; -+ } -+ /* -+ * We may end up a bogus ealier ENOSPC error due to -+ * filesystem is "full" of reservations, but -+ * there maybe indeed free blocks avaliable on disk -+ * In this case, we just forget about the reservations -+ * just do block allocation as without reservations. -+ */ -+ if (my_rsv) { -+ my_rsv = NULL; -+ windowsz = 0; -+ group_no = goal_group; -+ goto retry_alloc; -+ } -+ /* No space left on the device */ -+ *errp = -ENOSPC; -+ goto out; -+ -+allocated: -+ -+ ext3cow_debug("using block group %d(%d)\n", -+ group_no, gdp->bg_free_blocks_count); -+ -+ BUFFER_TRACE(gdp_bh, "get_write_access"); -+ fatal = ext3cow_journal_get_write_access(handle, gdp_bh); -+ if (fatal) -+ goto out; -+ -+ ret_block = grp_alloc_blk + ext3cow_group_first_block_no(sb, group_no); -+ -+ if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || -+ in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || -+ in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), -+ EXT3COW_SB(sb)->s_itb_per_group) || -+ in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), -+ EXT3COW_SB(sb)->s_itb_per_group)) -+ ext3cow_error(sb, "ext3cow_new_block", -+ "Allocating block in system zone - " -+ "blocks from "E3FSBLK", length %lu", -+ ret_block, num); -+ -+ performed_allocation = 1; -+ -+#ifdef CONFIG_JBD_DEBUG -+ { -+ struct buffer_head *debug_bh; -+ -+ /* Record bitmap buffer state in the newly allocated block */ -+ debug_bh = sb_find_get_block(sb, ret_block); -+ if (debug_bh) { -+ BUFFER_TRACE(debug_bh, "state when allocated"); -+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); -+ brelse(debug_bh); -+ } -+ } -+ jbd_lock_bh_state(bitmap_bh); -+ spin_lock(sb_bgl_lock(sbi, group_no)); -+ if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { -+ int i; -+ -+ for (i = 0; i < num; i++) { -+ if (ext3cow_test_bit(grp_alloc_blk+i, -+ bh2jh(bitmap_bh)->b_committed_data)) { -+ printk("%s: block was unexpectedly set in " -+ "b_committed_data\n", __FUNCTION__); -+ } -+ } -+ } -+ ext3cow_debug("found bit %d\n", grp_alloc_blk); -+ spin_unlock(sb_bgl_lock(sbi, group_no)); -+ jbd_unlock_bh_state(bitmap_bh); -+#endif -+ -+ if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { -+ ext3cow_error(sb, "ext3cow_new_block", -+ "block("E3FSBLK") >= blocks count(%d) - " -+ "block_group = %d, es == %p ", ret_block, -+ le32_to_cpu(es->s_blocks_count), group_no, es); -+ goto out; -+ } -+ -+ /* -+ * It is up to the caller to add the new buffer to a journal -+ * list of some description. We don't know in advance whether -+ * the caller wants to use it as metadata or data. -+ */ -+ ext3cow_debug("allocating block %lu. 
Goal hits %d of %d.\n", -+ ret_block, goal_hits, goal_attempts); -+ -+ spin_lock(sb_bgl_lock(sbi, group_no)); -+ gdp->bg_free_blocks_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); -+ spin_unlock(sb_bgl_lock(sbi, group_no)); -+ percpu_counter_mod(&sbi->s_freeblocks_counter, -num); -+ -+ BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); -+ err = ext3cow_journal_dirty_metadata(handle, gdp_bh); -+ if (!fatal) -+ fatal = err; -+ -+ sb->s_dirt = 1; -+ if (fatal) -+ goto out; -+ -+ *errp = 0; -+ brelse(bitmap_bh); -+ DQUOT_FREE_BLOCK(inode, *count-num); -+ *count = num; -+ return ret_block; -+ -+io_error: -+ *errp = -EIO; -+out: -+ if (fatal) { -+ *errp = fatal; -+ ext3cow_std_error(sb, fatal); -+ } -+ /* -+ * Undo the block allocation -+ */ -+ if (!performed_allocation) -+ DQUOT_FREE_BLOCK(inode, *count); -+ brelse(bitmap_bh); -+ return 0; -+} -+ -+ext3cow_fsblk_t ext3cow_new_block(handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t goal, int *errp) -+{ -+ unsigned long count = 1; -+ -+ return ext3cow_new_blocks(handle, inode, goal, &count, errp); -+} -+ -+/** -+ * ext3cow_count_free_blocks() -- count filesystem free blocks -+ * @sb: superblock -+ * -+ * Adds up the number of free blocks from each block group. -+ */ -+ext3cow_fsblk_t ext3cow_count_free_blocks(struct super_block *sb) -+{ -+ ext3cow_fsblk_t desc_count; -+ struct ext3cow_group_desc *gdp; -+ int i; -+ unsigned long ngroups = EXT3COW_SB(sb)->s_groups_count; -+#ifdef EXT3COWFS_DEBUG -+ struct ext3cow_super_block *es; -+ ext3cow_fsblk_t bitmap_count; -+ unsigned long x; -+ struct buffer_head *bitmap_bh = NULL; -+ -+ es = EXT3COW_SB(sb)->s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ -+ smp_rmb(); -+ for (i = 0; i < ngroups; i++) { -+ gdp = ext3cow_get_group_desc(sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ brelse(bitmap_bh); -+ bitmap_bh = read_block_bitmap(sb, i); -+ if (bitmap_bh == NULL) -+ continue; -+ -+ x = ext3cow_count_free(bitmap_bh, sb->s_blocksize); -+ printk("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_blocks_count), x); -+ bitmap_count += x; -+ } -+ brelse(bitmap_bh); -+ printk("ext3cow_count_free_blocks: stored = "E3FSBLK -+ ", computed = "E3FSBLK", "E3FSBLK"\n", -+ le32_to_cpu(es->s_free_blocks_count), -+ desc_count, bitmap_count); -+ return bitmap_count; -+#else -+ desc_count = 0; -+ smp_rmb(); -+ for (i = 0; i < ngroups; i++) { -+ gdp = ext3cow_get_group_desc(sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count); -+ } -+ -+ return desc_count; -+#endif -+} -+ -+static inline int -+block_in_use(ext3cow_fsblk_t block, struct super_block *sb, unsigned char *map) -+{ -+ return ext3cow_test_bit ((block - -+ le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block)) % -+ EXT3COW_BLOCKS_PER_GROUP(sb), map); -+} -+ -+static inline int test_root(int a, int b) -+{ -+ int num = b; -+ -+ while (a > num) -+ num *= b; -+ return num == a; -+} -+ -+static int ext3cow_group_sparse(int group) -+{ -+ if (group <= 1) -+ return 1; -+ if (!(group & 1)) -+ return 0; -+ return (test_root(group, 7) || test_root(group, 5) || -+ test_root(group, 3)); -+} -+ -+/** -+ * ext3cow_bg_has_super - number of blocks used by the superblock in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the superblock (primary or backup) -+ * in this group. Currently this will be only 0 or 1. 
-+ */ -+int ext3cow_bg_has_super(struct super_block *sb, int group) -+{ -+ if (EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER) && -+ !ext3cow_group_sparse(group)) -+ return 0; -+ return 1; -+} -+ -+static unsigned long ext3cow_bg_num_gdb_meta(struct super_block *sb, int group) -+{ -+ unsigned long metagroup = group / EXT3COW_DESC_PER_BLOCK(sb); -+ unsigned long first = metagroup * EXT3COW_DESC_PER_BLOCK(sb); -+ unsigned long last = first + EXT3COW_DESC_PER_BLOCK(sb) - 1; -+ -+ if (group == first || group == first + 1 || group == last) -+ return 1; -+ return 0; -+} -+ -+static unsigned long ext3cow_bg_num_gdb_nometa(struct super_block *sb, int group) -+{ -+ if (EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER) && -+ !ext3cow_group_sparse(group)) -+ return 0; -+ return EXT3COW_SB(sb)->s_gdb_count; -+} -+ -+/** -+ * ext3cow_bg_num_gdb - number of blocks used by the group table in group -+ * @sb: superblock for filesystem -+ * @group: group number to check -+ * -+ * Return the number of blocks used by the group descriptor table -+ * (primary or backup) in this group. In the future there may be a -+ * different number of descriptor blocks in each group. -+ */ -+unsigned long ext3cow_bg_num_gdb(struct super_block *sb, int group) -+{ -+ unsigned long first_meta_bg = -+ le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_meta_bg); -+ unsigned long metagroup = group / EXT3COW_DESC_PER_BLOCK(sb); -+ -+ if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb,EXT3COW_FEATURE_INCOMPAT_META_BG) || -+ metagroup < first_meta_bg) -+ return ext3cow_bg_num_gdb_nometa(sb,group); -+ -+ return ext3cow_bg_num_gdb_meta(sb,group); -+ -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/bitmap.c linux-2.6.20.3-ext3cow/fs/ext3cow/bitmap.c ---- linux-2.6.20.3/fs/ext3cow/bitmap.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/bitmap.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,32 @@ -+/* -+ * linux/fs/ext3/bitmap.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+#include -+#include -+ -+#ifdef EXT3COWFS_DEBUG -+ -+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -+ -+unsigned long ext3cow_count_free (struct buffer_head * map, unsigned int numchars) -+{ -+ unsigned int i; -+ unsigned long sum = 0; -+ -+ if (!map) -+ return (0); -+ for (i = 0; i < numchars; i++) -+ sum += nibblemap[map->b_data[i] & 0xf] + -+ nibblemap[(map->b_data[i] >> 4) & 0xf]; -+ return (sum); -+} -+ -+#endif /* EXT3COWFS_DEBUG */ -+ -diff -ruN linux-2.6.20.3/fs/ext3cow/CHANGELOG linux-2.6.20.3-ext3cow/fs/ext3cow/CHANGELOG ---- linux-2.6.20.3/fs/ext3cow/CHANGELOG 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/CHANGELOG 2008-03-09 11:27:12.000000000 -0400 -@@ -0,0 +1,12 @@ -+3-9-08 -+- Fixed a bug that resulted in the first block in a newly allocated indirect block to be allocated over and over again. -+- Fixed a bug that resulted in COW bitmaps not to be reset after truncate. -+- Bug e2fsprogs that caused aborting journal fixed. -+ -+6-20-97 -+- Finished the rollback code for inode chains in case of error. -+ -+6-18-07 -+- Added support for 32-bit uid's and gid's back in again -+- Took out support for block fragmentation -+- Hopefully fixed the non-sticking uid/gid bug. 
-\ No newline at end of file -diff -ruN linux-2.6.20.3/fs/ext3cow/dir.c linux-2.6.20.3-ext3cow/fs/ext3cow/dir.c ---- linux-2.6.20.3/fs/ext3cow/dir.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/dir.c 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,732 @@ -+/* -+ * linux/fs/ext3cow/dir.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/dir.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3cow directory handling functions -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * -+ * Hash Tree Directory indexing (c) 2001 Daniel Phillips -+ * -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static unsigned char ext3cow_filetype_table[] = { -+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -+}; -+ -+static int ext3cow_readdir(struct file *, void *, filldir_t); -+static int ext3cow_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir); -+static int ext3cow_release_dir (struct inode * inode, -+ struct file * filp); -+ -+const struct file_operations ext3cow_dir_operations = { -+ .llseek = generic_file_llseek, -+ .read = generic_read_dir, -+ .readdir = ext3cow_readdir, /* we take BKL. needed?*/ -+ .ioctl = ext3cow_ioctl, /* BKL held */ -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = ext3cow_compat_ioctl, -+#endif -+ .fsync = ext3cow_sync_file, /* BKL held */ -+#ifdef CONFIG_EXT3COW_INDEX -+ .release = ext3cow_release_dir, -+#endif -+}; -+ -+ -+static unsigned char get_dtype(struct super_block *sb, int filetype) -+{ -+ if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_FILETYPE) || -+ (filetype >= EXT3COW_FT_MAX)) -+ return DT_UNKNOWN; -+ -+ return (ext3cow_filetype_table[filetype]); -+} -+ -+static int ext3cow_readversions(struct file * filp, void * dirent, -+ filldir_t filldir) -+{ -+ int error = 0; -+ unsigned long offset; -+ int i, stored; -+ struct buffer_head *bh; -+ struct ext3cow_dir_entry_2 * de; -+ struct super_block * sb; -+ int err; -+ struct inode *dir = filp->f_dentry->d_inode; -+ char *at; -+ unsigned long ino; -+ int ref_len = filp->f_dentry->d_name.len -1; -+ -+ sb = dir->i_sb; -+ -+ stored = 0; -+ bh = NULL; -+ offset = filp->f_pos & (sb->s_blocksize - 1); -+ -+ at = strrchr(filp->f_dentry->d_name.name, EXT3COW_FLUX_TOKEN); -+ -+ while (!error && !stored && filp->f_pos < dir->i_size) { -+ unsigned long blk = (filp->f_pos) >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ struct buffer_head map_bh; -+ -+ bh = NULL; -+ map_bh.b_state = 0; -+ err = ext3cow_get_blocks_handle(NULL, dir, blk, 1, -+ &map_bh, 0, 0); -+ if (err > 0) { -+ page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, -+ &filp->f_ra, -+ filp, -+ map_bh.b_blocknr >> -+ (PAGE_CACHE_SHIFT - dir->i_blkbits), -+ 1); -+ bh = ext3cow_bread(NULL, dir, blk, 0, &err); -+ } -+ -+ /* -+ * We ignore I/O errors on directories so users have a chance -+ * of recovering data when there's a bad sector -+ */ -+ if (!bh) { -+ ext3cow_error (sb, "ext3cow_versions", -+ "directory #%lu contains a hole at offset %lu", -+ dir->i_ino, (unsigned long)filp->f_pos); -+ /* corrupt size? 
Maybe no more blocks to read */ -+ if (filp->f_pos > dir->i_blocks << 9) -+ break; -+ filp->f_pos += sb->s_blocksize - offset; -+ continue; -+ } -+ -+ ver_revalidate: -+ /* If the dir block has changed since the last call to -+ * readdir(2), then we might be pointing to an invalid -+ * dirent right now. Scan from the start of the block -+ * to make sure. */ -+ if (filp->f_version != dir->i_version) { -+ for (i = 0; i < sb->s_blocksize && i < offset; ) { -+ de = (struct ext3cow_dir_entry_2 *) -+ (bh->b_data + i); -+ /* It's too expensive to do a full -+ * dirent test each time round this -+ * loop, but we do have to test at -+ * least that it is non-zero. A -+ * failure will be detected in the -+ * dirent test below. */ -+ if (le16_to_cpu(de->rec_len) < -+ EXT3COW_DIR_REC_LEN(1)) -+ break; -+ i += le16_to_cpu(de->rec_len); -+ } -+ offset = i; -+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) -+ | offset; -+ filp->f_version = dir->i_version; -+ } -+ -+ while (!error && filp->f_pos < dir->i_size -+ && offset < sb->s_blocksize) { -+ de = (struct ext3cow_dir_entry_2 *) (bh->b_data + offset); -+ if (!ext3cow_check_dir_entry ("ext3cow_readversions", dir, de, -+ bh, offset)) { -+ /* On error, skip the f_pos to the -+ next block. */ -+ filp->f_pos = (filp->f_pos | -+ (sb->s_blocksize - 1)) + 1; -+ brelse (bh); -+ return stored; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ -+ if (le32_to_cpu(de->inode)){ -+ unsigned long version = filp->f_version; -+ unsigned char d_type = DT_UNKNOWN; -+ -+ /* We might block in the next section -+ * if the data destination is -+ * currently swapped out. So, use a -+ * version stamp to detect whether or -+ * not the directory has been modified -+ * during the copy operation. -+ */ -+ -+ if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_INCOMPAT_FILETYPE) -+ && de->file_type < EXT3COW_FT_MAX) -+ d_type = -+ ext3cow_filetype_table[de->file_type]; -+ if (de->name_len == ref_len -+ && strncmp(filp->f_dentry->d_name.name, de->name, ref_len)==0) { -+ -+ struct inode * inde; -+ char * name; -+ -+ name = kmalloc(EXT3COW_NAME_LEN, GFP_KERNEL); -+ strncpy(name, de->name, de->name_len); -+ inde = iget(dir->i_sb, de->inode); -+ -+ if (de->death_epoch!=0 && de->birth_epoch!=de->death_epoch) { -+ name[de->name_len]='\0'; -+ sprintf(name,"%s@%d",name, de->death_epoch); -+ error = filldir(dirent, name, -+ strlen(name), -+ filp->f_pos, -+ le32_to_cpu(inde->i_ino), -+ d_type); -+ stored++; -+ } -+ -+ while (EXT3COW_I(inde)->i_next_inode!=0) { -+ name[de->name_len]='\0'; -+ sprintf(name,"%s@%d",name, EXT3COW_I_EPOCHNUMBER(inde)); -+ error = filldir(dirent, name, -+ strlen(name), -+ filp->f_pos, -+ le32_to_cpu(inde->i_ino), -+ d_type); -+ ino = EXT3COW_I(inde)->i_next_inode; -+ iput(inde); -+ inde = iget(dir->i_sb, ino); -+ stored++; -+ } -+ -+ kfree(name); -+ iput(inde); -+ -+ if (error) -+ break; -+ -+ if (!stored && -+ EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(dir))) { -+ error = filldir(dirent, de->name, -+ de->name_len, -+ filp->f_pos, -+ le32_to_cpu(de->inode), -+ d_type); -+ } -+ -+ if (error) -+ break; -+ if (version != filp->f_version) -+ goto ver_revalidate; -+ stored ++; -+ } -+ } -+ -+ filp->f_pos += le16_to_cpu(de->rec_len); -+ } -+ offset = 0; -+ brelse (bh); -+ } -+ return 0; -+} -+ -+ -+int ext3cow_check_dir_entry (const char * function, struct inode * dir, -+ struct ext3cow_dir_entry_2 * de, -+ struct buffer_head * bh, -+ unsigned long offset) -+{ -+ const char * error_msg = NULL; -+ const int rlen = le16_to_cpu(de->rec_len); -+ unsigned int 
current_epoch = EXT3COW_S_EPOCHNUMBER(dir->i_sb); -+ -+ if (rlen < EXT3COW_DIR_REC_LEN(1)) -+ error_msg = "rec_len is smaller than minimal"; -+ else if (rlen % 4 != 0) -+ error_msg = "rec_len % 4 != 0"; -+ else if (rlen < EXT3COW_DIR_REC_LEN(de->name_len)) -+ error_msg = "rec_len is too small for name_len"; -+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) -+ error_msg = "directory entry across blocks"; -+ else if (le32_to_cpu(de->inode) > -+ le32_to_cpu(EXT3COW_SB(dir->i_sb)->s_es->s_inodes_count)) -+ error_msg = "inode out of bounds"; -+ /* Some bounds on versioned entries -znjp*/ -+ else if (le32_to_cpu(de->death_epoch) != EXT3COW_DIRENT_ALIVE && -+ le32_to_cpu(de->birth_epoch) > le32_to_cpu(de->death_epoch)) -+ error_msg = "entry died before it was born"; -+ else if (le32_to_cpu(de->birth_epoch) > current_epoch) -+ error_msg = "entry was born in the future"; -+ else if (le32_to_cpu(de->death_epoch) > current_epoch) -+ error_msg = "entry has already died in the future"; -+ -+ if (error_msg != NULL) -+ ext3cow_error (dir->i_sb, function, -+ "bad entry in directory #%lu: %s - " -+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d, " -+ "birth_epoch=%d death_epoch=%d", -+ dir->i_ino, error_msg, offset, -+ (unsigned long) le32_to_cpu(de->inode), -+ rlen, de->name_len, de->birth_epoch, de->death_epoch); -+ return error_msg == NULL ? 1 : 0; -+} -+ -+static int ext3cow_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ int error = 0; -+ unsigned long offset; -+ int i, stored; -+ struct ext3cow_dir_entry_2 *de; -+ struct super_block *sb; -+ int err; -+ struct inode *inode = filp->f_path.dentry->d_inode; -+ int ret = 0; -+ -+ /* is this a version listing? */ -+ if (filp->f_dentry->d_name.name[filp->f_dentry->d_name.len-1] == -+ EXT3COW_FLUX_TOKEN) -+ return ext3cow_readversions(filp, dirent, filldir); -+ -+ sb = inode->i_sb; -+ -+#ifdef CONFIG_EXT3COW_INDEX -+ if (EXT3COW_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3COW_FEATURE_COMPAT_DIR_INDEX) && -+ ((EXT3COW_I(inode)->i_flags & EXT3COW_INDEX_FL) || -+ ((inode->i_size >> sb->s_blocksize_bits) == 1))) { -+ -+ err = ext3cow_dx_readdir(filp, dirent, filldir); -+ if (err != ERR_BAD_DX_DIR) { -+ ret = err; -+ goto out; -+ } -+ /* -+ * We don't set the inode dirty flag since it's not -+ * critical that it get flushed back to the disk. -+ */ -+ EXT3COW_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3COW_INDEX_FL; -+ } -+#endif -+ stored = 0; -+ offset = filp->f_pos & (sb->s_blocksize - 1); -+ -+ while (!error && !stored && filp->f_pos < inode->i_size) { -+ unsigned long blk = filp->f_pos >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ struct buffer_head map_bh; -+ struct buffer_head *bh = NULL; -+ -+ map_bh.b_state = 0; -+ err = ext3cow_get_blocks_handle(NULL, inode, blk, 1, -+ &map_bh, 0, 0); -+ if (err > 0) { -+ page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, -+ &filp->f_ra, -+ filp, -+ map_bh.b_blocknr >> -+ (PAGE_CACHE_SHIFT - inode->i_blkbits), -+ 1); -+ bh = ext3cow_bread(NULL, inode, blk, 0, &err); -+ } -+ -+ /* -+ * We ignore I/O errors on directories so users have a chance -+ * of recovering data when there's a bad sector -+ */ -+ if (!bh) { -+ ext3cow_error (sb, "ext3cow_readdir", -+ "directory #%lu contains a hole at offset %lu", -+ inode->i_ino, (unsigned long)filp->f_pos); -+ /* corrupt size? 
Maybe no more blocks to read */ -+ if (filp->f_pos > inode->i_blocks << 9) -+ break; -+ filp->f_pos += sb->s_blocksize - offset; -+ continue; -+ } -+ -+revalidate: -+ /* If the dir block has changed since the last call to -+ * readdir(2), then we might be pointing to an invalid -+ * dirent right now. Scan from the start of the block -+ * to make sure. */ -+ if (filp->f_version != inode->i_version) { -+ for (i = 0; i < sb->s_blocksize && i < offset; ) { -+ de = (struct ext3cow_dir_entry_2 *) -+ (bh->b_data + i); -+ /* It's too expensive to do a full -+ * dirent test each time round this -+ * loop, but we do have to test at -+ * least that it is non-zero. A -+ * failure will be detected in the -+ * dirent test below. */ -+ if (le16_to_cpu(de->rec_len) < -+ EXT3COW_DIR_REC_LEN(1)) -+ break; -+ i += le16_to_cpu(de->rec_len); -+ } -+ offset = i; -+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) -+ | offset; -+ filp->f_version = inode->i_version; -+ } -+ -+ while (!error && filp->f_pos < inode->i_size -+ && offset < sb->s_blocksize) { -+ de = (struct ext3cow_dir_entry_2 *) (bh->b_data + offset); -+ if (!ext3cow_check_dir_entry ("ext3cow_readdir", inode, de, -+ bh, offset)) { -+ /* On error, skip the f_pos to the -+ next block. */ -+ filp->f_pos = (filp->f_pos | -+ (sb->s_blocksize - 1)) + 1; -+ brelse (bh); -+ ret = stored; -+ goto out; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ /* -+ printk("Inode %ld Epoch number %u: is -+ dir %d -> %s be %d de %d scoped? %d\n", -+ dir->i_ino, -+ EXT3COW_I_EPOCHNUMBER(dir), -+ de->inode, -+ de->name, -+ de->birth_epoch, -+ de->death_epoch, -+ EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(dir))); -+ */ -+ -+ /* Only add scoped dirents - znjp */ -+ if (le32_to_cpu(de->inode) && -+ EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(inode))) { -+ /* We might block in the next section -+ * if the data destination is -+ * currently swapped out. So, use a -+ * version stamp to detect whether or -+ * not the directory has been modified -+ * during the copy operation. -+ */ -+ unsigned long version = filp->f_version; -+ -+ error = filldir(dirent, de->name, -+ de->name_len, -+ filp->f_pos, -+ le32_to_cpu(de->inode), -+ get_dtype(sb, de->file_type)); -+ if (error) -+ break; -+ if (version != filp->f_version) -+ goto revalidate; -+ stored ++; -+ } -+ filp->f_pos += le16_to_cpu(de->rec_len); -+ } -+ offset = 0; -+ brelse (bh); -+ } -+out: -+ return ret; -+} -+ -+#ifdef CONFIG_EXT3COW_INDEX -+/* -+ * These functions convert from the major/minor hash to an f_pos -+ * value. -+ * -+ * Currently we only use major hash numer. This is unfortunate, but -+ * on 32-bit machines, the same VFS interface is used for lseek and -+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of -+ * lseek/telldir/seekdir will blow out spectacularly, and from within -+ * the ext2 low-level routine, we don't know if we're being called by -+ * a 64-bit version of the system call or the 32-bit version of the -+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir -+ * cookie. Sigh. -+ */ -+#define hash2pos(major, minor) (major >> 1) -+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -+#define pos2min_hash(pos) (0) -+ -+/* -+ * This structure holds the nodes of the red-black tree used to store -+ * the directory entry in hash order. 
-+ */ -+struct fname { -+ __u32 hash; -+ __u32 minor_hash; -+ struct rb_node rb_hash; -+ struct fname *next; -+ __u32 inode; -+ __u8 name_len; -+ __u8 file_type; -+ char name[0]; -+}; -+ -+/* -+ * This functoin implements a non-recursive way of freeing all of the -+ * nodes in the red-black tree. -+ */ -+static void free_rb_tree_fname(struct rb_root *root) -+{ -+ struct rb_node *n = root->rb_node; -+ struct rb_node *parent; -+ struct fname *fname; -+ -+ while (n) { -+ /* Do the node's children first */ -+ if ((n)->rb_left) { -+ n = n->rb_left; -+ continue; -+ } -+ if (n->rb_right) { -+ n = n->rb_right; -+ continue; -+ } -+ /* -+ * The node has no children; free it, and then zero -+ * out parent's link to it. Finally go to the -+ * beginning of the loop and try to free the parent -+ * node. -+ */ -+ parent = rb_parent(n); -+ fname = rb_entry(n, struct fname, rb_hash); -+ while (fname) { -+ struct fname * old = fname; -+ fname = fname->next; -+ kfree (old); -+ } -+ if (!parent) -+ root->rb_node = NULL; -+ else if (parent->rb_left == n) -+ parent->rb_left = NULL; -+ else if (parent->rb_right == n) -+ parent->rb_right = NULL; -+ n = parent; -+ } -+ root->rb_node = NULL; -+} -+ -+ -+static struct dir_private_info *create_dir_info(loff_t pos) -+{ -+ struct dir_private_info *p; -+ -+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); -+ if (!p) -+ return NULL; -+ p->root.rb_node = NULL; -+ p->curr_node = NULL; -+ p->extra_fname = NULL; -+ p->last_pos = 0; -+ p->curr_hash = pos2maj_hash(pos); -+ p->curr_minor_hash = pos2min_hash(pos); -+ p->next_hash = 0; -+ return p; -+} -+ -+void ext3cow_htree_free_dir_info(struct dir_private_info *p) -+{ -+ free_rb_tree_fname(&p->root); -+ kfree(p); -+} -+ -+/* -+ * Given a directory entry, enter it into the fname rb tree. -+ */ -+int ext3cow_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3cow_dir_entry_2 *dirent) -+{ -+ struct rb_node **p, *parent = NULL; -+ struct fname * fname, *new_fn; -+ struct dir_private_info *info; -+ int len; -+ -+ info = (struct dir_private_info *) dir_file->private_data; -+ p = &info->root.rb_node; -+ -+ /* Create and allocate the fname structure */ -+ len = sizeof(struct fname) + dirent->name_len + 1; -+ new_fn = kzalloc(len, GFP_KERNEL); -+ if (!new_fn) -+ return -ENOMEM; -+ new_fn->hash = hash; -+ new_fn->minor_hash = minor_hash; -+ new_fn->inode = le32_to_cpu(dirent->inode); -+ new_fn->name_len = dirent->name_len; -+ new_fn->file_type = dirent->file_type; -+ memcpy(new_fn->name, dirent->name, dirent->name_len); -+ new_fn->name[dirent->name_len] = 0; -+ -+ while (*p) { -+ parent = *p; -+ fname = rb_entry(parent, struct fname, rb_hash); -+ -+ /* -+ * If the hash and minor hash match up, then we put -+ * them on a linked list. This rarely happens... -+ */ -+ if ((new_fn->hash == fname->hash) && -+ (new_fn->minor_hash == fname->minor_hash)) { -+ new_fn->next = fname->next; -+ fname->next = new_fn; -+ return 0; -+ } -+ -+ if (new_fn->hash < fname->hash) -+ p = &(*p)->rb_left; -+ else if (new_fn->hash > fname->hash) -+ p = &(*p)->rb_right; -+ else if (new_fn->minor_hash < fname->minor_hash) -+ p = &(*p)->rb_left; -+ else /* if (new_fn->minor_hash > fname->minor_hash) */ -+ p = &(*p)->rb_right; -+ } -+ -+ rb_link_node(&new_fn->rb_hash, parent, p); -+ rb_insert_color(&new_fn->rb_hash, &info->root); -+ return 0; -+} -+ -+ -+ -+/* -+ * This is a helper function for ext3cow_dx_readdir. It calls filldir -+ * for all entres on the fname linked list. 
(Normally there is only -+ * one entry on the linked list, unless there are 62 bit hash collisions.) -+ */ -+static int call_filldir(struct file * filp, void * dirent, -+ filldir_t filldir, struct fname *fname) -+{ -+ struct dir_private_info *info = filp->private_data; -+ loff_t curr_pos; -+ struct inode *inode = filp->f_path.dentry->d_inode; -+ struct super_block * sb; -+ int error; -+ -+ sb = inode->i_sb; -+ -+ printk(KERN_INFO, "Got %s\n", filp->f_path.dentry->d_name.name); -+ -+ if (!fname) { -+ printk("call_filldir: called with null fname?!?\n"); -+ return 0; -+ } -+ curr_pos = hash2pos(fname->hash, fname->minor_hash); -+ while (fname) { -+ error = filldir(dirent, fname->name, -+ fname->name_len, curr_pos, -+ fname->inode, -+ get_dtype(sb, fname->file_type)); -+ if (error) { -+ filp->f_pos = curr_pos; -+ info->extra_fname = fname->next; -+ return error; -+ } -+ fname = fname->next; -+ } -+ return 0; -+} -+ -+static int ext3cow_dx_readdir(struct file * filp, -+ void * dirent, filldir_t filldir) -+{ -+ struct dir_private_info *info = filp->private_data; -+ struct inode *inode = filp->f_path.dentry->d_inode; -+ struct fname *fname; -+ int ret; -+ -+ if (!info) { -+ info = create_dir_info(filp->f_pos); -+ if (!info) -+ return -ENOMEM; -+ filp->private_data = info; -+ } -+ -+ if (filp->f_pos == EXT3COW_HTREE_EOF) -+ return 0; /* EOF */ -+ -+ /* Some one has messed with f_pos; reset the world */ -+ if (info->last_pos != filp->f_pos) { -+ free_rb_tree_fname(&info->root); -+ info->curr_node = NULL; -+ info->extra_fname = NULL; -+ info->curr_hash = pos2maj_hash(filp->f_pos); -+ info->curr_minor_hash = pos2min_hash(filp->f_pos); -+ } -+ -+ /* -+ * If there are any leftover names on the hash collision -+ * chain, return them first. -+ */ -+ if (info->extra_fname && -+ call_filldir(filp, dirent, filldir, info->extra_fname)) -+ goto finished; -+ -+ if (!info->curr_node) -+ info->curr_node = rb_first(&info->root); -+ -+ while (1) { -+ /* -+ * Fill the rbtree if we have no more entries, -+ * or the inode has changed since we last read in the -+ * cached entries. 
-+ */ -+ if ((!info->curr_node) || -+ (filp->f_version != inode->i_version)) { -+ info->curr_node = NULL; -+ free_rb_tree_fname(&info->root); -+ filp->f_version = inode->i_version; -+ ret = ext3cow_htree_fill_tree(filp, info->curr_hash, -+ info->curr_minor_hash, -+ &info->next_hash); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) { -+ filp->f_pos = EXT3COW_HTREE_EOF; -+ break; -+ } -+ info->curr_node = rb_first(&info->root); -+ } -+ -+ fname = rb_entry(info->curr_node, struct fname, rb_hash); -+ info->curr_hash = fname->hash; -+ info->curr_minor_hash = fname->minor_hash; -+ if (call_filldir(filp, dirent, filldir, fname)) -+ break; -+ -+ info->curr_node = rb_next(info->curr_node); -+ if (!info->curr_node) { -+ if (info->next_hash == ~0) { -+ filp->f_pos = EXT3COW_HTREE_EOF; -+ break; -+ } -+ info->curr_hash = info->next_hash; -+ info->curr_minor_hash = 0; -+ } -+ } -+finished: -+ info->last_pos = filp->f_pos; -+ return 0; -+} -+ -+static int ext3cow_release_dir (struct inode * inode, struct file * filp) -+{ -+ if (filp->private_data) -+ ext3cow_htree_free_dir_info(filp->private_data); -+ -+ return 0; -+} -+ -+#endif -diff -ruN linux-2.6.20.3/fs/ext3cow/ext3cow_jbd.c linux-2.6.20.3-ext3cow/fs/ext3cow/ext3cow_jbd.c ---- linux-2.6.20.3/fs/ext3cow/ext3cow_jbd.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/ext3cow_jbd.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,59 @@ -+/* -+ * Interface between ext3cow and JBD -+ */ -+ -+#include -+ -+int __ext3cow_journal_get_undo_access(const char *where, handle_t *handle, -+ struct buffer_head *bh) -+{ -+ int err = journal_get_undo_access(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -+ -+int __ext3cow_journal_get_write_access(const char *where, handle_t *handle, -+ struct buffer_head *bh) -+{ -+ int err = journal_get_write_access(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -+ -+int __ext3cow_journal_forget(const char *where, handle_t *handle, -+ struct buffer_head *bh) -+{ -+ int err = journal_forget(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -+ -+int __ext3cow_journal_revoke(const char *where, handle_t *handle, -+ unsigned long blocknr, struct buffer_head *bh) -+{ -+ int err = journal_revoke(handle, blocknr, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -+ -+int __ext3cow_journal_get_create_access(const char *where, -+ handle_t *handle, struct buffer_head *bh) -+{ -+ int err = journal_get_create_access(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -+ -+int __ext3cow_journal_dirty_metadata(const char *where, -+ handle_t *handle, struct buffer_head *bh) -+{ -+ int err = journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(where, __FUNCTION__, bh, handle,err); -+ return err; -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/file.c linux-2.6.20.3-ext3cow/fs/ext3cow/file.c ---- linux-2.6.20.3/fs/ext3cow/file.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/file.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,147 @@ -+/* -+ * linux/fs/ext3cow/file.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * 
linux/fs/minix/file.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3cow fs regular file handling primitives -+ * -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+#include "acl.h" -+ -+/* -+ * Called when an inode is released. Note that this is different -+ * from ext3cow_file_open: open gets called at every open, but release -+ * gets called only when /all/ the files are closed. -+ */ -+static int ext3cow_release_file (struct inode * inode, struct file * filp) -+{ -+ /* if we are the last writer on the inode, drop the block reservation */ -+ if ((filp->f_mode & FMODE_WRITE) && -+ (atomic_read(&inode->i_writecount) == 1)) -+ { -+ mutex_lock(&EXT3COW_I(inode)->truncate_mutex); -+ ext3cow_discard_reservation(inode); -+ mutex_unlock(&EXT3COW_I(inode)->truncate_mutex); -+ } -+ if (is_dx(inode) && filp->private_data) -+ ext3cow_htree_free_dir_info(filp->private_data); -+ -+ return 0; -+} -+ -+static ssize_t -+ext3cow_file_write(struct kiocb *iocb, const struct iovec *iov, -+ unsigned long nr_segs, loff_t pos) -+{ -+ struct file *file = iocb->ki_filp; -+ struct inode *inode = file->f_path.dentry->d_inode; -+ struct inode *dir = file->f_path.dentry->d_parent->d_inode; -+ ssize_t ret = 0; -+ int err = 0; -+ -+ /* This is the place where we create a new version on write -znjp */ -+ if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ -+ err = ext3cow_dup_inode(dir, inode); -+ if(err) -+ return err; -+ } -+ -+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos); -+ -+ /* -+ * Skip flushing if there was an error, or if nothing was written. -+ */ -+ if (ret <= 0) -+ return ret; -+ -+ /* -+ * If the inode is IS_SYNC, or is O_SYNC and we are doing data -+ * journalling then we need to make sure that we force the transaction -+ * to disk to keep all metadata uptodate synchronously. -+ */ -+ if (file->f_flags & O_SYNC) { -+ /* -+ * If we are non-data-journaled, then the dirty data has -+ * already been flushed to backing store by generic_osync_inode, -+ * and the inode has been flushed too if there have been any -+ * modifications other than mere timestamp updates. -+ * -+ * Open question --- do we care about flushing timestamps too -+ * if the inode is IS_SYNC? -+ */ -+ if (!ext3cow_should_journal_data(inode)) -+ return ret; -+ -+ goto force_commit; -+ } -+ -+ /* -+ * So we know that there has been no forced data flush. If the inode -+ * is marked IS_SYNC, we need to force one ourselves. -+ */ -+ if (!IS_SYNC(inode)) -+ return ret; -+ -+ /* -+ * Open question #2 --- should we force data to disk here too? If we -+ * don't, the only impact is that data=writeback filesystems won't -+ * flush data to disk automatically on IS_SYNC, only metadata (but -+ * historically, that is what ext2 has done.) 
-+ */ -+ -+force_commit: -+ err = ext3cow_force_commit(inode->i_sb); -+ if (err) -+ return err; -+ return ret; -+} -+ -+const struct file_operations ext3cow_file_operations = { -+ .llseek = generic_file_llseek, -+ .read = do_sync_read, -+ .write = do_sync_write, -+ .aio_read = generic_file_aio_read, -+ .aio_write = ext3cow_file_write, -+ .ioctl = ext3cow_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = ext3cow_compat_ioctl, -+#endif -+ .mmap = generic_file_mmap, -+ .open = generic_file_open, -+ .release = ext3cow_release_file, -+ .fsync = ext3cow_sync_file, -+ .sendfile = generic_file_sendfile, -+ .splice_read = generic_file_splice_read, -+ .splice_write = generic_file_splice_write, -+}; -+ -+struct inode_operations ext3cow_file_inode_operations = { -+ .truncate = ext3cow_truncate, -+ .setattr = ext3cow_setattr, -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ .setxattr = generic_setxattr, -+ .getxattr = generic_getxattr, -+ .listxattr = ext3cow_listxattr, -+ .removexattr = generic_removexattr, -+#endif -+ .permission = ext3cow_permission, -+}; -+ -diff -ruN linux-2.6.20.3/fs/ext3cow/fsync.c linux-2.6.20.3-ext3cow/fs/ext3cow/fsync.c ---- linux-2.6.20.3/fs/ext3cow/fsync.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/fsync.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,88 @@ -+/* -+ * linux/fs/ext3cow/fsync.c -+ * -+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) -+ * from -+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * from -+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3cowfs fsync primitive -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * -+ * Removed unnecessary code duplication for little endian machines -+ * and excessive __inline__s. -+ * Andi Kleen, 1997 -+ * -+ * Major simplications and cleanup - we only need to do the metadata, because -+ * we can depend on generic_block_fdatasync() to sync the data blocks. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * akpm: A new design for ext3cow_sync_file(). -+ * -+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). -+ * There cannot be a transaction open by this task. -+ * Another task could have dirtied this inode. Its data can be in any -+ * state in the journalling system. -+ * -+ * What we do is just kick off a commit and wait on it. This will snapshot the -+ * inode to disk. -+ */ -+ -+int ext3cow_sync_file(struct file * file, struct dentry *dentry, int datasync) -+{ -+ struct inode *inode = dentry->d_inode; -+ int ret = 0; -+ -+ J_ASSERT(ext3cow_journal_current_handle() == 0); -+ -+ /* -+ * data=writeback: -+ * The caller's filemap_fdatawrite()/wait will sync the data. -+ * sync_inode() will sync the metadata -+ * -+ * data=ordered: -+ * The caller's filemap_fdatawrite() will write the data and -+ * sync_inode() will write the inode if it is dirty. Then the caller's -+ * filemap_fdatawait() will wait on the pages. -+ * -+ * data=journal: -+ * filemap_fdatawrite won't do anything (the buffers are clean). -+ * ext3cow_force_commit will write the file data into the journal and -+ * will wait on that. -+ * filemap_fdatawait() will encounter a ton of newly-dirtied pages -+ * (they were dirtied by commit). But that's OK - the blocks are -+ * safe in-journal, which is all fsync() needs to ensure. 
-+ */ -+ if (ext3cow_should_journal_data(inode)) { -+ ret = ext3cow_force_commit(inode->i_sb); -+ goto out; -+ } -+ -+ /* -+ * The VFS has written the file data. If the inode is unaltered -+ * then we need not start a commit. -+ */ -+ if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { -+ struct writeback_control wbc = { -+ .sync_mode = WB_SYNC_ALL, -+ .nr_to_write = 0, /* sys_fsync did this */ -+ }; -+ ret = sync_inode(inode, &wbc); -+ } -+out: -+ return ret; -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/hash.c linux-2.6.20.3-ext3cow/fs/ext3cow/hash.c ---- linux-2.6.20.3/fs/ext3cow/hash.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/hash.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,152 @@ -+/* -+ * linux/fs/ext3cow/hash.c -+ * -+ * Copyright (C) 2002 by Theodore Ts'o -+ * -+ * This file is released under the GPL v2. -+ * -+ * This file may be redistributed under the terms of the GNU Public -+ * License. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#define DELTA 0x9E3779B9 -+ -+static void TEA_transform(__u32 buf[4], __u32 const in[]) -+{ -+ __u32 sum = 0; -+ __u32 b0 = buf[0], b1 = buf[1]; -+ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; -+ int n = 16; -+ -+ do { -+ sum += DELTA; -+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); -+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); -+ } while(--n); -+ -+ buf[0] += b0; -+ buf[1] += b1; -+} -+ -+ -+/* The old legacy hash */ -+static __u32 dx_hack_hash (const char *name, int len) -+{ -+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; -+ while (len--) { -+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); -+ -+ if (hash & 0x80000000) hash -= 0x7fffffff; -+ hash1 = hash0; -+ hash0 = hash; -+ } -+ return (hash0 << 1); -+} -+ -+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) -+{ -+ __u32 pad, val; -+ int i; -+ -+ pad = (__u32)len | ((__u32)len << 8); -+ pad |= pad << 16; -+ -+ val = pad; -+ if (len > num*4) -+ len = num * 4; -+ for (i=0; i < len; i++) { -+ if ((i % 4) == 0) -+ val = pad; -+ val = msg[i] + (val << 8); -+ if ((i % 4) == 3) { -+ *buf++ = val; -+ val = pad; -+ num--; -+ } -+ } -+ if (--num >= 0) -+ *buf++ = val; -+ while (--num >= 0) -+ *buf++ = pad; -+} -+ -+/* -+ * Returns the hash of a filename. If len is 0 and name is NULL, then -+ * this function can be used to test whether or not a hash version is -+ * supported. -+ * -+ * The seed is an 4 longword (32 bits) "secret" which can be used to -+ * uniquify a hash. If the seed is all zero's, then some default seed -+ * may be used. -+ * -+ * A particular hash version specifies whether or not the seed is -+ * represented, and whether or not the returned hash is 32 bits or 64 -+ * bits. 32 bit hashes will return 0 for the minor hash. 
-+ */ -+int ext3cowfs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -+{ -+ __u32 hash; -+ __u32 minor_hash = 0; -+ const char *p; -+ int i; -+ __u32 in[8], buf[4]; -+ -+ /* Initialize the default seed for the hash checksum functions */ -+ buf[0] = 0x67452301; -+ buf[1] = 0xefcdab89; -+ buf[2] = 0x98badcfe; -+ buf[3] = 0x10325476; -+ -+ /* Check to see if the seed is all zero's */ -+ if (hinfo->seed) { -+ for (i=0; i < 4; i++) { -+ if (hinfo->seed[i]) -+ break; -+ } -+ if (i < 4) -+ memcpy(buf, hinfo->seed, sizeof(buf)); -+ } -+ -+ switch (hinfo->hash_version) { -+ case DX_HASH_LEGACY: -+ hash = dx_hack_hash(name, len); -+ break; -+ case DX_HASH_HALF_MD4: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 8); -+ half_md4_transform(buf, in); -+ len -= 32; -+ p += 32; -+ } -+ minor_hash = buf[2]; -+ hash = buf[1]; -+ break; -+ case DX_HASH_TEA: -+ p = name; -+ while (len > 0) { -+ str2hashbuf(p, len, in, 4); -+ TEA_transform(buf, in); -+ len -= 16; -+ p += 16; -+ } -+ hash = buf[0]; -+ minor_hash = buf[1]; -+ break; -+ default: -+ hinfo->hash = 0; -+ return -1; -+ } -+ hash = hash & ~1; -+ if (hash == (EXT3COW_HTREE_EOF << 1)) -+ hash = (EXT3COW_HTREE_EOF-1) << 1; -+ hinfo->hash = hash; -+ hinfo->minor_hash = minor_hash; -+ return 0; -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/ialloc.c linux-2.6.20.3-ext3cow/fs/ext3cow/ialloc.c ---- linux-2.6.20.3/fs/ext3cow/ialloc.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/ialloc.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,764 @@ -+/* -+ * linux/fs/ext3cow/ialloc.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * BSD ufs-inspired inode and directory allocation by -+ * Stephen Tweedie (sct@redhat.com), 1993 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "xattr.h" -+#include "acl.h" -+ -+/* -+ * ialloc.c contains the inodes allocation and deallocation routines -+ */ -+ -+/* -+ * The free inodes are managed by bitmaps. A file system contains several -+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap -+ * block for inodes, N blocks for the inode table and data blocks. -+ * -+ * The file system contains group descriptors which are located after the -+ * super block. Each descriptor contains the number of the bitmap block and -+ * the free blocks count in the block. -+ */ -+ -+ -+/* -+ * Read the inode allocation bitmap for a given block_group, reading -+ * into the specified slot in the superblock's bitmap cache. -+ * -+ * Return buffer_head of bitmap on success or NULL. -+ */ -+static struct buffer_head * -+read_inode_bitmap(struct super_block * sb, unsigned long block_group) -+{ -+ struct ext3cow_group_desc *desc; -+ struct buffer_head *bh = NULL; -+ -+ desc = ext3cow_get_group_desc(sb, block_group, NULL); -+ if (!desc) -+ goto error_out; -+ -+ bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap)); -+ if (!bh) -+ ext3cow_error(sb, "read_inode_bitmap", -+ "Cannot read inode bitmap - " -+ "block_group = %lu, inode_bitmap = %u", -+ block_group, le32_to_cpu(desc->bg_inode_bitmap)); -+error_out: -+ return bh; -+} -+ -+/* -+ * NOTE! 
When we get the inode, we're the only people -+ * that have access to it, and as such there are no -+ * race conditions we have to worry about. The inode -+ * is not on the hash-lists, and it cannot be reached -+ * through the filesystem because the directory entry -+ * has been deleted earlier. -+ * -+ * HOWEVER: we must make sure that we get no aliases, -+ * which means that we have to call "clear_inode()" -+ * _before_ we mark the inode not in use in the inode -+ * bitmaps. Otherwise a newly created file might use -+ * the same inode number (not actually the same pointer -+ * though), and then we'd have two inodes sharing the -+ * same inode number and space on the harddisk. -+ */ -+void ext3cow_free_inode (handle_t *handle, struct inode * inode) -+{ -+ struct super_block * sb = inode->i_sb; -+ int is_directory; -+ unsigned long ino; -+ struct buffer_head *bitmap_bh = NULL; -+ struct buffer_head *bh2; -+ unsigned long block_group; -+ unsigned long bit; -+ struct ext3cow_group_desc * gdp; -+ struct ext3cow_super_block * es; -+ struct ext3cow_sb_info *sbi; -+ int fatal = 0, err; -+ -+ if (atomic_read(&inode->i_count) > 1) { -+ printk ("ext3cow_free_inode: inode has count=%d\n", -+ atomic_read(&inode->i_count)); -+ return; -+ } -+ if (inode->i_nlink) { -+ printk ("ext3cow_free_inode: inode has nlink=%d\n", -+ inode->i_nlink); -+ return; -+ } -+ if (!sb) { -+ printk("ext3cow_free_inode: inode on nonexistent device\n"); -+ return; -+ } -+ sbi = EXT3COW_SB(sb); -+ -+ ino = inode->i_ino; -+ ext3cow_debug ("freeing inode %lu\n", ino); -+ -+ /* -+ * Note: we must free any quota before locking the superblock, -+ * as writing the quota to disk may need the lock as well. -+ */ -+ DQUOT_INIT(inode); -+ ext3cow_xattr_delete_inode(handle, inode); -+ DQUOT_FREE_INODE(inode); -+ DQUOT_DROP(inode); -+ -+ is_directory = S_ISDIR(inode->i_mode); -+ -+ /* Do this BEFORE marking the inode not in use or returning an error */ -+ clear_inode (inode); -+ -+ es = EXT3COW_SB(sb)->s_es; -+ if (ino < EXT3COW_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { -+ ext3cow_error (sb, "ext3cow_free_inode", -+ "reserved or nonexistent inode %lu", ino); -+ goto error_return; -+ } -+ block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3COW_INODES_PER_GROUP(sb); -+ bitmap_bh = read_inode_bitmap(sb, block_group); -+ if (!bitmap_bh) -+ goto error_return; -+ -+ BUFFER_TRACE(bitmap_bh, "get_write_access"); -+ fatal = ext3cow_journal_get_write_access(handle, bitmap_bh); -+ if (fatal) -+ goto error_return; -+ -+ /* Ok, now we can actually update the inode bitmaps.. 
*/ -+ if (!ext3cow_clear_bit_atomic(sb_bgl_lock(sbi, block_group), -+ bit, bitmap_bh->b_data)) -+ ext3cow_error (sb, "ext3cow_free_inode", -+ "bit already cleared for inode %lu", ino); -+ else { -+ gdp = ext3cow_get_group_desc (sb, block_group, &bh2); -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ fatal = ext3cow_journal_get_write_access(handle, bh2); -+ if (fatal) goto error_return; -+ -+ if (gdp) { -+ spin_lock(sb_bgl_lock(sbi, block_group)); -+ gdp->bg_free_inodes_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_free_inodes_count) + 1); -+ if (is_directory) -+ gdp->bg_used_dirs_count = cpu_to_le16( -+ le16_to_cpu(gdp->bg_used_dirs_count) - 1); -+ spin_unlock(sb_bgl_lock(sbi, block_group)); -+ percpu_counter_inc(&sbi->s_freeinodes_counter); -+ if (is_directory) -+ percpu_counter_dec(&sbi->s_dirs_counter); -+ -+ } -+ BUFFER_TRACE(bh2, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bh2); -+ if (!fatal) fatal = err; -+ } -+ BUFFER_TRACE(bitmap_bh, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bitmap_bh); -+ if (!fatal) -+ fatal = err; -+ sb->s_dirt = 1; -+error_return: -+ brelse(bitmap_bh); -+ ext3cow_std_error(sb, fatal); -+} -+ -+/* -+ * There are two policies for allocating an inode. If the new inode is -+ * a directory, then a forward search is made for a block group with both -+ * free space and a low directory-to-inode ratio; if that fails, then of -+ * the groups with above-average free space, that group with the fewest -+ * directories already is chosen. -+ * -+ * For other inodes, search forward from the parent directory\'s block -+ * group to find a free inode. -+ */ -+static int find_group_dir(struct super_block *sb, struct inode *parent) -+{ -+ int ngroups = EXT3COW_SB(sb)->s_groups_count; -+ unsigned int freei, avefreei; -+ struct ext3cow_group_desc *desc, *best_desc = NULL; -+ struct buffer_head *bh; -+ int group, best_group = -1; -+ -+ freei = percpu_counter_read_positive(&EXT3COW_SB(sb)->s_freeinodes_counter); -+ avefreei = freei / ngroups; -+ -+ for (group = 0; group < ngroups; group++) { -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) -+ continue; -+ if (!best_desc || -+ (le16_to_cpu(desc->bg_free_blocks_count) > -+ le16_to_cpu(best_desc->bg_free_blocks_count))) { -+ best_group = group; -+ best_desc = desc; -+ } -+ } -+ return best_group; -+} -+ -+/* -+ * Orlov's allocator for directories. -+ * -+ * We always try to spread first-level directories. -+ * -+ * If there are blockgroups with both free inodes and free blocks counts -+ * not worse than average we return one with smallest directory count. -+ * Otherwise we simply return a random group. -+ * -+ * For the rest rules look so: -+ * -+ * It's OK to put directory into a group unless -+ * it has too many directories already (max_dirs) or -+ * it has too few free inodes left (min_inodes) or -+ * it has too few free blocks left (min_blocks) or -+ * it's already running too large debt (max_debt). -+ * Parent's group is prefered, if it doesn't satisfy these -+ * conditions we search cyclically through the rest. If none -+ * of the groups look good we just look for a group with more -+ * free inodes than average (starting at parent's group). -+ * -+ * Debt is incremented each time we allocate a directory and decremented -+ * when we allocate an inode, within 0--255. 
-+ */ -+ -+#define INODE_COST 64 -+#define BLOCK_COST 256 -+ -+static int find_group_orlov(struct super_block *sb, struct inode *parent) -+{ -+ int parent_group = EXT3COW_I(parent)->i_block_group; -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ struct ext3cow_super_block *es = sbi->s_es; -+ int ngroups = sbi->s_groups_count; -+ int inodes_per_group = EXT3COW_INODES_PER_GROUP(sb); -+ unsigned int freei, avefreei; -+ ext3cow_fsblk_t freeb, avefreeb; -+ ext3cow_fsblk_t blocks_per_dir; -+ unsigned int ndirs; -+ int max_debt, max_dirs, min_inodes; -+ ext3cow_grpblk_t min_blocks; -+ int group = -1, i; -+ struct ext3cow_group_desc *desc; -+ struct buffer_head *bh; -+ -+ freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); -+ avefreei = freei / ngroups; -+ freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); -+ avefreeb = freeb / ngroups; -+ ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); -+ -+ if ((parent == sb->s_root->d_inode) || -+ (EXT3COW_I(parent)->i_flags & EXT3COW_TOPDIR_FL)) { -+ int best_ndir = inodes_per_group; -+ int best_group = -1; -+ -+ get_random_bytes(&group, sizeof(group)); -+ parent_group = (unsigned)group % ngroups; -+ for (i = 0; i < ngroups; i++) { -+ group = (parent_group + i) % ngroups; -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) -+ continue; -+ if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) -+ continue; -+ if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) -+ continue; -+ best_group = group; -+ best_ndir = le16_to_cpu(desc->bg_used_dirs_count); -+ } -+ if (best_group >= 0) -+ return best_group; -+ goto fallback; -+ } -+ -+ blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; -+ -+ max_dirs = ndirs / ngroups + inodes_per_group / 16; -+ min_inodes = avefreei - inodes_per_group / 4; -+ min_blocks = avefreeb - EXT3COW_BLOCKS_PER_GROUP(sb) / 4; -+ -+ max_debt = EXT3COW_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3cow_fsblk_t)BLOCK_COST); -+ if (max_debt * INODE_COST > inodes_per_group) -+ max_debt = inodes_per_group / INODE_COST; -+ if (max_debt > 255) -+ max_debt = 255; -+ if (max_debt == 0) -+ max_debt = 1; -+ -+ for (i = 0; i < ngroups; i++) { -+ group = (parent_group + i) % ngroups; -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) -+ continue; -+ if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) -+ continue; -+ if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) -+ continue; -+ return group; -+ } -+ -+fallback: -+ for (i = 0; i < ngroups; i++) { -+ group = (parent_group + i) % ngroups; -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (!desc || !desc->bg_free_inodes_count) -+ continue; -+ if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) -+ return group; -+ } -+ -+ if (avefreei) { -+ /* -+ * The free-inodes counter is approximate, and for really small -+ * filesystems the above test can fail to find any blockgroups -+ */ -+ avefreei = 0; -+ goto fallback; -+ } -+ -+ return -1; -+} -+ -+static int find_group_other(struct super_block *sb, struct inode *parent) -+{ -+ int parent_group = EXT3COW_I(parent)->i_block_group; -+ int ngroups = EXT3COW_SB(sb)->s_groups_count; -+ struct ext3cow_group_desc *desc; -+ struct buffer_head *bh; -+ int group, i; -+ -+ /* -+ * Try to place the inode in its parent directory -+ */ -+ group = parent_group; 
-+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (desc && le16_to_cpu(desc->bg_free_inodes_count) && -+ le16_to_cpu(desc->bg_free_blocks_count)) -+ return group; -+ -+ /* -+ * We're going to place this inode in a different blockgroup from its -+ * parent. We want to cause files in a common directory to all land in -+ * the same blockgroup. But we want files which are in a different -+ * directory which shares a blockgroup with our parent to land in a -+ * different blockgroup. -+ * -+ * So add our directory's i_ino into the starting point for the hash. -+ */ -+ group = (group + parent->i_ino) % ngroups; -+ -+ /* -+ * Use a quadratic hash to find a group with a free inode and some free -+ * blocks. -+ */ -+ for (i = 1; i < ngroups; i <<= 1) { -+ group += i; -+ if (group >= ngroups) -+ group -= ngroups; -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (desc && le16_to_cpu(desc->bg_free_inodes_count) && -+ le16_to_cpu(desc->bg_free_blocks_count)) -+ return group; -+ } -+ -+ /* -+ * That failed: try linear search for a free inode, even if that group -+ * has no free blocks. -+ */ -+ group = parent_group; -+ for (i = 0; i < ngroups; i++) { -+ if (++group >= ngroups) -+ group = 0; -+ desc = ext3cow_get_group_desc (sb, group, &bh); -+ if (desc && le16_to_cpu(desc->bg_free_inodes_count)) -+ return group; -+ } -+ -+ return -1; -+} -+ -+/* -+ * There are two policies for allocating an inode. If the new inode is -+ * a directory, then a forward search is made for a block group with both -+ * free space and a low directory-to-inode ratio; if that fails, then of -+ * the groups with above-average free space, that group with the fewest -+ * directories already is chosen. -+ * -+ * For other inodes, search forward from the parent directory's block -+ * group to find a free inode. 
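The "quadratic hash" above probes block groups at exponentially growing strides from the hashed starting group. A standalone sketch of the probe order follows; the group count and starting group are made-up inputs, not values from the patch:

#include <stdio.h>

int main(void)
{
    int ngroups = 64;    /* illustrative group count */
    int group   = 17;    /* starting group: (parent_group + dir->i_ino) % ngroups */
    int i;

    /* Same stride pattern as the allocator: offsets 1, 2, 4, 8, ... from the start. */
    for (i = 1; i < ngroups; i <<= 1) {
        group += i;
        if (group >= ngroups)
            group -= ngroups;
        printf("probe group %d\n", group);   /* 18, 20, 24, 32, 48, 16 for these inputs */
    }
    return 0;
}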
-+ */ -+struct inode *ext3cow_new_inode(handle_t *handle, struct inode * dir, int mode) -+{ -+ struct super_block *sb; -+ struct buffer_head *bitmap_bh = NULL; -+ struct buffer_head *bh2; -+ int group; -+ unsigned long ino = 0; -+ struct inode * inode; -+ struct ext3cow_group_desc * gdp = NULL; -+ struct ext3cow_super_block * es; -+ struct ext3cow_inode_info *ei; -+ struct ext3cow_sb_info *sbi; -+ int err = 0; -+ struct inode *ret; -+ int i; -+ -+ /* Cannot create files in a deleted directory */ -+ if (!dir || !dir->i_nlink) -+ return ERR_PTR(-EPERM); -+ -+ sb = dir->i_sb; -+ inode = new_inode(sb); -+ if (!inode) -+ return ERR_PTR(-ENOMEM); -+ ei = EXT3COW_I(inode); -+ -+ sbi = EXT3COW_SB(sb); -+ es = sbi->s_es; -+ if (S_ISDIR(mode)) { -+ if (test_opt (sb, OLDALLOC)) -+ group = find_group_dir(sb, dir); -+ else -+ group = find_group_orlov(sb, dir); -+ } else -+ group = find_group_other(sb, dir); -+ -+ err = -ENOSPC; -+ if (group == -1) -+ goto out; -+ -+ for (i = 0; i < sbi->s_groups_count; i++) { -+ err = -EIO; -+ -+ gdp = ext3cow_get_group_desc(sb, group, &bh2); -+ if (!gdp) -+ goto fail; -+ -+ brelse(bitmap_bh); -+ bitmap_bh = read_inode_bitmap(sb, group); -+ if (!bitmap_bh) -+ goto fail; -+ -+ ino = 0; -+ -+repeat_in_this_group: -+ ino = ext3cow_find_next_zero_bit((unsigned long *) -+ bitmap_bh->b_data, EXT3COW_INODES_PER_GROUP(sb), ino); -+ if (ino < EXT3COW_INODES_PER_GROUP(sb)) { -+ -+ BUFFER_TRACE(bitmap_bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, bitmap_bh); -+ if (err) -+ goto fail; -+ -+ if (!ext3cow_set_bit_atomic(sb_bgl_lock(sbi, group), -+ ino, bitmap_bh->b_data)) { -+ /* we won it */ -+ BUFFER_TRACE(bitmap_bh, -+ "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, -+ bitmap_bh); -+ if (err) -+ goto fail; -+ goto got; -+ } -+ /* we lost it */ -+ journal_release_buffer(handle, bitmap_bh); -+ -+ if (++ino < EXT3COW_INODES_PER_GROUP(sb)) -+ goto repeat_in_this_group; -+ } -+ -+ /* -+ * This case is possible in concurrent environment. It is very -+ * rare. We cannot repeat the find_group_xxx() call because -+ * that will simply return the same blockgroup, because the -+ * group descriptor metadata has not yet been updated. -+ * So we just go onto the next blockgroup. 
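The allocation loop above is an optimistic claim: pick a free-looking bit, atomically test-and-set it, and if another CPU set it first, advance and retry within the group. A single-threaded sketch of the same control flow; the bitmap and helpers are stand-ins, not the kernel primitives:

#include <stdio.h>
#include <string.h>

#define INODES_PER_GROUP 32

static unsigned char bitmap[INODES_PER_GROUP];   /* 0 = free, 1 = in use */

/* Stand-in for ext3cow_set_bit_atomic(): returns the previous value of the bit. */
static int test_and_set(int bit)
{
    int old = bitmap[bit];
    bitmap[bit] = 1;
    return old;
}

static int find_next_zero(int start)
{
    int i;
    for (i = start; i < INODES_PER_GROUP; i++)
        if (!bitmap[i])
            return i;
    return INODES_PER_GROUP;                     /* group exhausted */
}

int main(void)
{
    int ino = 0;

    memset(bitmap, 0, sizeof(bitmap));
    bitmap[0] = bitmap[1] = 1;                   /* pretend two inodes are already taken */

    while ((ino = find_next_zero(ino)) < INODES_PER_GROUP) {
        if (!test_and_set(ino)) {                /* we won: the bit was still clear */
            printf("claimed inode bit %d\n", ino);
            break;
        }
        ino++;                                   /* we lost: someone else set it, keep looking */
    }
    return 0;
}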
-+ */ -+ if (++group == sbi->s_groups_count) -+ group = 0; -+ } -+ err = -ENOSPC; -+ goto out; -+ -+got: -+ ino += group * EXT3COW_INODES_PER_GROUP(sb) + 1; -+ if (ino < EXT3COW_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { -+ ext3cow_error (sb, "ext3cow_new_inode", -+ "reserved inode or inode > inodes count - " -+ "block_group = %d, inode=%lu", group, ino); -+ err = -EIO; -+ goto fail; -+ } -+ -+ BUFFER_TRACE(bh2, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, bh2); -+ if (err) goto fail; -+ spin_lock(sb_bgl_lock(sbi, group)); -+ gdp->bg_free_inodes_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); -+ if (S_ISDIR(mode)) { -+ gdp->bg_used_dirs_count = -+ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); -+ } -+ spin_unlock(sb_bgl_lock(sbi, group)); -+ BUFFER_TRACE(bh2, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bh2); -+ if (err) goto fail; -+ -+ percpu_counter_dec(&sbi->s_freeinodes_counter); -+ if (S_ISDIR(mode)) -+ percpu_counter_inc(&sbi->s_dirs_counter); -+ sb->s_dirt = 1; -+ -+ inode->i_uid = current->fsuid; -+ if (test_opt (sb, GRPID)) -+ inode->i_gid = dir->i_gid; -+ else if (dir->i_mode & S_ISGID) { -+ inode->i_gid = dir->i_gid; -+ if (S_ISDIR(mode)) -+ mode |= S_ISGID; -+ } else -+ inode->i_gid = current->fsgid; -+ inode->i_mode = mode; -+ -+ inode->i_ino = ino; -+ /* This is the optimal IO size (for stat), not the fs block size */ -+ inode->i_blocks = 0; -+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -+ -+ /* For versioning -znjp */ -+ ei->i_cow_bitmap = 0x0000; -+ ei->i_epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); -+ ei->i_next_inode = 0; -+ -+ memset(ei->i_data, 0, sizeof(ei->i_data)); -+ ei->i_dir_start_lookup = 0; -+ ei->i_disksize = 0; -+ -+ ei->i_flags = EXT3COW_I(dir)->i_flags & ~EXT3COW_INDEX_FL; -+ if (S_ISLNK(mode)) -+ ei->i_flags &= ~(EXT3COW_IMMUTABLE_FL|EXT3COW_APPEND_FL); -+ /* dirsync only applies to directories */ -+ if (!S_ISDIR(mode)) -+ ei->i_flags &= ~EXT3COW_DIRSYNC_FL; -+#ifdef EXT3COW_FRAGMENTS -+ /* Taken out for versioning -znjp */ -+ //ei->i_faddr = 0; -+ //ei->i_frag_no = 0; -+ //ei->i_frag_size = 0; -+#endif -+ ei->i_file_acl = 0; -+ ei->i_dir_acl = 0; -+ ei->i_dtime = 0; -+ ei->i_block_alloc_info = NULL; -+ ei->i_block_group = group; -+ -+ ext3cow_set_inode_flags(inode); -+ if (IS_DIRSYNC(inode)) -+ handle->h_sync = 1; -+ insert_inode_hash(inode); -+ spin_lock(&sbi->s_next_gen_lock); -+ inode->i_generation = sbi->s_next_generation++; -+ spin_unlock(&sbi->s_next_gen_lock); -+ -+ ei->i_state = EXT3COW_STATE_NEW; -+ ei->i_extra_isize = -+ (EXT3COW_INODE_SIZE(inode->i_sb) > EXT3COW_GOOD_OLD_INODE_SIZE) ? 
-+ sizeof(struct ext3cow_inode) - EXT3COW_GOOD_OLD_INODE_SIZE : 0; -+ -+ ret = inode; -+ if(DQUOT_ALLOC_INODE(inode)) { -+ err = -EDQUOT; -+ goto fail_drop; -+ } -+ -+ err = ext3cow_init_acl(handle, inode, dir); -+ if (err) -+ goto fail_free_drop; -+ -+ err = ext3cow_init_security(handle,inode, dir); -+ if (err) -+ goto fail_free_drop; -+ -+ err = ext3cow_mark_inode_dirty(handle, inode); -+ if (err) { -+ ext3cow_std_error(sb, err); -+ goto fail_free_drop; -+ } -+ -+ ext3cow_debug("allocating inode %lu\n", inode->i_ino); -+ goto really_out; -+fail: -+ ext3cow_std_error(sb, err); -+out: -+ iput(inode); -+ ret = ERR_PTR(err); -+really_out: -+ brelse(bitmap_bh); -+ return ret; -+ -+fail_free_drop: -+ DQUOT_FREE_INODE(inode); -+ -+fail_drop: -+ DQUOT_DROP(inode); -+ inode->i_flags |= S_NOQUOTA; -+ inode->i_nlink = 0; -+ iput(inode); -+ brelse(bitmap_bh); -+ return ERR_PTR(err); -+} -+ -+/* Verify that we are loading a valid orphan from disk */ -+struct inode *ext3cow_orphan_get(struct super_block *sb, unsigned long ino) -+{ -+ unsigned long max_ino = le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count); -+ unsigned long block_group; -+ int bit; -+ struct buffer_head *bitmap_bh = NULL; -+ struct inode *inode = NULL; -+ -+ /* Error cases - e2fsck has already cleaned up for us */ -+ if (ino > max_ino) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "bad orphan ino %lu! e2fsck was run?", ino); -+ goto out; -+ } -+ -+ block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); -+ bit = (ino - 1) % EXT3COW_INODES_PER_GROUP(sb); -+ bitmap_bh = read_inode_bitmap(sb, block_group); -+ if (!bitmap_bh) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "inode bitmap error for orphan %lu", ino); -+ goto out; -+ } -+ -+ /* Having the inode bit set should be a 100% indicator that this -+ * is a valid orphan (no e2fsck run on fs). Orphans also include -+ * inodes that were being truncated, so we can't check i_nlink==0. -+ */ -+ if (!ext3cow_test_bit(bit, bitmap_bh->b_data) || -+ !(inode = iget(sb, ino)) || is_bad_inode(inode) || -+ NEXT_ORPHAN(inode) > max_ino) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "bad orphan inode %lu! 
e2fsck was run?", ino); -+ printk(KERN_NOTICE "ext3cow_test_bit(bit=%d, block=%llu) = %d\n", -+ bit, (unsigned long long)bitmap_bh->b_blocknr, -+ ext3cow_test_bit(bit, bitmap_bh->b_data)); -+ printk(KERN_NOTICE "inode=%p\n", inode); -+ if (inode) { -+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", -+ is_bad_inode(inode)); -+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", -+ NEXT_ORPHAN(inode)); -+ printk(KERN_NOTICE "max_ino=%lu\n", max_ino); -+ } -+ /* Avoid freeing blocks if we got a bad deleted inode */ -+ if (inode && inode->i_nlink == 0) -+ inode->i_blocks = 0; -+ iput(inode); -+ inode = NULL; -+ } -+out: -+ brelse(bitmap_bh); -+ return inode; -+} -+ -+unsigned long ext3cow_count_free_inodes (struct super_block * sb) -+{ -+ unsigned long desc_count; -+ struct ext3cow_group_desc *gdp; -+ int i; -+#ifdef EXT3COWFS_DEBUG -+ struct ext3cow_super_block *es; -+ unsigned long bitmap_count, x; -+ struct buffer_head *bitmap_bh = NULL; -+ -+ es = EXT3COW_SB(sb)->s_es; -+ desc_count = 0; -+ bitmap_count = 0; -+ gdp = NULL; -+ for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { -+ gdp = ext3cow_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ brelse(bitmap_bh); -+ bitmap_bh = read_inode_bitmap(sb, i); -+ if (!bitmap_bh) -+ continue; -+ -+ x = ext3cow_count_free(bitmap_bh, EXT3COW_INODES_PER_GROUP(sb) / 8); -+ printk("group %d: stored = %d, counted = %lu\n", -+ i, le16_to_cpu(gdp->bg_free_inodes_count), x); -+ bitmap_count += x; -+ } -+ brelse(bitmap_bh); -+ printk("ext3cow_count_free_inodes: stored = %u, computed = %lu, %lu\n", -+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); -+ return desc_count; -+#else -+ desc_count = 0; -+ for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { -+ gdp = ext3cow_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count); -+ cond_resched(); -+ } -+ return desc_count; -+#endif -+} -+ -+/* Called at mount-time, super-block is locked */ -+unsigned long ext3cow_count_dirs (struct super_block * sb) -+{ -+ unsigned long count = 0; -+ int i; -+ -+ for (i = 0; i < EXT3COW_SB(sb)->s_groups_count; i++) { -+ struct ext3cow_group_desc *gdp = ext3cow_get_group_desc (sb, i, NULL); -+ if (!gdp) -+ continue; -+ count += le16_to_cpu(gdp->bg_used_dirs_count); -+ } -+ return count; -+} -+ -diff -ruN linux-2.6.20.3/fs/ext3cow/inode.c linux-2.6.20.3-ext3cow/fs/ext3cow/inode.c ---- linux-2.6.20.3/fs/ext3cow/inode.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/inode.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,3502 @@ -+/* -+ * linux/fs/ext3cow/inode.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Goal-directed block allocation by Stephen Tweedie -+ * (sct@redhat.com), 1993, 1998 -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. 
Miller (davem@caip.rutgers.edu), 1995 -+ * 64-bit file support on 64-bit platforms by Jakub Jelinek -+ * (jj@sunsite.ms.mff.cuni.cz) -+ * -+ * Assorted race fixes, rewrite of ext3cow_get_block() by Al Viro, 2000 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+#include "acl.h" -+ -+static int ext3cow_writepage_trans_blocks(struct inode *inode); -+ -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static int ext3cow_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = EXT3COW_I(inode)->i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); -+} -+ -+/* -+ * The ext3cow forget function must perform a revoke if we are freeing data -+ * which has been journaled. Metadata (eg. indirect blocks) must be -+ * revoked in all cases. -+ * -+ * "bh" may be NULL: a metadata block may have been freed from memory -+ * but there may still be a record of it in the journal, and that record -+ * still needs to be revoked. -+ */ -+int ext3cow_forget(handle_t *handle, int is_metadata, struct inode *inode, -+ struct buffer_head *bh, ext3cow_fsblk_t blocknr) -+{ -+ int err; -+ -+ might_sleep(); -+ -+ BUFFER_TRACE(bh, "enter"); -+ -+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " -+ "data mode %lx\n", -+ bh, is_metadata, inode->i_mode, -+ test_opt(inode->i_sb, DATA_FLAGS)); -+ -+ /* Never use the revoke function if we are doing full data -+ * journaling: there is no need to, and a V1 superblock won't -+ * support it. Otherwise, only skip the revoke on un-journaled -+ * data blocks. */ -+ -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA || -+ (!is_metadata && !ext3cow_should_journal_data(inode))) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call journal_forget"); -+ return ext3cow_journal_forget(handle, bh); -+ } -+ return 0; -+ } -+ -+ /* -+ * data!=journal && (is_metadata || should_journal_data(inode)) -+ */ -+ BUFFER_TRACE(bh, "call ext3cow_journal_revoke"); -+ err = ext3cow_journal_revoke(handle, blocknr, bh); -+ if (err) -+ ext3cow_abort(inode->i_sb, __FUNCTION__, -+ "error %d when attempting revoke", err); -+ BUFFER_TRACE(bh, "exit"); -+ return err; -+} -+ -+/* -+ * Work out how many blocks we need to proceed with the next chunk of a -+ * truncate transaction. -+ */ -+static unsigned long blocks_for_truncate(struct inode *inode) -+{ -+ unsigned long needed; -+ -+ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); -+ -+ /* Give ourselves just enough room to cope with inodes in which -+ * i_blocks is corrupt: we've seen disk corruptions in the past -+ * which resulted in random data in an inode which looked enough -+ * like a regular file for ext3cow to try to delete it. Things -+ * will go a bit crazy if that happens, but at least we should -+ * try not to panic the whole kernel. */ -+ if (needed < 2) -+ needed = 2; -+ -+ /* But we need to bound the transaction so we don't overflow the -+ * journal. */ -+ if (needed > EXT3COW_MAX_TRANS_DATA) -+ needed = EXT3COW_MAX_TRANS_DATA; -+ -+ return EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -+} -+ -+/* -+ * Truncate transactions can be complex and absolutely huge. So we need to -+ * be able to restart the transaction at a conventient checkpoint to make -+ * sure we don't overflow the journal. 
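As a concrete illustration of the truncate credit estimate above, here is the same arithmetic as a standalone program. The two constants are assumptions standing in for EXT3COW_MAX_TRANS_DATA and EXT3COW_DATA_TRANS_BLOCKS(sb), whose real values come from the ext3cow headers:

#include <stdio.h>

#define MAX_TRANS_DATA    64   /* assumed stand-in for EXT3COW_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS 12   /* assumed stand-in for EXT3COW_DATA_TRANS_BLOCKS(sb) */

int main(void)
{
    unsigned long i_blocks = 2097152;            /* 512-byte sectors: roughly a 1 GiB file */
    unsigned int blocksize_bits = 12;            /* 4 KiB filesystem blocks */
    unsigned long needed = i_blocks >> (blocksize_bits - 9);

    if (needed < 2)                              /* guard against a corrupt i_blocks */
        needed = 2;
    if (needed > MAX_TRANS_DATA)                 /* cap so one handle fits in the journal */
        needed = MAX_TRANS_DATA;

    printf("reserve %lu journal credits per truncate chunk\n",
           DATA_TRANS_BLOCKS + needed);
    return 0;
}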
-+ * -+ * start_transaction gets us a new handle for a truncate transaction, -+ * and extend_transaction tries to extend the existing one a bit. If -+ * extend fails, we need to propagate the failure up and restart the -+ * transaction in the top-level truncate loop. --sct -+ */ -+static handle_t *start_transaction(struct inode *inode) -+{ -+ handle_t *result; -+ -+ result = ext3cow_journal_start(inode, blocks_for_truncate(inode)); -+ if (!IS_ERR(result)) -+ return result; -+ -+ ext3cow_std_error(inode->i_sb, PTR_ERR(result)); -+ return result; -+} -+ -+/* -+ * Try to extend this transaction for the purposes of truncation. -+ * -+ * Returns 0 if we managed to create more room. If we can't create more -+ * room, and the transaction must be restarted we return 1. -+ */ -+static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -+{ -+ if (handle->h_buffer_credits > EXT3COW_RESERVE_TRANS_BLOCKS) -+ return 0; -+ if (!ext3cow_journal_extend(handle, blocks_for_truncate(inode))) -+ return 0; -+ return 1; -+} -+ -+/* -+ * Restart the transaction associated with *handle. This does a commit, -+ * so before we call here everything must be consistently dirtied against -+ * this transaction. -+ */ -+static int ext3cow_journal_test_restart(handle_t *handle, struct inode *inode) -+{ -+ jbd_debug(2, "restarting handle %p\n", handle); -+ return ext3cow_journal_restart(handle, blocks_for_truncate(inode)); -+} -+ -+/* -+ * Called at the last iput() if i_nlink is zero. -+ */ -+void ext3cow_delete_inode (struct inode * inode) -+{ -+ handle_t *handle; -+ -+ truncate_inode_pages(&inode->i_data, 0); -+ -+ if (is_bad_inode(inode)) -+ goto no_delete; -+ -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) { -+ /* -+ * If we're going to skip the normal cleanup, we still need to -+ * make sure that the in-core orphan linked list is properly -+ * cleaned up. -+ */ -+ ext3cow_orphan_del(NULL, inode); -+ goto no_delete; -+ } -+ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ inode->i_size = 0; -+ if (inode->i_blocks) -+ ext3cow_truncate(inode); -+ /* -+ * Kill off the orphan record which ext3cow_truncate created. -+ * AKPM: I think this can be inside the above `if'. -+ * Note that ext3cow_orphan_del() has to be able to cope with the -+ * deletion of a non-existent orphan - this is because we don't -+ * know if ext3cow_truncate() actually created an orphan record. -+ * (Well, we could do this if we need to, but heck - it works) -+ */ -+ ext3cow_orphan_del(handle, inode); -+ EXT3COW_I(inode)->i_dtime = get_seconds(); -+ -+ /* -+ * One subtle ordering requirement: if anything has gone wrong -+ * (transaction abort, IO errors, whatever), then we can still -+ * do these next steps (the fs will already have been marked as -+ * having errors), but we can't free the inode if the mark_dirty -+ * fails. -+ */ -+ if (ext3cow_mark_inode_dirty(handle, inode)) -+ /* If that failed, just do the required in-core inode clear. */ -+ clear_inode(inode); -+ else -+ ext3cow_free_inode(handle, inode); -+ ext3cow_journal_stop(handle); -+ return; -+no_delete: -+ clear_inode(inode); /* We must guarantee clearing of inode... 
*/ -+} -+ -+typedef struct { -+ __le32 *p; -+ __le32 key; -+ struct buffer_head *bh; -+} Indirect; -+ -+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -+{ -+ p->key = *(p->p = v); -+ p->bh = bh; -+} -+ -+static int verify_chain(Indirect *from, Indirect *to) -+{ -+ while (from <= to && from->key == *from->p) -+ from++; -+ return (from > to); -+} -+ -+//TODO: Delete at some point -+/* znjp - used for bitmap testing */ -+ -+ static void printbin(u32 val, int size) { -+ u32 mask; -+ -+ mask=(1UL << (size-1)); -+ while (mask) { -+ if (mask & val) -+ printk("1"); -+ else -+ printk("0"); -+ mask /= 2; -+ } -+ printk("\n"); -+ -+ } -+ -+ -+/** -+ * ext3cow_block_to_path - parse the block number into array of offsets -+ * @inode: inode in question (we are only interested in its superblock) -+ * @i_block: block number to be parsed -+ * @offsets: array to store the offsets in -+ * @boundary: set this non-zero if the referred-to block is likely to be -+ * followed (on disk) by an indirect block. -+ * -+ * To store the locations of file's data ext3cow uses a data structure common -+ * for UNIX filesystems - tree of pointers anchored in the inode, with -+ * data blocks at leaves and indirect blocks in intermediate nodes. -+ * This function translates the block number into path in that tree - -+ * return value is the path length and @offsets[n] is the offset of -+ * pointer to (n+1)th node in the nth one. If @block is out of range -+ * (negative or too large) warning is printed and zero returned. -+ * -+ * Note: function doesn't find node addresses, so no IO is needed. All -+ * we need to know is the capacity of indirect blocks (taken from the -+ * inode->i_sb). -+ */ -+ -+/* -+ * Portability note: the last comparison (check that we fit into triple -+ * indirect block) is spelled differently, because otherwise on an -+ * architecture with 32-bit longs and 8Kb pages we might get into trouble -+ * if our filesystem had 8Kb blocks. We might use long long, but that would -+ * kill us on x86. Oh, well, at least the sign propagation does not matter - -+ * i_block would have to be negative in the very beginning, so we would not -+ * get there at all. 
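To make the tree walk above concrete, the sketch below maps a few logical block numbers onto direct / indirect / double-indirect paths. The pointer count per indirect block is an assumption; in ext3cow it comes from EXT3COW_ADDR_PER_BLOCK(sb) and is smaller than blocksize/4, because each indirect block also stores COW bitmap words after the pointer array:

#include <stdio.h>

#define NDIR_BLOCKS 12        /* direct pointers held in the inode */
#define PTRS        992       /* assumed pointers per indirect block */

/* Same shape as ext3cow_block_to_path(): fills offsets[], returns the path depth. */
static int block_to_path(long i_block, int offsets[4])
{
    int n = 0;

    if (i_block < NDIR_BLOCKS) {
        offsets[n++] = i_block;
    } else if ((i_block -= NDIR_BLOCKS) < PTRS) {
        offsets[n++] = NDIR_BLOCKS;               /* the EXT3COW_IND_BLOCK slot */
        offsets[n++] = i_block;
    } else if ((i_block -= PTRS) < (long)PTRS * PTRS) {
        offsets[n++] = NDIR_BLOCKS + 1;           /* the EXT3COW_DIND_BLOCK slot */
        offsets[n++] = i_block / PTRS;
        offsets[n++] = i_block % PTRS;
    } else {
        return 0;                                 /* triple indirect left out of the sketch */
    }
    return n;
}

int main(void)
{
    long samples[] = { 5, 500, 5000 };
    int i, j, offsets[4];

    for (i = 0; i < 3; i++) {
        int depth = block_to_path(samples[i], offsets);
        printf("block %ld -> depth %d, offsets:", samples[i], depth);
        for (j = 0; j < depth; j++)
            printf(" %d", offsets[j]);
        printf("\n");
    }
    return 0;
}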
-+ */ -+ -+static int ext3cow_block_to_path(struct inode *inode, -+ long i_block, int offsets[4], int *boundary) -+{ -+ /* TODO: Check for efficientcy -znjp */ -+ int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); -+ const long direct_blocks = EXT3COW_NDIR_BLOCKS, -+ indirect_blocks = ptrs, -+ double_blocks = (ptrs * ptrs); -+ //double_blocks = (1 << (ptrs_bits * 2)); -+ int n = 0; -+ int final = 0; -+ -+ if (i_block < 0) { -+ ext3cow_warning (inode->i_sb, "ext3cow_block_to_path", "block < 0"); -+ } else if (i_block < direct_blocks) { -+ offsets[n++] = i_block; -+ final = direct_blocks; -+ } else if ( (i_block -= direct_blocks) < indirect_blocks) { -+ offsets[n++] = EXT3COW_IND_BLOCK; -+ offsets[n++] = i_block; -+ final = ptrs; -+ } else if ((i_block -= indirect_blocks) < double_blocks) { -+ offsets[n++] = EXT3COW_DIND_BLOCK; -+ offsets[n++] = (i_block/ptrs); //i_block >> ptrs_bits; -+ offsets[n++] = (i_block%ptrs); //i_block & (ptrs - 1); -+ final = ptrs; -+ } else if (((i_block -= double_blocks)/(double_blocks)) < ptrs) { -+ // } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { -+ offsets[n++] = EXT3COW_TIND_BLOCK; -+ offsets[n++] = (i_block/double_blocks); //i_block >> (ptrs_bits * 2); -+ offsets[n++] = (i_block/double_blocks)%ptrs; //(i_block >> ptrs_bits) & (ptrs - 1); -+ offsets[n++] = i_block%ptrs; //i_block & (ptrs - 1); -+ final = ptrs; -+ } else { -+ ext3cow_warning(inode->i_sb, "ext3cow_block_to_path", "block > big"); -+ } -+ if (boundary) -+ *boundary = final - 1 - (i_block & (ptrs - 1)); -+ return n; -+} -+ -+/** -+ * ext3cow_get_branch - read the chain of indirect blocks leading to data -+ * @inode: inode in question -+ * @depth: depth of the chain (1 - direct pointer, etc.) -+ * @offsets: offsets of pointers in inode/indirect blocks -+ * @chain: place to store the result -+ * @err: here we store the error value -+ * -+ * Function fills the array of triples and returns %NULL -+ * if everything went OK or the pointer to the last filled triple -+ * (incomplete one) otherwise. Upon the return chain[i].key contains -+ * the number of (i+1)-th block in the chain (as it is stored in memory, -+ * i.e. little-endian 32-bit), chain[i].p contains the address of that -+ * number (it points into struct inode for i==0 and into the bh->b_data -+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect -+ * block for i>0 and NULL for i==0. In other words, it holds the block -+ * numbers of the chain, addresses they were taken from (and where we can -+ * verify that chain did not change) and buffer_heads hosting these -+ * numbers. -+ * -+ * Function stops when it stumbles upon zero pointer (absent block) -+ * (pointer to last triple returned, *@err == 0) -+ * or when it gets an IO error reading an indirect block -+ * (ditto, *@err == -EIO) -+ * or when it notices that chain had been changed while it was reading -+ * (ditto, *@err == -EAGAIN) -+ * or when it reads all @depth-1 indirect blocks successfully and finds -+ * the whole chain, all way to the data (returns %NULL, *err == 0). -+ * If this is COW we set the cow field to 1. We know if it's COW -+ * because there will already be a key. We need this field so we -+ * zero out the data already in the buffer. -+ * The create flag let's us know if were just looking for a block -+ * to read, or a block to write. We only set the bitmap when -+ * we're looking for a block to write, either on new allocation -+ * or on COWing. 
-znjp -+ */ -+static Indirect *ext3cow_get_branch(struct inode *inode, int depth, -+ int *offsets, -+ Indirect chain[4], int *err, int *cow, -+ int create) -+{ -+ struct super_block *sb = inode->i_sb; -+ Indirect *p = chain; -+ struct buffer_head *bh = NULL; -+ u32 *bitmap_w = NULL; -+ int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); -+ int nbitsperword = (sizeof(u32) * 8); -+ -+ *err = 0; -+ *cow = 0; -+ -+ -+ /* i_data is not going away, no lock needed */ -+ add_chain (chain, NULL, EXT3COW_I(inode)->i_data + *offsets); -+ if (!p->key){ -+ /* Set the bitmap on allocation - znjp */ -+ if(create){ -+ EXT3COW_I(inode)->i_cow_bitmap |= (1UL << *offsets); -+ } -+ goto no_block; -+ } -+ -+ /* Are we writing and COWing any direct blocks? -znjp */ -+ if(create && !(EXT3COW_I(inode)->i_cow_bitmap & (1UL << *offsets))){ -+ //printk(KERN_INFO "COWing direct block\n"); -+ *(p->p) = 0; -+ p->key = 0; -+ /* Set the bitamp when COWing -znjp */ -+ EXT3COW_I(inode)->i_cow_bitmap |= (1UL << *offsets); -+ *cow = 1; -+ goto no_block; -+ } -+ -+ while (--depth) { -+ -+ bh = sb_bread(sb, le32_to_cpu(p->key)); -+ if (!bh) -+ goto failure; -+ -+ /* Reader: pointers */ -+ if (!verify_chain(chain, p)) -+ goto changed; -+ add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); -+ /* Reader: end */ -+ /* Find correct bitmap word */ -+ bitmap_w = (u32*)bh->b_data + ptrs + (*offsets/nbitsperword); -+ if (!p->key){ -+ /* Set the bitmap when allocating -znjp */ -+ if(create){ -+ *bitmap_w |= (u32)(1UL << (int)(*offsets%nbitsperword)); -+ } -+ goto no_block; -+ } -+ -+ /* Are we COWing any indirect blocks? -znjp */ -+ if(create && !(*bitmap_w & (1UL << (int)(*offsets%nbitsperword)))){ -+ //printk(KERN_INFO "COWing indirect block\n"); -+ *(p->p) = 0; -+ p->key = 0; -+ /* Set the bitmap -znjp */ -+ *bitmap_w |= (u32)(1UL << (int)(*offsets%nbitsperword)); -+ *cow = 1; -+ goto no_block; -+ } -+ } -+ return NULL; -+ -+changed: -+ brelse(bh); -+ *err = -EAGAIN; -+ goto no_block; -+failure: -+ *err = -EIO; -+no_block: -+ return p; -+} -+ -+/** -+ * ext3cow_find_near - find a place for allocation with sufficient locality -+ * @inode: owner -+ * @ind: descriptor of indirect block. -+ * -+ * This function returns the prefered place for block allocation. -+ * It is used when heuristic for sequential allocation fails. -+ * Rules are: -+ * + if there is a block to the left of our position - allocate near it. -+ * + if pointer will live in indirect block - allocate near that block. -+ * + if pointer will live in inode - allocate in the same -+ * cylinder group. -+ * -+ * In the latter case we colour the starting block by the callers PID to -+ * prevent it from clashing with concurrent allocations for a different inode -+ * in the same block group. The PID is used here so that functionally related -+ * files will be close-by on-disk. -+ * -+ * Caller must make sure that @ind is valid and will stay that way. -+ */ -+static ext3cow_fsblk_t ext3cow_find_near(struct inode *inode, Indirect *ind) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; -+ __le32 *p; -+ ext3cow_fsblk_t bg_start; -+ ext3cow_grpblk_t colour; -+ -+ /* Try to find previous block */ -+ for (p = ind->p - 1; p >= start; p--) { -+ if (*p) -+ return le32_to_cpu(*p); -+ } -+ -+ /* No such thing, so let's try location of indirect block */ -+ if (ind->bh) -+ return ind->bh->b_blocknr; -+ -+ /* -+ * It is going to be referred to from the inode itself? OK, just put it -+ * into the same cylinder group then. 
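The per-entry COW bits read and set above live in the indirect block itself, after the block-pointer array. A small sketch of the word/bit arithmetic; the pointer count is an assumption standing in for EXT3COW_ADDR_PER_BLOCK(sb):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int ptrs = 992;                  /* assumed pointers per indirect block */
    int nbitsperword = sizeof(uint32_t) * 8;
    int offset = 500;                /* index of the entry being written */

    /* The bitmap words start right after the ptrs pointers in b_data. */
    int word_index = ptrs + offset / nbitsperword;
    uint32_t mask  = (uint32_t)1 << (offset % nbitsperword);

    printf("entry %d -> 32-bit word %d (counting from b_data), mask 0x%08x\n",
           offset, word_index, mask);
    /* Testing that bit answers: was this entry written in the current epoch?
     * A clear bit means the pointer is inherited from an older version and must be COWed. */
    return 0;
}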
-+ */ -+ bg_start = ext3cow_group_first_block_no(inode->i_sb, ei->i_block_group); -+ colour = (current->pid % 16) * -+ (EXT3COW_BLOCKS_PER_GROUP(inode->i_sb) / 16); -+ return bg_start + colour; -+} -+ -+/** -+ * ext3cow_find_goal - find a prefered place for allocation. -+ * @inode: owner -+ * @block: block we want -+ * @chain: chain of indirect blocks -+ * @partial: pointer to the last triple within a chain -+ * @goal: place to store the result. -+ * -+ * Normally this function find the prefered place for block allocation, -+ * stores it in *@goal and returns zero. -+ */ -+ -+static ext3cow_fsblk_t ext3cow_find_goal(struct inode *inode, long block, -+ Indirect chain[4], Indirect *partial) -+{ -+ struct ext3cow_block_alloc_info *block_i; -+ -+ block_i = EXT3COW_I(inode)->i_block_alloc_info; -+ -+ /* -+ * try the heuristic for sequential allocation, -+ * failing that at least try to get decent locality. -+ */ -+ if (block_i && (block == block_i->last_alloc_logical_block + 1) -+ && (block_i->last_alloc_physical_block != 0)) { -+ return block_i->last_alloc_physical_block + 1; -+ } -+ -+ return ext3cow_find_near(inode, partial); -+} -+ -+/** -+ * ext3cow_blks_to_allocate: Look up the block map and count the number -+ * of direct blocks need to be allocated for the given branch. -+ * -+ * @branch: chain of indirect blocks -+ * @k: number of blocks need for indirect blocks -+ * @blks: number of data blocks to be mapped. -+ * @blocks_to_boundary: the offset in the indirect block -+ * -+ * return the total number of blocks to be allocate, including the -+ * direct and indirect blocks. -+ */ -+static int ext3cow_blks_to_allocate(Indirect *branch, int k, unsigned long blks, -+ int blocks_to_boundary) -+{ -+ unsigned long count = 0; -+ -+ /* -+ * Simple case, [t,d]Indirect block(s) has not allocated yet -+ * then it's clear blocks on that path have not allocated -+ */ -+ if (k > 0) { -+ /* right now we don't handle cross boundary allocation */ -+ if (blks < blocks_to_boundary + 1) -+ count += blks; -+ else -+ count += blocks_to_boundary + 1; -+ return count; -+ } -+ -+ count++; -+ while (count < blks && count <= blocks_to_boundary && -+ le32_to_cpu(*(branch[0].p + count)) == 0) { -+ count++; -+ } -+ return count; -+} -+ -+/** -+ * ext3cow_alloc_blocks: multiple allocate blocks needed for a branch -+ * @indirect_blks: the number of blocks need to allocate for indirect -+ * blocks -+ * -+ * @new_blocks: on return it will store the new block numbers for -+ * the indirect blocks(if needed) and the first direct block, -+ * @blks: on return it will store the total number of allocated -+ * direct blocks -+ */ -+static int ext3cow_alloc_blocks(handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t goal, int indirect_blks, int blks, -+ ext3cow_fsblk_t new_blocks[4], int *err) -+{ -+ int target, i; -+ unsigned long count = 0; -+ int index = 0; -+ ext3cow_fsblk_t current_block = 0; -+ int ret = 0; -+ -+ /* -+ * Here we try to allocate the requested multiple blocks at once, -+ * on a best-effort basis. -+ * To build a branch, we should allocate blocks for -+ * the indirect blocks(if not allocated yet), and at least -+ * the first direct block of this branch. 
That's the -+ * minimum number of blocks need to allocate(required) -+ */ -+ target = blks + indirect_blks; -+ -+ while (1) { -+ count = target; -+ /* allocating blocks for indirect blocks and direct blocks */ -+ current_block = ext3cow_new_blocks(handle,inode,goal,&count,err); -+ if (*err) -+ goto failed_out; -+ -+ target -= count; -+ /* allocate blocks for indirect blocks */ -+ while (index < indirect_blks && count) { -+ new_blocks[index++] = current_block++; -+ count--; -+ } -+ -+ if (count > 0) -+ break; -+ } -+ -+ /* save the new block number for the first direct block */ -+ new_blocks[index] = current_block; -+ -+ /* total number of blocks allocated for direct blocks */ -+ ret = count; -+ *err = 0; -+ return ret; -+failed_out: -+ for (i = 0; i key). Upon the exit we have the same -+ * picture as after the successful ext3cow_get_block(), except that in one -+ * place chain is disconnected - *branch->p is still zero (we did not -+ * set the last link), but branch->key contains the number that should -+ * be placed into *branch->p to fill that gap. -+ * -+ * If allocation fails we free all blocks we've allocated (and forget -+ * their buffer_heads) and return the error value the from failed -+ * ext3cow_alloc_block() (normally -ENOSPC). Otherwise we set the chain -+ * as described above and return 0. -+ */ -+static int ext3cow_alloc_branch(handle_t *handle, struct inode *inode, -+ int indirect_blks, int *blks, ext3cow_fsblk_t goal, -+ int *offsets, Indirect *branch) -+{ -+ int blocksize = inode->i_sb->s_blocksize; -+ int i, n = 0; -+ int err = 0; -+ struct buffer_head *bh; -+ int num; -+ ext3cow_fsblk_t new_blocks[4]; -+ ext3cow_fsblk_t current_block; -+ -+ u32 *bitmap_w = NULL; -+ int ptrs = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); -+ int nbitsperword = (sizeof(u32) * 8); -+ -+ num = ext3cow_alloc_blocks(handle, inode, goal, indirect_blks, -+ *blks, new_blocks, &err); -+ if (err) -+ return err; -+ -+ branch[0].key = cpu_to_le32(new_blocks[0]); -+ /* -+ * metadata blocks and data blocks are allocated. -+ */ -+ -+ for (n = 1; n <= indirect_blks; n++) { -+ /* -+ * Get buffer_head for parent block, zero it out -+ * and set the pointer to new one, then send -+ * parent to disk. -+ */ -+ bh = sb_getblk(inode->i_sb, new_blocks[n-1]); -+ branch[n].bh = bh; -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ err = ext3cow_journal_get_create_access(handle, bh); -+ if (err) { -+ unlock_buffer(bh); -+ brelse(bh); -+ goto failed; -+ } -+ -+ memset(bh->b_data, 0, blocksize); -+ /* Mark the cow bitmap for each new indirect block allocated. -+ * We had to put this here, because get_branch was insufficient -+ * when allocating an indirect block. 
-znjp -+ */ -+ bitmap_w = (u32*)bh->b_data + ptrs + (offsets[n]/nbitsperword); -+ *bitmap_w |= (u32)(1UL << (int)(offsets[n]%nbitsperword)); -+ -+ branch[n].p = (__le32 *) bh->b_data + offsets[n]; -+ branch[n].key = cpu_to_le32(new_blocks[n]); -+ *branch[n].p = branch[n].key; -+ if ( n == indirect_blks) { -+ current_block = new_blocks[n]; -+ /* -+ * End of chain, update the last new metablock of -+ * the chain to point to the new allocated -+ * data blocks numbers -+ */ -+ for (i=1; i < num; i++) -+ *(branch[n].p + i) = cpu_to_le32(++current_block); -+ } -+ BUFFER_TRACE(bh, "marking uptodate"); -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bh); -+ if (err) -+ goto failed; -+ } -+ *blks = num; -+ return err; -+failed: -+ /* Allocation failed, free what we already allocated */ -+ for (i = 1; i <= n ; i++) { -+ BUFFER_TRACE(branch[i].bh, "call journal_forget"); -+ ext3cow_journal_forget(handle, branch[i].bh); -+ } -+ for (i = 0; i i_blocks, etc.). In case of success we end up with the full -+ * chain to new block and return 0. -+ */ -+static int ext3cow_splice_branch(handle_t *handle, struct inode *inode, -+ long block, Indirect *where, int num, int blks) -+{ -+ int i; -+ int err = 0; -+ struct ext3cow_block_alloc_info *block_i; -+ ext3cow_fsblk_t current_block; -+ -+ block_i = EXT3COW_I(inode)->i_block_alloc_info; -+ /* -+ * If we're splicing into a [td]indirect block (as opposed to the -+ * inode) then we need to get write access to the [td]indirect block -+ * before the splice. -+ */ -+ if (where->bh) { -+ BUFFER_TRACE(where->bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, where->bh); -+ if (err) -+ goto err_out; -+ } -+ /* That's it */ -+ -+ *where->p = where->key; -+ -+ /* -+ * Update the host buffer_head or inode to point to more just allocated -+ * direct blocks blocks -+ */ -+ if (num == 0 && blks > 1) { -+ current_block = le32_to_cpu(where->key) + 1; -+ for (i = 1; i < blks; i++) -+ *(where->p + i ) = cpu_to_le32(current_block++); -+ } -+ -+ /* -+ * update the most recently allocated logical & physical block -+ * in i_block_alloc_info, to assist find the proper goal block for next -+ * allocation -+ */ -+ if (block_i) { -+ block_i->last_alloc_logical_block = block + blks - 1; -+ block_i->last_alloc_physical_block = -+ le32_to_cpu(where[num].key) + blks - 1; -+ } -+ -+ /* We are done with atomic stuff, now do the rest of housekeeping */ -+ -+ inode->i_ctime = CURRENT_TIME_SEC; -+ ext3cow_mark_inode_dirty(handle, inode); -+ -+ /* had we spliced it onto indirect block? */ -+ if (where->bh) { -+ /* -+ * If we spliced it onto an indirect block, we haven't -+ * altered the inode. Note however that if it is being spliced -+ * onto an indirect block at the very end of the file (the -+ * file is growing) then we *will* alter the inode to reflect -+ * the new i_size. But that is not done here - it is done in -+ * generic_commit_write->__mark_inode_dirty->ext3cow_dirty_inode. -+ */ -+ jbd_debug(5, "splicing indirect only\n"); -+ BUFFER_TRACE(where->bh, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, where->bh); -+ if (err) -+ goto err_out; -+ } else { -+ /* -+ * OK, we spliced it into the inode itself on a direct block. -+ * Inode was dirtied above. 
-+ */ -+ jbd_debug(5, "splicing direct\n"); -+ } -+ return err; -+ -+err_out: -+ for (i = 1; i <= num; i++) { -+ BUFFER_TRACE(where[i].bh, "call journal_forget"); -+ ext3cow_journal_forget(handle, where[i].bh); -+ ext3cow_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); -+ } -+ ext3cow_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); -+ -+ return err; -+} -+ -+/* -+ * Allocation strategy is simple: if we have to allocate something, we will -+ * have to go the whole way to leaf. So let's do it before attaching anything -+ * to tree, set linkage between the newborn blocks, write them if sync is -+ * required, recheck the path, free and repeat if check fails, otherwise -+ * set the last missing link (that will protect us from any truncate-generated -+ * removals - all blocks on the path are immune now) and possibly force the -+ * write on the parent block. -+ * That has a nice additional property: no special recovery from the failed -+ * allocations is needed - we simply release blocks and do not touch anything -+ * reachable from inode. -+ * -+ * `handle' can be NULL if create == 0. -+ * -+ * The BKL may not be held on entry here. Be sure to take it early. -+ * return > 0, # of blocks mapped or allocated. -+ * return = 0, if plain lookup failed. -+ * return < 0, error case. -+ */ -+int ext3cow_get_blocks_handle(handle_t *handle, struct inode *inode, -+ sector_t iblock, unsigned long maxblocks, -+ struct buffer_head *bh_result, -+ int create, int extend_disksize) -+{ -+ int err = -EIO; -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ ext3cow_fsblk_t goal; -+ int indirect_blks; -+ int blocks_to_boundary = 0; -+ int depth; -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ int count = 0; -+ ext3cow_fsblk_t first_block = 0; -+ int cow = 0; /* To determine wether we clear the buffer of not -znjp */ -+ -+ -+ J_ASSERT(handle != NULL || create == 0); -+ depth = ext3cow_block_to_path(inode,iblock,offsets,&blocks_to_boundary); -+ -+ if (depth == 0) -+ goto out; -+ -+ partial = ext3cow_get_branch(inode, depth, offsets, -+ chain, &err, &cow, create); -+ -+ /* Simplest case - block found, no allocation needed */ -+ if (!partial) { -+ first_block = le32_to_cpu(chain[depth - 1].key); -+ if(!cow) /* Don't clear the buffer if it's a COW allocation -znjp */ -+ clear_buffer_new(bh_result); -+ count++; -+ /*map more blocks*/ -+ while (count < maxblocks && count <= blocks_to_boundary) { -+ ext3cow_fsblk_t blk; -+ -+ if (!verify_chain(chain, partial)) { -+ /* -+ * Indirect block might be removed by -+ * truncate while we were reading it. -+ * Handling of that case: forget what we've -+ * got now. Flag the err as EAGAIN, so it -+ * will reread. -+ */ -+ err = -EAGAIN; -+ count = 0; -+ break; -+ } -+ blk = le32_to_cpu(*(chain[depth-1].p + count)); -+ -+ if (blk == first_block + count) -+ count++; -+ else -+ break; -+ } -+ if (err != -EAGAIN) -+ goto got_it; -+ } -+ -+ /* Next simple case - plain lookup or failed read of indirect block */ -+ if (!create || err == -EIO) -+ goto cleanup; -+ -+ mutex_lock(&ei->truncate_mutex); -+ -+ /* -+ * If the indirect block is missing while we are reading -+ * the chain(ext3cow_get_branch() returns -EAGAIN err), or -+ * if the chain has been changed after we grab the semaphore, -+ * (either because another process truncated this branch, or -+ * another get_block allocated this branch) re-grab the chain to see if -+ * the request block has been allocated or not. 
-+ * -+ * Since we already block the truncate/other get_block -+ * at this point, we will have the current copy of the chain when we -+ * splice the branch into the tree. -+ */ -+ if (err == -EAGAIN || !verify_chain(chain, partial)) { -+ while (partial > chain) { -+ brelse(partial->bh); -+ partial--; -+ } -+ partial = ext3cow_get_branch(inode, depth, offsets, -+ chain, &err, &cow, create); -+ if (!partial) { -+ count++; -+ mutex_unlock(&ei->truncate_mutex); -+ if (err) -+ goto cleanup; -+ /* Don't clear the buffer if we're COWing it -znjp */ -+ if(!cow) -+ clear_buffer_new(bh_result); -+ goto got_it; -+ } -+ } -+ -+ /* -+ * Okay, we need to do block allocation. Lazily initialize the block -+ * allocation info here if necessary -+ */ -+ if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) -+ ext3cow_init_block_alloc_info(inode); -+ -+ goal = ext3cow_find_goal(inode, iblock, chain, partial); -+ -+ /* the number of blocks need to allocate for [d,t]indirect blocks */ -+ indirect_blks = (chain + depth) - partial - 1; -+ -+ /* -+ * Next look up the indirect map to count the totoal number of -+ * direct blocks to allocate for this branch. -+ */ -+ count = ext3cow_blks_to_allocate(partial, indirect_blks, -+ maxblocks, blocks_to_boundary); -+ /* -+ * Block out ext3cow_truncate while we alter the tree -+ */ -+ err = ext3cow_alloc_branch(handle, inode, indirect_blks, &count, goal, -+ offsets + (partial - chain), partial); -+ -+ /* -+ * The ext3cow_splice_branch call will free and forget any buffers -+ * on the new chain if there is a failure, but that risks using -+ * up transaction credits, especially for bitmaps where the -+ * credits cannot be returned. Can we handle this somehow? We -+ * may need to return -EAGAIN upwards in the worst case. --sct -+ */ -+ if (!err) -+ err = ext3cow_splice_branch(handle, inode, iblock, -+ partial, indirect_blks, count); -+ /* -+ * i_disksize growing is protected by truncate_mutex. Don't forget to -+ * protect it if you're about to implement concurrent -+ * ext3cow_get_block() -bzzz -+ */ -+ if (!err && extend_disksize && inode->i_size > ei->i_disksize) -+ ei->i_disksize = inode->i_size; -+ mutex_unlock(&ei->truncate_mutex); -+ if (err) -+ goto cleanup; -+ -+ set_buffer_new(bh_result); -+got_it: -+ map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); -+ if (count > blocks_to_boundary) -+ set_buffer_boundary(bh_result); -+ err = count; -+ /* Clean up and exit */ -+ partial = chain + depth - 1; /* the whole chain */ -+cleanup: -+ while (partial > chain) { -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse(partial->bh); -+ partial--; -+ } -+ BUFFER_TRACE(bh_result, "returned"); -+out: -+ return err; -+} -+ -+#define DIO_CREDITS (EXT3COW_RESERVE_TRANS_BLOCKS + 32) -+ -+static int ext3cow_get_block(struct inode *inode, sector_t iblock, -+ struct buffer_head *bh_result, int create) -+{ -+ handle_t *handle = journal_current_handle(); -+ int ret = 0; -+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; -+ -+ if (!create) -+ goto get_block; /* A read */ -+ -+ if (max_blocks == 1) -+ goto get_block; /* A single block get */ -+ -+ if (handle->h_transaction->t_state == T_LOCKED) { -+ /* -+ * Huge direct-io writes can hold off commits for long -+ * periods of time. Let this commit run. 
-+ */ -+ ext3cow_journal_stop(handle); -+ handle = ext3cow_journal_start(inode, DIO_CREDITS); -+ if (IS_ERR(handle)) -+ ret = PTR_ERR(handle); -+ goto get_block; -+ } -+ -+ if (handle->h_buffer_credits <= EXT3COW_RESERVE_TRANS_BLOCKS) { -+ /* -+ * Getting low on buffer credits... -+ */ -+ ret = ext3cow_journal_extend(handle, DIO_CREDITS); -+ if (ret > 0) { -+ /* -+ * Couldn't extend the transaction. Start a new one. -+ */ -+ ret = ext3cow_journal_restart(handle, DIO_CREDITS); -+ } -+ } -+ -+get_block: -+ if (ret == 0) { -+ ret = ext3cow_get_blocks_handle(handle, inode, iblock, -+ max_blocks, bh_result, create, 0); -+ if (ret > 0) { -+ bh_result->b_size = (ret << inode->i_blkbits); -+ ret = 0; -+ } -+ } -+ return ret; -+} -+ -+/* -+ * `handle' can be NULL if create is zero -+ */ -+struct buffer_head *ext3cow_getblk(handle_t *handle, struct inode *inode, -+ long block, int create, int *errp) -+{ -+ struct buffer_head dummy; -+ int fatal = 0, err; -+ -+ J_ASSERT(handle != NULL || create == 0); -+ -+ dummy.b_state = 0; -+ dummy.b_blocknr = -1000; -+ buffer_trace_init(&dummy.b_history); -+ err = ext3cow_get_blocks_handle(handle, inode, block, 1, -+ &dummy, create, 1); -+ /* -+ * ext3cow_get_blocks_handle() returns number of blocks -+ * mapped. 0 in case of a HOLE. -+ */ -+ if (err > 0) { -+ if (err > 1) -+ WARN_ON(1); -+ err = 0; -+ } -+ *errp = err; -+ if (!err && buffer_mapped(&dummy)) { -+ struct buffer_head *bh; -+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr); -+ if (!bh) { -+ *errp = -EIO; -+ goto err; -+ } -+ if (buffer_new(&dummy)) { -+ J_ASSERT(create != 0); -+ J_ASSERT(handle != 0); -+ -+ /* -+ * Now that we do not always journal data, we should -+ * keep in mind whether this should always journal the -+ * new buffer as metadata. For now, regular file -+ * writes use ext3cow_get_block instead, so it's not a -+ * problem. 
-+ */ -+ lock_buffer(bh); -+ BUFFER_TRACE(bh, "call get_create_access"); -+ fatal = ext3cow_journal_get_create_access(handle, bh); -+ if (!fatal && !buffer_uptodate(bh)) { -+ memset(bh->b_data,0,inode->i_sb->s_blocksize); -+ set_buffer_uptodate(bh); -+ } -+ unlock_buffer(bh); -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bh); -+ if (!fatal) -+ fatal = err; -+ } else { -+ BUFFER_TRACE(bh, "not a new buffer"); -+ } -+ if (fatal) { -+ *errp = fatal; -+ brelse(bh); -+ bh = NULL; -+ } -+ return bh; -+ } -+err: -+ return NULL; -+} -+ -+struct buffer_head *ext3cow_bread(handle_t *handle, struct inode *inode, -+ int block, int create, int *err) -+{ -+ struct buffer_head * bh; -+ -+ bh = ext3cow_getblk(handle, inode, block, create, err); -+ if (!bh) -+ return bh; -+ if (buffer_uptodate(bh)) -+ return bh; -+ ll_rw_block(READ_META, 1, &bh); -+ wait_on_buffer(bh); -+ if (buffer_uptodate(bh)) -+ return bh; -+ put_bh(bh); -+ *err = -EIO; -+ return NULL; -+} -+ -+static int walk_page_buffers( handle_t *handle, -+ struct buffer_head *head, -+ unsigned from, -+ unsigned to, -+ int *partial, -+ int (*fn)( handle_t *handle, -+ struct buffer_head *bh)) -+{ -+ struct buffer_head *bh; -+ unsigned block_start, block_end; -+ unsigned blocksize = head->b_size; -+ int err, ret = 0; -+ struct buffer_head *next; -+ -+ for ( bh = head, block_start = 0; -+ ret == 0 && (bh != head || !block_start); -+ block_start = block_end, bh = next) -+ { -+ next = bh->b_this_page; -+ block_end = block_start + blocksize; -+ if (block_end <= from || block_start >= to) { -+ if (partial && !buffer_uptodate(bh)) -+ *partial = 1; -+ continue; -+ } -+ err = (*fn)(handle, bh); -+ if (!ret) -+ ret = err; -+ } -+ return ret; -+} -+ -+/* -+ * To preserve ordering, it is essential that the hole instantiation and -+ * the data write be encapsulated in a single transaction. We cannot -+ * close off a transaction and start a new one between the ext3cow_get_block() -+ * and the commit_write(). So doing the journal_start at the start of -+ * prepare_write() is the right place. -+ * -+ * Also, this function can nest inside ext3cow_writepage() -> -+ * block_write_full_page(). In that case, we *know* that ext3cow_writepage() -+ * has generated enough buffer credits to do the whole page. So we won't -+ * block on the journal in that case, which is good, because the caller may -+ * be PF_MEMALLOC. -+ * -+ * By accident, ext3cow can be reentered when a transaction is open via -+ * quota file writes. If we were to commit the transaction while thus -+ * reentered, there can be a deadlock - we would be holding a quota -+ * lock, and the commit would never complete if another thread had a -+ * transaction open and was blocking on the quota lock - a ranking -+ * violation. -+ * -+ * So what we do is to rely on the fact that journal_stop/journal_start -+ * will _not_ run commit under these circumstances because handle->h_ref -+ * is elevated. We'll still have enough credits for the tiny quotafile -+ * write. -+ */ -+static int do_journal_get_write_access(handle_t *handle, -+ struct buffer_head *bh) -+{ -+ if (!buffer_mapped(bh) || buffer_freed(bh)) -+ return 0; -+ return ext3cow_journal_get_write_access(handle, bh); -+} -+ -+/* -+ * The idea of this helper function is following: -+ * if prepare_write has allocated some blocks, but not all of them, the -+ * transaction must include the content of the newly allocated blocks. -+ * This content is expected to be set to zeroes by block_prepare_write(). 
-+ * 2006/10/14 SAW -+ */ -+static int ext3cow_prepare_failure(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ struct address_space *mapping; -+ struct buffer_head *bh, *head, *next; -+ unsigned block_start, block_end; -+ unsigned blocksize; -+ int ret; -+ handle_t *handle = ext3cow_journal_current_handle(); -+ -+ mapping = page->mapping; -+ if (ext3cow_should_writeback_data(mapping->host)) { -+ /* optimization: no constraints about data */ -+skip: -+ return ext3cow_journal_stop(handle); -+ } -+ -+ head = page_buffers(page); -+ blocksize = head->b_size; -+ for ( bh = head, block_start = 0; -+ bh != head || !block_start; -+ block_start = block_end, bh = next) -+ { -+ next = bh->b_this_page; -+ block_end = block_start + blocksize; -+ if (block_end <= from) -+ continue; -+ if (block_start >= to) { -+ block_start = to; -+ break; -+ } -+ if (!buffer_mapped(bh)) -+ /* prepare_write failed on this bh */ -+ break; -+ if (ext3cow_should_journal_data(mapping->host)) { -+ ret = do_journal_get_write_access(handle, bh); -+ if (ret) { -+ ext3cow_journal_stop(handle); -+ return ret; -+ } -+ } -+ /* -+ * block_start here becomes the first block where the current iteration -+ * of prepare_write failed. -+ */ -+ } -+ if (block_start <= from) -+ goto skip; -+ -+ /* commit allocated and zeroed buffers */ -+ return mapping->a_ops->commit_write(file, page, from, block_start); -+} -+ -+/* Used to quickly unmap all buffers in a page for COWing -znjp */ -+static int ext3cow_clear_buffer_mapped(handle_t *handle, -+ struct buffer_head *bh) -+{ -+ clear_buffer_mapped(bh); -+ return 0; -+} -+ -+static int ext3cow_prepare_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ struct inode *inode = page->mapping->host; -+ int ret, ret2; -+ int needed_blocks = ext3cow_writepage_trans_blocks(inode); -+ handle_t *handle; -+ int retries = 0; -+ -+retry: -+ handle = ext3cow_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ /* Unset the BH_Mapped flag so get_block is always called -znjp */ -+ if(page_has_buffers(page)) -+ ret = walk_page_buffers(handle, page_buffers(page), -+ from, to, NULL, ext3cow_clear_buffer_mapped); -+ -+ if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) -+ ret = nobh_prepare_write(page, from, to, ext3cow_get_block); -+ else -+ ret = block_prepare_write(page, from, to, ext3cow_get_block); -+ if (ret) -+ goto failure; -+ -+ if (ext3cow_should_journal_data(inode)) { -+ ret = walk_page_buffers(handle, page_buffers(page), -+ from, to, NULL, do_journal_get_write_access); -+ if (ret) -+ /* fatal error, just put the handle and return */ -+ journal_stop(handle); -+ } -+ return ret; -+ -+failure: -+ ret2 = ext3cow_prepare_failure(file, page, from, to); -+ if (ret2 < 0) -+ return ret2; -+ if (ret == -ENOSPC && ext3cow_should_retry_alloc(inode->i_sb, &retries)) -+ goto retry; -+ /* retry number exceeded, or other error like -EDQUOT */ -+ return ret; -+} -+ -+int ext3cow_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -+{ -+ int err = journal_dirty_data(handle, bh); -+ if (err) -+ ext3cow_journal_abort_handle(__FUNCTION__, __FUNCTION__, -+ bh, handle,err); -+ return err; -+} -+ -+/* For commit_write() in data=journal mode */ -+static int commit_write_fn(handle_t *handle, struct buffer_head *bh) -+{ -+ if (!buffer_mapped(bh) || buffer_freed(bh)) -+ return 0; -+ set_buffer_uptodate(bh); -+ return ext3cow_journal_dirty_metadata(handle, bh); -+} -+ -+/* -+ * We need to pick up the new inode 
size which generic_commit_write gave us -+ * `file' can be NULL - eg, when called from page_symlink(). -+ * -+ * ext3cow never places buffers on inode->i_mapping->private_list. metadata -+ * buffers are managed internally. -+ */ -+static int ext3cow_ordered_commit_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ handle_t *handle = ext3cow_journal_current_handle(); -+ struct inode *inode = page->mapping->host; -+ int ret = 0, ret2; -+ -+ ret = walk_page_buffers(handle, page_buffers(page), -+ from, to, NULL, ext3cow_journal_dirty_data); -+ -+ if (ret == 0) { -+ /* -+ * generic_commit_write() will run mark_inode_dirty() if i_size -+ * changes. So let's piggyback the i_disksize mark_inode_dirty -+ * into that. -+ */ -+ loff_t new_i_size; -+ -+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; -+ if (new_i_size > EXT3COW_I(inode)->i_disksize) -+ EXT3COW_I(inode)->i_disksize = new_i_size; -+ ret = generic_commit_write(file, page, from, to); -+ } -+ ret2 = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+ -+static int ext3cow_writeback_commit_write(struct file *file, struct page *page, -+ unsigned from, unsigned to) -+{ -+ handle_t *handle = ext3cow_journal_current_handle(); -+ struct inode *inode = page->mapping->host; -+ int ret = 0, ret2; -+ loff_t new_i_size; -+ -+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; -+ if (new_i_size > EXT3COW_I(inode)->i_disksize) -+ EXT3COW_I(inode)->i_disksize = new_i_size; -+ -+ if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) -+ ret = nobh_commit_write(file, page, from, to); -+ else -+ ret = generic_commit_write(file, page, from, to); -+ -+ ret2 = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+ -+static int ext3cow_journalled_commit_write(struct file *file, -+ struct page *page, unsigned from, unsigned to) -+{ -+ handle_t *handle = ext3cow_journal_current_handle(); -+ struct inode *inode = page->mapping->host; -+ int ret = 0, ret2; -+ int partial = 0; -+ loff_t pos; -+ -+ /* -+ * Here we duplicate the generic_commit_write() functionality -+ */ -+ pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; -+ -+ ret = walk_page_buffers(handle, page_buffers(page), from, -+ to, &partial, commit_write_fn); -+ if (!partial) -+ SetPageUptodate(page); -+ if (pos > inode->i_size) -+ i_size_write(inode, pos); -+ EXT3COW_I(inode)->i_state |= EXT3COW_STATE_JDATA; -+ if (inode->i_size > EXT3COW_I(inode)->i_disksize) { -+ EXT3COW_I(inode)->i_disksize = inode->i_size; -+ ret2 = ext3cow_mark_inode_dirty(handle, inode); -+ if (!ret) -+ ret = ret2; -+ } -+ ret2 = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+ -+/* -+ * bmap() is special. It gets used by applications such as lilo and by -+ * the swapper to find the on-disk block of a specific piece of data. -+ * -+ * Naturally, this is dangerous if the block concerned is still in the -+ * journal. If somebody makes a swapfile on an ext3cow data-journaling -+ * filesystem and enables swap, then they may get a nasty shock when the -+ * data getting swapped to that swapfile suddenly gets overwritten by -+ * the original zero's written out previously to the journal and -+ * awaiting writeback in the kernel's buffer cache. -+ * -+ * So, if we see any bmap calls here on a modified, data-journaled file, -+ * take extra steps to flush any blocks which might be in the cache. 
-+ */ -+static sector_t ext3cow_bmap(struct address_space *mapping, sector_t block) -+{ -+ struct inode *inode = mapping->host; -+ journal_t *journal; -+ int err; -+ -+ if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_JDATA) { -+ /* -+ * This is a REALLY heavyweight approach, but the use of -+ * bmap on dirty files is expected to be extremely rare: -+ * only if we run lilo or swapon on a freshly made file -+ * do we expect this to happen. -+ * -+ * (bmap requires CAP_SYS_RAWIO so this does not -+ * represent an unprivileged user DOS attack --- we'd be -+ * in trouble if mortal users could trigger this path at -+ * will.) -+ * -+ * NB. EXT3COW_STATE_JDATA is not set on files other than -+ * regular files. If somebody wants to bmap a directory -+ * or symlink and gets confused because the buffer -+ * hasn't yet been flushed to disk, they deserve -+ * everything they get. -+ */ -+ -+ EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_JDATA; -+ journal = EXT3COW_JOURNAL(inode); -+ journal_lock_updates(journal); -+ err = journal_flush(journal); -+ journal_unlock_updates(journal); -+ -+ if (err) -+ return 0; -+ } -+ -+ return generic_block_bmap(mapping,block,ext3cow_get_block); -+} -+ -+static int bget_one(handle_t *handle, struct buffer_head *bh) -+{ -+ get_bh(bh); -+ return 0; -+} -+ -+static int bput_one(handle_t *handle, struct buffer_head *bh) -+{ -+ put_bh(bh); -+ return 0; -+} -+ -+static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -+{ -+ if (buffer_mapped(bh)) -+ return ext3cow_journal_dirty_data(handle, bh); -+ return 0; -+} -+ -+/* -+ * Note that we always start a transaction even if we're not journalling -+ * data. This is to preserve ordering: any hole instantiation within -+ * __block_write_full_page -> ext3cow_get_block() should be journalled -+ * along with the data so we don't crash and then get metadata which -+ * refers to old data. -+ * -+ * In all journalling modes block_write_full_page() will start the I/O. -+ * -+ * Problem: -+ * -+ * ext3cow_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> -+ * ext3cow_writepage() -+ * -+ * Similar for: -+ * -+ * ext3cow_file_write() -> generic_file_write() -> __alloc_pages() -> ... -+ * -+ * Same applies to ext3cow_get_block(). We will deadlock on various things like -+ * lock_journal and i_truncate_mutex. -+ * -+ * Setting PF_MEMALLOC here doesn't work - too many internal memory -+ * allocations fail. -+ * -+ * 16May01: If we're reentered then journal_current_handle() will be -+ * non-zero. We simply *return*. -+ * -+ * 1 July 2001: @@@ FIXME: -+ * In journalled data mode, a data buffer may be metadata against the -+ * current transaction. But the same file is part of a shared mapping -+ * and someone does a writepage() on it. -+ * -+ * We will move the buffer onto the async_data list, but *after* it has -+ * been dirtied. So there's a small window where we have dirty data on -+ * BJ_Metadata. -+ * -+ * Note that this only applies to the last partial page in the file. The -+ * bit which block_write_full_page() uses prepare/commit for. (That's -+ * broken code anyway: it's wrong for msync()). -+ * -+ * It's a rare case: affects the final partial page, for journalled data -+ * where the file is subject to bith write() and writepage() in the same -+ * transction. To fix it we'll need a custom block_write_full_page(). -+ * We'll probably need that anyway for journalling writepage() output. -+ * -+ * We don't honour synchronous mounts for writepage(). That would be -+ * disastrous. 
Any write() or metadata operation will sync the fs for -+ * us. -+ * -+ * AKPM2: if all the page's buffers are mapped to disk and !data=journal, -+ * we don't need to open a transaction here. -+ */ -+static int ext3cow_ordered_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct inode *inode = page->mapping->host; -+ struct buffer_head *page_bufs; -+ handle_t *handle = NULL; -+ int ret = 0; -+ int err; -+ -+ J_ASSERT(PageLocked(page)); -+ -+ /* -+ * We give up here if we're reentered, because it might be for a -+ * different filesystem. -+ */ -+ if (ext3cow_journal_current_handle()) -+ goto out_fail; -+ -+ handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); -+ -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out_fail; -+ } -+ -+ if (!page_has_buffers(page)) { -+ create_empty_buffers(page, inode->i_sb->s_blocksize, -+ (1 << BH_Dirty)|(1 << BH_Uptodate)); -+ } -+ page_bufs = page_buffers(page); -+ walk_page_buffers(handle, page_bufs, 0, -+ PAGE_CACHE_SIZE, NULL, bget_one); -+ -+ ret = block_write_full_page(page, ext3cow_get_block, wbc); -+ -+ /* -+ * The page can become unlocked at any point now, and -+ * truncate can then come in and change things. So we -+ * can't touch *page from now on. But *page_bufs is -+ * safe due to elevated refcount. -+ */ -+ -+ /* -+ * And attach them to the current transaction. But only if -+ * block_write_full_page() succeeded. Otherwise they are unmapped, -+ * and generally junk. -+ */ -+ if (ret == 0) { -+ err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, -+ NULL, journal_dirty_data_fn); -+ if (!ret) -+ ret = err; -+ } -+ walk_page_buffers(handle, page_bufs, 0, -+ PAGE_CACHE_SIZE, NULL, bput_one); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+ -+out_fail: -+ redirty_page_for_writepage(wbc, page); -+ unlock_page(page); -+ return ret; -+} -+ -+static int ext3cow_writeback_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct inode *inode = page->mapping->host; -+ handle_t *handle = NULL; -+ int ret = 0; -+ int err; -+ -+ if (ext3cow_journal_current_handle()) -+ goto out_fail; -+ -+ handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out_fail; -+ } -+ -+ if (test_opt(inode->i_sb, NOBH) && ext3cow_should_writeback_data(inode)) -+ ret = nobh_writepage(page, ext3cow_get_block, wbc); -+ else -+ ret = block_write_full_page(page, ext3cow_get_block, wbc); -+ -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+ -+out_fail: -+ redirty_page_for_writepage(wbc, page); -+ unlock_page(page); -+ return ret; -+} -+ -+static int ext3cow_journalled_writepage(struct page *page, -+ struct writeback_control *wbc) -+{ -+ struct inode *inode = page->mapping->host; -+ handle_t *handle = NULL; -+ int ret = 0; -+ int err; -+ -+ if (ext3cow_journal_current_handle()) -+ goto no_write; -+ -+ handle = ext3cow_journal_start(inode, ext3cow_writepage_trans_blocks(inode)); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto no_write; -+ } -+ -+ if (!page_has_buffers(page) || PageChecked(page)) { -+ /* -+ * It's mmapped pagecache. Add buffers and journal it. There -+ * doesn't seem much point in redirtying the page here. 
-+ */ -+ ClearPageChecked(page); -+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, -+ ext3cow_get_block); -+ if (ret != 0) { -+ ext3cow_journal_stop(handle); -+ goto out_unlock; -+ } -+ ret = walk_page_buffers(handle, page_buffers(page), 0, -+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); -+ -+ err = walk_page_buffers(handle, page_buffers(page), 0, -+ PAGE_CACHE_SIZE, NULL, commit_write_fn); -+ if (ret == 0) -+ ret = err; -+ EXT3COW_I(inode)->i_state |= EXT3COW_STATE_JDATA; -+ unlock_page(page); -+ } else { -+ /* -+ * It may be a page full of checkpoint-mode buffers. We don't -+ * really know unless we go poke around in the buffer_heads. -+ * But block_write_full_page will do the right thing. -+ */ -+ ret = block_write_full_page(page, ext3cow_get_block, wbc); -+ } -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+out: -+ return ret; -+ -+no_write: -+ redirty_page_for_writepage(wbc, page); -+out_unlock: -+ unlock_page(page); -+ goto out; -+} -+ -+static int ext3cow_readpage(struct file *file, struct page *page) -+{ -+ return mpage_readpage(page, ext3cow_get_block); -+} -+ -+static int -+ext3cow_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ return mpage_readpages(mapping, pages, nr_pages, ext3cow_get_block); -+} -+ -+static void ext3cow_invalidatepage(struct page *page, unsigned long offset) -+{ -+ journal_t *journal = EXT3COW_JOURNAL(page->mapping->host); -+ -+ /* -+ * If it's a full truncate we just forget about the pending dirtying -+ */ -+ if (offset == 0) -+ ClearPageChecked(page); -+ -+ journal_invalidatepage(journal, page, offset); -+} -+ -+static int ext3cow_releasepage(struct page *page, gfp_t wait) -+{ -+ journal_t *journal = EXT3COW_JOURNAL(page->mapping->host); -+ -+ WARN_ON(PageChecked(page)); -+ if (!page_has_buffers(page)) -+ return 0; -+ return journal_try_to_free_buffers(journal, page, wait); -+} -+ -+/* -+ * If the O_DIRECT write will extend the file then add this inode to the -+ * orphan list. So recovery will truncate it back to the original size -+ * if the machine crashes during the write. -+ * -+ * If the O_DIRECT write is intantiating holes inside i_size and the machine -+ * crashes then stale disk data _may_ be exposed inside the file. 
-+ */ -+static ssize_t ext3cow_direct_IO(int rw, struct kiocb *iocb, -+ const struct iovec *iov, loff_t offset, -+ unsigned long nr_segs) -+{ -+ struct file *file = iocb->ki_filp; -+ struct inode *inode = file->f_mapping->host; -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ handle_t *handle = NULL; -+ ssize_t ret; -+ int orphan = 0; -+ size_t count = iov_length(iov, nr_segs); -+ -+ if (rw == WRITE) { -+ loff_t final_size = offset + count; -+ -+ handle = ext3cow_journal_start(inode, DIO_CREDITS); -+ if (IS_ERR(handle)) { -+ ret = PTR_ERR(handle); -+ goto out; -+ } -+ if (final_size > inode->i_size) { -+ ret = ext3cow_orphan_add(handle, inode); -+ if (ret) -+ goto out_stop; -+ orphan = 1; -+ ei->i_disksize = inode->i_size; -+ } -+ } -+ -+ ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, -+ offset, nr_segs, -+ ext3cow_get_block, NULL); -+ -+ /* -+ * Reacquire the handle: ext3cow_get_block() can restart the transaction -+ */ -+ handle = journal_current_handle(); -+ -+out_stop: -+ if (handle) { -+ int err; -+ -+ if (orphan && inode->i_nlink) -+ ext3cow_orphan_del(handle, inode); -+ if (orphan && ret > 0) { -+ loff_t end = offset + ret; -+ if (end > inode->i_size) { -+ ei->i_disksize = end; -+ i_size_write(inode, end); -+ /* -+ * We're going to return a positive `ret' -+ * here due to non-zero-length I/O, so there's -+ * no way of reporting error returns from -+ * ext3cow_mark_inode_dirty() to userspace. So -+ * ignore it. -+ */ -+ ext3cow_mark_inode_dirty(handle, inode); -+ } -+ } -+ err = ext3cow_journal_stop(handle); -+ if (ret == 0) -+ ret = err; -+ } -+out: -+ return ret; -+} -+ -+/* -+ * Pages can be marked dirty completely asynchronously from ext3cow's journalling -+ * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do -+ * much here because ->set_page_dirty is called under VFS locks. The page is -+ * not necessarily locked. -+ * -+ * We cannot just dirty the page and leave attached buffers clean, because the -+ * buffers' dirty state is "definitive". We cannot just set the buffers dirty -+ * or jbddirty because all the journalling code will explode. -+ * -+ * So what we do is to mark the page "pending dirty" and next time writepage -+ * is called, propagate that into the buffers appropriately. 
-+ */ -+static int ext3cow_journalled_set_page_dirty(struct page *page) -+{ -+ SetPageChecked(page); -+ return __set_page_dirty_nobuffers(page); -+} -+ -+static const struct address_space_operations ext3cow_ordered_aops = { -+ .readpage = ext3cow_readpage, -+ .readpages = ext3cow_readpages, -+ .writepage = ext3cow_ordered_writepage, -+ .sync_page = block_sync_page, -+ .prepare_write = ext3cow_prepare_write, -+ .commit_write = ext3cow_ordered_commit_write, -+ .bmap = ext3cow_bmap, -+ .invalidatepage = ext3cow_invalidatepage, -+ .releasepage = ext3cow_releasepage, -+ .direct_IO = ext3cow_direct_IO, -+ .migratepage = buffer_migrate_page, -+}; -+ -+static const struct address_space_operations ext3cow_writeback_aops = { -+ .readpage = ext3cow_readpage, -+ .readpages = ext3cow_readpages, -+ .writepage = ext3cow_writeback_writepage, -+ .sync_page = block_sync_page, -+ .prepare_write = ext3cow_prepare_write, -+ .commit_write = ext3cow_writeback_commit_write, -+ .bmap = ext3cow_bmap, -+ .invalidatepage = ext3cow_invalidatepage, -+ .releasepage = ext3cow_releasepage, -+ .direct_IO = ext3cow_direct_IO, -+ .migratepage = buffer_migrate_page, -+}; -+ -+static const struct address_space_operations ext3cow_journalled_aops = { -+ .readpage = ext3cow_readpage, -+ .readpages = ext3cow_readpages, -+ .writepage = ext3cow_journalled_writepage, -+ .sync_page = block_sync_page, -+ .prepare_write = ext3cow_prepare_write, -+ .commit_write = ext3cow_journalled_commit_write, -+ .set_page_dirty = ext3cow_journalled_set_page_dirty, -+ .bmap = ext3cow_bmap, -+ .invalidatepage = ext3cow_invalidatepage, -+ .releasepage = ext3cow_releasepage, -+}; -+ -+void ext3cow_set_aops(struct inode *inode) -+{ -+ if (ext3cow_should_order_data(inode)) -+ inode->i_mapping->a_ops = &ext3cow_ordered_aops; -+ else if (ext3cow_should_writeback_data(inode)) -+ inode->i_mapping->a_ops = &ext3cow_writeback_aops; -+ else -+ inode->i_mapping->a_ops = &ext3cow_journalled_aops; -+} -+ -+/* -+ * ext3cow_block_truncate_page() zeroes out a mapping from file offset `from' -+ * up to the end of the block which corresponds to `from'. -+ * This required during truncate. We need to physically zero the tail end -+ * of that block so it doesn't yield old data if the file is later grown. -+ */ -+static int ext3cow_block_truncate_page(handle_t *handle, struct page *page, -+ struct address_space *mapping, loff_t from) -+{ -+ ext3cow_fsblk_t index = from >> PAGE_CACHE_SHIFT; -+ unsigned offset = from & (PAGE_CACHE_SIZE-1); -+ unsigned blocksize, iblock, length, pos; -+ struct inode *inode = mapping->host; -+ struct buffer_head *bh; -+ int err = 0; -+ void *kaddr; -+ -+ blocksize = inode->i_sb->s_blocksize; -+ length = blocksize - (offset & (blocksize - 1)); -+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); -+ -+ /* -+ * For "nobh" option, we can only work if we don't need to -+ * read-in the page - otherwise we create buffers to do the IO. 
-+ */ -+ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && -+ ext3cow_should_writeback_data(inode) && PageUptodate(page)) { -+ kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr + offset, 0, length); -+ flush_dcache_page(page); -+ kunmap_atomic(kaddr, KM_USER0); -+ set_page_dirty(page); -+ goto unlock; -+ } -+ -+ if (!page_has_buffers(page)) -+ create_empty_buffers(page, blocksize, 0); -+ -+ /* Find the buffer that contains "offset" */ -+ bh = page_buffers(page); -+ pos = blocksize; -+ while (offset >= pos) { -+ bh = bh->b_this_page; -+ iblock++; -+ pos += blocksize; -+ } -+ -+ err = 0; -+ if (buffer_freed(bh)) { -+ BUFFER_TRACE(bh, "freed: skip"); -+ goto unlock; -+ } -+ -+ if (!buffer_mapped(bh)) { -+ BUFFER_TRACE(bh, "unmapped"); -+ ext3cow_get_block(inode, iblock, bh, 0); -+ /* unmapped? It's a hole - nothing to do */ -+ if (!buffer_mapped(bh)) { -+ BUFFER_TRACE(bh, "still unmapped"); -+ goto unlock; -+ } -+ } -+ -+ /* Ok, it's mapped. Make sure it's up-to-date */ -+ if (PageUptodate(page)) -+ set_buffer_uptodate(bh); -+ -+ if (!buffer_uptodate(bh)) { -+ err = -EIO; -+ ll_rw_block(READ, 1, &bh); -+ wait_on_buffer(bh); -+ /* Uhhuh. Read error. Complain and punt. */ -+ if (!buffer_uptodate(bh)) -+ goto unlock; -+ } -+ -+ if (ext3cow_should_journal_data(inode)) { -+ BUFFER_TRACE(bh, "get write access"); -+ err = ext3cow_journal_get_write_access(handle, bh); -+ if (err) -+ goto unlock; -+ } -+ -+ kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr + offset, 0, length); -+ flush_dcache_page(page); -+ kunmap_atomic(kaddr, KM_USER0); -+ -+ BUFFER_TRACE(bh, "zeroed end of block"); -+ -+ err = 0; -+ if (ext3cow_should_journal_data(inode)) { -+ err = ext3cow_journal_dirty_metadata(handle, bh); -+ } else { -+ if (ext3cow_should_order_data(inode)) -+ err = ext3cow_journal_dirty_data(handle, bh); -+ mark_buffer_dirty(bh); -+ } -+ -+unlock: -+ unlock_page(page); -+ page_cache_release(page); -+ return err; -+} -+ -+/* -+ * Probably it should be a library function... search for first non-zero word -+ * or memcmp with zero_page, whatever is better for particular architecture. -+ * Linus? -+ */ -+static inline int all_zeroes(__le32 *p, __le32 *q) -+{ -+ while (p < q) -+ if (*p++) -+ return 0; -+ return 1; -+} -+ -+/** -+ * ext3cow_find_shared - find the indirect blocks for partial truncation. -+ * @inode: inode in question -+ * @depth: depth of the affected branch -+ * @offsets: offsets of pointers in that branch (see ext3cow_block_to_path) -+ * @chain: place to store the pointers to partial indirect blocks -+ * @top: place to the (detached) top of branch -+ * -+ * This is a helper function used by ext3cow_truncate(). -+ * -+ * When we do truncate() we may have to clean the ends of several -+ * indirect blocks but leave the blocks themselves alive. Block is -+ * partially truncated if some data below the new i_size is refered -+ * from it (and it is on the path to the first completely truncated -+ * data block, indeed). We have to free the top of that path along -+ * with everything to the right of the path. Since no allocation -+ * past the truncation point is possible until ext3cow_truncate() -+ * finishes, we may safely do the latter, but top of branch may -+ * require special attention - pageout below the truncation point -+ * might try to populate it. 
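A minimal user-space sketch of the tail-zeroing arithmetic used by ext3cow_block_truncate_page() above; the page size, block size and the `from` offset are made-up values for illustration, and the real function performs the zeroing through buffer_heads rather than printing:

#include <stdio.h>

int main(void)
{
    const unsigned long page_size = 4096;   /* PAGE_CACHE_SIZE, assumed */
    const unsigned long blocksize = 1024;   /* sb->s_blocksize, assumed */
    unsigned long from = 5000;              /* hypothetical truncate offset */

    unsigned long index  = from / page_size;                       /* page index */
    unsigned long offset = from % page_size;                       /* offset within that page */
    unsigned long length = blocksize - (offset & (blocksize - 1)); /* bytes to zero to end of block */
    unsigned long iblock = index * (page_size / blocksize)         /* first block of the page ... */
                         + offset / blocksize;                     /* ... plus the block holding `from` */

    printf("zero %lu byte(s) of file block %lu, starting at page offset %lu\n",
           length, iblock, offset);
    return 0;
}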
-+ * -+ * We atomically detach the top of branch from the tree, store the -+ * block number of its root in *@top, pointers to buffer_heads of -+ * partially truncated blocks - in @chain[].bh and pointers to -+ * their last elements that should not be removed - in -+ * @chain[].p. Return value is the pointer to last filled element -+ * of @chain. -+ * -+ * The work left to caller to do the actual freeing of subtrees: -+ * a) free the subtree starting from *@top -+ * b) free the subtrees whose roots are stored in -+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data) -+ * c) free the subtrees growing from the inode past the @chain[0]. -+ * (no partially truncated stuff there). */ -+ -+static Indirect *ext3cow_find_shared(struct inode *inode, int depth, -+ int offsets[4], Indirect chain[4], __le32 *top) -+{ -+ Indirect *partial, *p; -+ int k, err, cow; -+ -+ *top = 0; -+ /* Make k index the deepest non-null offest + 1 */ -+ for (k = depth; k > 1 && !offsets[k-1]; k--) -+ ; -+ partial = ext3cow_get_branch(inode, k, offsets, chain, &err, &cow, 0); -+ /* Writer: pointers */ -+ if (!partial) -+ partial = chain + k-1; -+ /* -+ * If the branch acquired continuation since we've looked at it - -+ * fine, it should all survive and (new) top doesn't belong to us. -+ */ -+ if (!partial->key && *partial->p) -+ /* Writer: end */ -+ goto no_top; -+ for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) -+ ; -+ /* -+ * OK, we've found the last block that must survive. The rest of our -+ * branch should be detached before unlocking. However, if that rest -+ * of branch is all ours and does not grow immediately from the inode -+ * it's easier to cheat and just decrement partial->p. -+ */ -+ if (p == chain + k - 1 && p > chain) { -+ p->p--; -+ } else { -+ *top = *p->p; -+ /* Nope, don't do this in ext3cow. Must leave the tree intact */ -+#if 0 -+ *p->p = 0; -+#endif -+ } -+ /* Writer: end */ -+ -+ while(partial > p) { -+ brelse(partial->bh); -+ partial--; -+ } -+no_top: -+ return partial; -+} -+ -+/* -+ * Zero a number of block pointers in either an inode or an indirect block. -+ * If we restart the transaction we must again get write access to the -+ * indirect block for further modification. -+ * -+ * We release `count' blocks on disk, but (last - first) may be greater -+ * than `count' because there can be holes in there. -+ */ -+static void ext3cow_clear_blocks(handle_t *handle, struct inode *inode, -+ struct buffer_head *bh, ext3cow_fsblk_t block_to_free, -+ unsigned long count, __le32 *first, __le32 *last) -+{ -+ __le32 *p; -+ if (try_to_extend_transaction(handle, inode)) { -+ if (bh) { -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, bh); -+ } -+ ext3cow_mark_inode_dirty(handle, inode); -+ ext3cow_journal_test_restart(handle, inode); -+ if (bh) { -+ BUFFER_TRACE(bh, "retaking write access"); -+ ext3cow_journal_get_write_access(handle, bh); -+ } -+ } -+ -+ /* -+ * Any buffers which are on the journal will be in memory. We find -+ * them on the hash table so journal_revoke() will run journal_forget() -+ * on them. We've already detached each block from the file, so -+ * bforget() in journal_forget() should be safe. -+ * -+ * AKPM: turn on bforget in journal_forget()!!! 
-+ */ -+ for (p = first; p < last; p++) { -+ u32 nr = le32_to_cpu(*p); -+ if (nr) { -+ struct buffer_head *bh; -+ -+ *p = 0; -+ bh = sb_find_get_block(inode->i_sb, nr); -+ ext3cow_forget(handle, 0, inode, bh, nr); -+ } -+ } -+ -+ ext3cow_free_blocks(handle, inode, block_to_free, count); -+} -+ -+/** -+ * ext3cow_free_data - free a list of data blocks -+ * @handle: handle for this transaction -+ * @inode: inode we are dealing with -+ * @this_bh: indirect buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: points immediately past the end of array -+ * -+ * We are freeing all blocks refered from that array (numbers are stored as -+ * little-endian 32-bit) and updating @inode->i_blocks appropriately. -+ * -+ * We accumulate contiguous runs of blocks to free. Conveniently, if these -+ * blocks are contiguous then releasing them at one time will only affect one -+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't -+ * actually use a lot of journal space. -+ * -+ * @this_bh will be %NULL if @first and @last point into the inode's direct -+ * block pointers. -+ */ -+static void ext3cow_free_data(handle_t *handle, struct inode *inode, -+ struct buffer_head *this_bh, -+ __le32 *first, __le32 *last) -+{ -+ ext3cow_fsblk_t block_to_free = 0; /* Starting block # of a run */ -+ unsigned long count = 0; /* Number of blocks in the run */ -+ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind -+ corresponding to -+ block_to_free */ -+ ext3cow_fsblk_t nr; /* Current block # */ -+ __le32 *p; /* Pointer into inode/ind -+ for current block */ -+ int err; -+ -+ if (this_bh) { /* For indirect block */ -+ BUFFER_TRACE(this_bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, this_bh); -+ /* Important: if we can't update the indirect pointers -+ * to the blocks, we can't free them. */ -+ if (err) -+ return; -+ } -+ -+ for (p = first; p < last; p++) { -+ nr = le32_to_cpu(*p); -+ if (nr) { -+ /* accumulate blocks to free if they're contiguous */ -+ if (count == 0) { -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } else if (nr == block_to_free + count) { -+ count++; -+ } else { -+ ext3cow_clear_blocks(handle, inode, this_bh, -+ block_to_free, -+ count, block_to_free_p, p); -+ block_to_free = nr; -+ block_to_free_p = p; -+ count = 1; -+ } -+ } -+ } -+ -+ if (count > 0) -+ ext3cow_clear_blocks(handle, inode, this_bh, block_to_free, -+ count, block_to_free_p, p); -+ -+ if (this_bh) { -+ BUFFER_TRACE(this_bh, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, this_bh); -+ } -+} -+ -+/** -+ * ext3cow_free_branches - free an array of branches -+ * @handle: JBD handle for this transaction -+ * @inode: inode we are dealing with -+ * @parent_bh: the buffer_head which contains *@first and *@last -+ * @first: array of block numbers -+ * @last: pointer immediately past the end of array -+ * @depth: depth of the branches to free -+ * -+ * We are freeing all blocks refered from these branches (numbers are -+ * stored as little-endian 32-bit) and updating @inode->i_blocks -+ * appropriately. 
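The run-coalescing that ext3cow_free_data() performs above can be illustrated with a small stand-alone program; the block numbers and the free_run() stub are assumptions standing in for ext3cow_clear_blocks()/ext3cow_free_blocks():

#include <stdio.h>
#include <stdint.h>

/* Stand-in for ext3cow_clear_blocks()/ext3cow_free_blocks(): just report the run. */
static void free_run(uint32_t start, unsigned long count)
{
    printf("free %lu block(s) starting at %u\n", count, (unsigned)start);
}

int main(void)
{
    /* Hypothetical block pointers from an inode or indirect block; 0 means a hole. */
    uint32_t blocks[] = { 100, 101, 102, 0, 200, 201, 300 };
    size_t n = sizeof(blocks) / sizeof(blocks[0]);

    uint32_t run_start = 0;
    unsigned long count = 0;

    for (size_t i = 0; i < n; i++) {
        uint32_t nr = blocks[i];
        if (!nr)
            continue;                      /* skip holes, as the real loop does */
        if (count == 0) {
            run_start = nr;
            count = 1;
        } else if (nr == run_start + count) {
            count++;                       /* extend the contiguous run */
        } else {
            free_run(run_start, count);    /* flush the previous run */
            run_start = nr;
            count = 1;
        }
    }
    if (count)
        free_run(run_start, count);        /* flush the final run */
    return 0;
}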
-+ */ -+static void ext3cow_free_branches(handle_t *handle, struct inode *inode, -+ struct buffer_head *parent_bh, -+ __le32 *first, __le32 *last, int depth) -+{ -+ ext3cow_fsblk_t nr; -+ __le32 *p; -+ -+ if (is_handle_aborted(handle)) -+ return; -+ -+ if (depth--) { -+ struct buffer_head *bh; -+ int addr_per_block = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); -+ u32 *bitmap_word = NULL, *first_block = NULL; -+ unsigned int count = 0, cur = 0, bcount = 0; -+ int i = 0; -+ p = last; -+ while (--p >= first) { -+ nr = le32_to_cpu(*p); -+ if (!nr) -+ continue; /* A hole */ -+ -+ /* Go read the buffer for the next level down */ -+ bh = sb_bread(inode->i_sb, nr); -+ -+ /* -+ * A read failure? Report error and clear slot -+ * (should be rare). -+ */ -+ if (!bh) { -+ ext3cow_error(inode->i_sb, "ext3cow_free_branches", -+ "Read failure, inode=%lu, block="E3FSBLK, -+ inode->i_ino, nr); -+ continue; -+ } -+ /* Only free the branches that have been newly allocated - znjp */ -+ /* Also, set the bits back to 0 in the bitmap -znjp */ -+ cur = 0; -+ count = 0; -+ bitmap_word = (u32*)bh->b_data + addr_per_block; -+ -+ for(bcount = 0; bcount < EXT3COW_COWBITMAPS_PER_IBLOCK(inode->i_sb); -+ bcount++){ -+ for(i = 0; i < EXT3COW_COWBITMAP_SIZE; i++, cur++){ -+ if(cur >= addr_per_block) -+ goto free; -+ if(le32_to_cpu(*bitmap_word) & (1UL << i)){ -+ if(count == 0){ -+ first_block = (u32*)bh->b_data + cur; -+ count = 1; -+ }else if((u32*)first_block + count == (u32*)bh->b_data + cur){ -+ count++; -+ }else{ -+ BUFFER_TRACE(bh, "free child branches"); -+ ext3cow_free_branches(handle, inode, bh, (u32*)first_block, -+ (u32*)first_block + count, depth); -+ first_block = (u32*)bh->b_data + cur; -+ count = 1; -+ } -+ /* Set the bit in the bitmap back to 0 */ -+ *bitmap_word ^= (1UL << i); -+ } -+ } -+ (u32*)bitmap_word++; -+ } -+ free: -+ if(count){ -+ BUFFER_TRACE(bh, "free child branches"); -+ ext3cow_free_branches(handle, inode, bh, (u32*)first_block, -+ (u32*)first_block + count, depth); -+ } -+ -+ /* -+ * We've probably journalled the indirect block several -+ * times during the truncate. But it's no longer -+ * needed and we now drop it from the transaction via -+ * journal_revoke(). -+ * -+ * That's easy if it's exclusively part of this -+ * transaction. But if it's part of the committing -+ * transaction then journal_forget() will simply -+ * brelse() it. That means that if the underlying -+ * block is reallocated in ext3cow_get_block(), -+ * unmap_underlying_metadata() will find this block -+ * and will try to get rid of it. damn, damn. -+ * -+ * If this block has already been committed to the -+ * journal, a revoke record will be written. And -+ * revoke records must be emitted *before* clearing -+ * this block's bit in the bitmaps. -+ */ -+ ext3cow_forget(handle, 1, inode, bh, bh->b_blocknr); -+ -+ /* -+ * Everything below this this pointer has been -+ * released. Now let this top-of-subtree go. -+ * -+ * We want the freeing of this indirect block to be -+ * atomic in the journal with the updating of the -+ * bitmap block which owns it. So make some room in -+ * the journal. -+ * -+ * We zero the parent pointer *after* freeing its -+ * pointee in the bitmaps, so if extend_transaction() -+ * for some reason fails to put the bitmap changes and -+ * the release into the same transaction, recovery -+ * will merely complain about releasing a free block, -+ * rather than leaking blocks. 
-+ */ -+ if (is_handle_aborted(handle)) -+ return; -+ if (try_to_extend_transaction(handle, inode)) { -+ ext3cow_mark_inode_dirty(handle, inode); -+ ext3cow_journal_test_restart(handle, inode); -+ } -+ -+ ext3cow_free_blocks(handle, inode, nr, 1); -+ -+ if (parent_bh) { -+ /* -+ * The block which we have just freed is -+ * pointed to by an indirect block: journal it -+ */ -+ BUFFER_TRACE(parent_bh, "get_write_access"); -+ if (!ext3cow_journal_get_write_access(handle, -+ parent_bh)){ -+ *p = 0; -+ BUFFER_TRACE(parent_bh, -+ "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, -+ parent_bh); -+ } -+ } -+ } -+ } else { -+ /* We have reached the bottom of the tree. */ -+ BUFFER_TRACE(parent_bh, "free data blocks"); -+ ext3cow_free_data(handle, inode, parent_bh, first, last); -+ } -+} -+ -+/* -+ * ext3cow_truncate() -+ * -+ * We block out ext3cow_get_block() block instantiations across the entire -+ * transaction, and VFS/VM ensures that ext3cow_truncate() cannot run -+ * simultaneously on behalf of the same inode. -+ * -+ * As we work through the truncate and commmit bits of it to the journal there -+ * is one core, guiding principle: the file's tree must always be consistent on -+ * disk. We must be able to restart the truncate after a crash. -+ * -+ * The file's tree may be transiently inconsistent in memory (although it -+ * probably isn't), but whenever we close off and commit a journal transaction, -+ * the contents of (the filesystem + the journal) must be consistent and -+ * restartable. It's pretty simple, really: bottom up, right to left (although -+ * left-to-right works OK too). -+ * -+ * Note that at recovery time, journal replay occurs *before* the restart of -+ * truncate against the orphan inode list. -+ * -+ * The committed inode has the new, desired i_size (which is the same as -+ * i_disksize in this case). After a crash, ext3cow_orphan_cleanup() will see -+ * that this inode's truncate did not complete and it will again call -+ * ext3cow_truncate() to have another go. So there will be instantiated blocks -+ * to the right of the truncation point in a crashed ext3cow filesystem. But -+ * that's fine - as long as they are linked from the inode, the post-crash -+ * ext3cow_truncate() run will find them and release them. -+ */ -+void ext3cow_truncate(struct inode *inode) -+{ -+ handle_t *handle; -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ __le32 *i_data = ei->i_data; -+ int addr_per_block = EXT3COW_ADDR_PER_BLOCK(inode->i_sb); -+ struct address_space *mapping = inode->i_mapping; -+ int offsets[4]; -+ Indirect chain[4]; -+ Indirect *partial; -+ __le32 nr = 0; -+ int n; -+ long last_block; -+ unsigned blocksize = inode->i_sb->s_blocksize; -+ struct page *page; -+ -+ -+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode))) -+ return; -+ if (ext3cow_inode_is_fast_symlink(inode)) -+ return; -+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode) || -+ EXT3COW_IS_UNCHANGEABLE(inode)) /* znjp */ -+ return; -+ -+ /* If the inode needs to be dup'd, then there are no blocks -+ * to truncate; they all are part of the previous version. -+ * - znjp */ -+ if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ -+ ext3cow_dup_inode(NULL, inode); -+ return; -+ } -+ -+ /* -+ * We have to lock the EOF page here, because lock_page() nests -+ * outside journal_start(). -+ */ -+ if ((inode->i_size & (blocksize - 1)) == 0) { -+ /* Block boundary? 
Nothing to do */ -+ page = NULL; -+ } else { -+ page = grab_cache_page(mapping, -+ inode->i_size >> PAGE_CACHE_SHIFT); -+ if (!page) -+ return; -+ } -+ -+ handle = start_transaction(inode); -+ if (IS_ERR(handle)) { -+ if (page) { -+ clear_highpage(page); -+ flush_dcache_page(page); -+ unlock_page(page); -+ page_cache_release(page); -+ } -+ return; /* AKPM: return what? */ -+ } -+ -+ last_block = (inode->i_size + blocksize-1) -+ >> EXT3COW_BLOCK_SIZE_BITS(inode->i_sb); -+ -+ if (page) -+ ext3cow_block_truncate_page(handle, page, mapping, inode->i_size); -+ -+ n = ext3cow_block_to_path(inode, last_block, offsets, NULL); -+ if (n == 0) -+ goto out_stop; /* error */ -+ -+ /* -+ * OK. This truncate is going to happen. We add the inode to the -+ * orphan list, so that if this truncate spans multiple transactions, -+ * and we crash, we will resume the truncate when the filesystem -+ * recovers. It also marks the inode dirty, to catch the new size. -+ * -+ * Implication: the file must always be in a sane, consistent -+ * truncatable state while each transaction commits. -+ */ -+ if (ext3cow_orphan_add(handle, inode)) -+ goto out_stop; -+ -+ /* -+ * The orphan list entry will now protect us from any crash which -+ * occurs before the truncate completes, so it is now safe to propagate -+ * the new, shorter inode size (held for now in i_size) into the -+ * on-disk inode. We do this via i_disksize, which is the value which -+ * ext3cow *really* writes onto the disk inode. -+ */ -+ ei->i_disksize = inode->i_size; -+ -+ /* -+ * From here we block out all ext3cow_get_block() callers who want to -+ * modify the block allocation tree. -+ */ -+ mutex_lock(&ei->truncate_mutex); -+ -+ if (n == 1) { /* direct blocks */ -+ unsigned int count = 0; -+ unsigned long block_to_free = 0; -+ unsigned long b = 0; -+ -+ /* We only want to remove blocks that were allocated in this -+ * epoch, i.e., have 1 bit in the bitmap. -znjp */ -+ /* If we're going to truncate a block, we should its -+ * corresponding bit in the bitmap back to 0, meaning, -+ * it needs to be allocated - znjp */ -+ for(b = offsets[0]; b < EXT3COW_NDIR_BLOCKS; b++){ -+ if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << b)){ -+ if(count == 0){ -+ block_to_free = b; -+ count = 1; -+ }else if(b == block_to_free + count){ -+ count++; -+ }else{ -+ ext3cow_free_data(handle, inode, NULL, i_data + (int)block_to_free, -+ i_data + (int)(block_to_free + count)); -+ block_to_free = b; -+ count = 1; -+ } -+ /* Turn off the bit in the bitmap */ -+ EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << b); -+ } -+ } -+ if(count > 0) -+ ext3cow_free_data(handle, inode, NULL, i_data+(int)block_to_free, -+ i_data + (int)(block_to_free + count)); -+ goto do_indirects; -+ } -+ -+ partial = ext3cow_find_shared(inode, n, offsets, chain, &nr); -+ /* Kill the top of shared branch (not detached) */ -+ if (nr) { -+ if (partial == chain) { -+ /* Shared branch grows from the inode */ -+ ext3cow_free_branches(handle, inode, NULL, -+ &nr, &nr+1, (chain+n-1) - partial); -+ *partial->p = 0; -+ /* -+ * We mark the inode dirty prior to restart, -+ * and prior to stop. No need for it here. 
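A small sketch of the per-inode COW bitmap test applied to the direct blocks above: only slots whose bit is 1 (allocated in the current epoch) are freed, and the bit is flipped back to 0 afterwards. The bitmap value and the NDIR_BLOCKS constant are assumptions for illustration:

#include <stdio.h>
#include <stdint.h>

#define NDIR_BLOCKS 12   /* stands in for EXT3COW_NDIR_BLOCKS */

int main(void)
{
    /* Hypothetical per-inode COW bitmap: bit b set means direct block b was
     * (re)allocated in the current epoch and really belongs to this version. */
    uint32_t cow_bitmap = 0x0000002C;   /* slots 2, 3 and 5 are "ours" */

    for (unsigned b = 0; b < NDIR_BLOCKS; b++) {
        if (cow_bitmap & (1UL << b)) {
            printf("slot %2u: free block and clear COW bit\n", b);
            cow_bitmap ^= (1UL << b);   /* back to 0: must be allocated again */
        } else {
            printf("slot %2u: shared with an older epoch, keep\n", b);
        }
    }
    printf("bitmap afterwards: 0x%08x\n", cow_bitmap);
    return 0;
}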
-+ */ -+ } else { -+ /* Shared branch grows from an indirect block */ -+ BUFFER_TRACE(partial->bh, "get_write_access"); -+ ext3cow_free_branches(handle, inode, partial->bh, -+ partial->p, -+ partial->p+1, (chain+n-1) - partial); -+ } -+ } -+ /* Clear the ends of indirect blocks on the shared branch */ -+ while (partial > chain) { -+ ext3cow_free_branches(handle, inode, partial->bh, partial->p + 1, -+ (__le32*)partial->bh->b_data+addr_per_block, -+ (chain+n-1) - partial); -+ BUFFER_TRACE(partial->bh, "call brelse"); -+ brelse (partial->bh); -+ partial--; -+ } -+do_indirects: -+ /* Kill the remaining (whole) subtrees */ -+ /* Unless we don't have to. If the indirect block has a 0 bit -+ * then all of the children do too, so we can skip the branch - znjp -+ */ -+ switch (offsets[0]) { -+ default: -+ if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_IND_BLOCK)){ -+ nr = i_data[EXT3COW_IND_BLOCK]; -+ if (nr) { -+ ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 1); -+ i_data[EXT3COW_IND_BLOCK] = 0; -+ } -+ /* And set bitmap back to 0 */ -+ EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_IND_BLOCK); -+ } -+ case EXT3COW_IND_BLOCK: -+ if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_DIND_BLOCK)){ -+ nr = i_data[EXT3COW_DIND_BLOCK]; -+ if (nr) { -+ ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 2); -+ i_data[EXT3COW_DIND_BLOCK] = 0; -+ } -+ EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_DIND_BLOCK); -+ } -+ case EXT3COW_DIND_BLOCK: -+ if(EXT3COW_I(inode)->i_cow_bitmap & (1UL << EXT3COW_TIND_BLOCK)){ -+ nr = i_data[EXT3COW_TIND_BLOCK]; -+ if (nr) { -+ ext3cow_free_branches(handle, inode, NULL, &nr, &nr+1, 3); -+ i_data[EXT3COW_TIND_BLOCK] = 0; -+ } -+ EXT3COW_I(inode)->i_cow_bitmap ^= (1UL << EXT3COW_TIND_BLOCK); -+ } -+ case EXT3COW_TIND_BLOCK: -+ ; -+ } -+ -+ ext3cow_discard_reservation(inode); -+ -+ mutex_unlock(&ei->truncate_mutex); -+ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; -+ ext3cow_mark_inode_dirty(handle, inode); -+ -+ /* -+ * In a multi-transaction truncate, we only make the final transaction -+ * synchronous -+ */ -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+out_stop: -+ /* -+ * If this was a simple ftruncate(), and the file will remain alive -+ * then we need to clear up the orphan record which we created above. -+ * However, if this was a real unlink then we were called by -+ * ext3cow_delete_inode(), and we allow that function to clean up the -+ * orphan info for us. 
-+ */ -+ if (inode->i_nlink) -+ ext3cow_orphan_del(handle, inode); -+ -+ ext3cow_journal_stop(handle); -+} -+ -+static ext3cow_fsblk_t ext3cow_get_inode_block(struct super_block *sb, -+ unsigned long ino, struct ext3cow_iloc *iloc) -+{ -+ unsigned long desc, group_desc, block_group; -+ unsigned long offset; -+ ext3cow_fsblk_t block; -+ struct buffer_head *bh; -+ struct ext3cow_group_desc * gdp; -+ -+ if (!ext3cow_valid_inum(sb, ino)) { -+ /* -+ * This error is already checked for in namei.c unless we are -+ * looking at an NFS filehandle, in which case no error -+ * report is needed -+ */ -+ return 0; -+ } -+ -+ block_group = (ino - 1) / EXT3COW_INODES_PER_GROUP(sb); -+ if (block_group >= EXT3COW_SB(sb)->s_groups_count) { -+ ext3cow_error(sb,"ext3cow_get_inode_block","group >= groups count"); -+ return 0; -+ } -+ smp_rmb(); -+ group_desc = block_group >> EXT3COW_DESC_PER_BLOCK_BITS(sb); -+ desc = block_group & (EXT3COW_DESC_PER_BLOCK(sb) - 1); -+ bh = EXT3COW_SB(sb)->s_group_desc[group_desc]; -+ if (!bh) { -+ ext3cow_error (sb, "ext3cow_get_inode_block", -+ "Descriptor not loaded"); -+ return 0; -+ } -+ -+ gdp = (struct ext3cow_group_desc *)bh->b_data; -+ /* -+ * Figure out the offset within the block group inode table -+ */ -+ offset = ((ino - 1) % EXT3COW_INODES_PER_GROUP(sb)) * -+ EXT3COW_INODE_SIZE(sb); -+ block = le32_to_cpu(gdp[desc].bg_inode_table) + -+ (offset >> EXT3COW_BLOCK_SIZE_BITS(sb)); -+ -+ iloc->block_group = block_group; -+ iloc->offset = offset & (EXT3COW_BLOCK_SIZE(sb) - 1); -+ return block; -+} -+ -+/* -+ * ext3cow_get_inode_loc returns with an extra refcount against the inode's -+ * underlying buffer_head on success. If 'in_mem' is true, we have all -+ * data in memory that is needed to recreate the on-disk version of this -+ * inode. -+ */ -+static int __ext3cow_get_inode_loc(struct inode *inode, -+ struct ext3cow_iloc *iloc, int in_mem) -+{ -+ ext3cow_fsblk_t block; -+ struct buffer_head *bh; -+ -+ block = ext3cow_get_inode_block(inode->i_sb, inode->i_ino, iloc); -+ if (!block) -+ return -EIO; -+ -+ bh = sb_getblk(inode->i_sb, block); -+ if (!bh) { -+ ext3cow_error (inode->i_sb, "ext3cow_get_inode_loc", -+ "unable to read inode block - " -+ "inode=%lu, block="E3FSBLK, -+ inode->i_ino, block); -+ return -EIO; -+ } -+ if (!buffer_uptodate(bh)) { -+ lock_buffer(bh); -+ if (buffer_uptodate(bh)) { -+ /* someone brought it uptodate while we waited */ -+ unlock_buffer(bh); -+ goto has_buffer; -+ } -+ -+ /* -+ * If we have all information of the inode in memory and this -+ * is the only valid inode in the block, we need not read the -+ * block. -+ */ -+ if (in_mem) { -+ struct buffer_head *bitmap_bh; -+ struct ext3cow_group_desc *desc; -+ int inodes_per_buffer; -+ int inode_offset, i; -+ int block_group; -+ int start; -+ -+ block_group = (inode->i_ino - 1) / -+ EXT3COW_INODES_PER_GROUP(inode->i_sb); -+ inodes_per_buffer = bh->b_size / -+ EXT3COW_INODE_SIZE(inode->i_sb); -+ inode_offset = ((inode->i_ino - 1) % -+ EXT3COW_INODES_PER_GROUP(inode->i_sb)); -+ start = inode_offset & ~(inodes_per_buffer - 1); -+ -+ /* Is the inode bitmap in cache? */ -+ desc = ext3cow_get_group_desc(inode->i_sb, -+ block_group, NULL); -+ if (!desc) -+ goto make_io; -+ -+ bitmap_bh = sb_getblk(inode->i_sb, -+ le32_to_cpu(desc->bg_inode_bitmap)); -+ if (!bitmap_bh) -+ goto make_io; -+ -+ /* -+ * If the inode bitmap isn't in cache then the -+ * optimisation may end up performing two reads instead -+ * of one, so skip it. 
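The inode-location arithmetic in ext3cow_get_inode_block() above boils down to a few divisions; here is a sketch with made-up geometry (inodes per group, inode size, block size and the group's inode table block are all assumptions):

#include <stdio.h>

int main(void)
{
    /* All of these values are assumptions for illustration only. */
    const unsigned long inodes_per_group = 1920;  /* EXT3COW_INODES_PER_GROUP */
    const unsigned long inode_size       = 128;   /* EXT3COW_INODE_SIZE */
    const unsigned long block_size       = 1024;  /* EXT3COW_BLOCK_SIZE */
    const unsigned long inode_table      = 500;   /* bg_inode_table of that group */
    unsigned long ino = 4021;                     /* hypothetical inode number */

    unsigned long block_group = (ino - 1) / inodes_per_group;          /* which group */
    unsigned long offset      = ((ino - 1) % inodes_per_group) * inode_size;
    unsigned long block       = inode_table + offset / block_size;     /* on-disk block */
    unsigned long in_block    = offset % block_size;                   /* iloc->offset */

    printf("inode %lu: group %lu, table block %lu, offset %lu within it\n",
           ino, block_group, block, in_block);
    return 0;
}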
-+ */ -+ if (!buffer_uptodate(bitmap_bh)) { -+ brelse(bitmap_bh); -+ goto make_io; -+ } -+ for (i = start; i < start + inodes_per_buffer; i++) { -+ if (i == inode_offset) -+ continue; -+ if (ext3cow_test_bit(i, bitmap_bh->b_data)) -+ break; -+ } -+ brelse(bitmap_bh); -+ if (i == start + inodes_per_buffer) { -+ /* all other inodes are free, so skip I/O */ -+ memset(bh->b_data, 0, bh->b_size); -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ goto has_buffer; -+ } -+ } -+ -+make_io: -+ /* -+ * There are other valid inodes in the buffer, this inode -+ * has in-inode xattrs, or we don't have this inode in memory. -+ * Read the block from disk. -+ */ -+ get_bh(bh); -+ bh->b_end_io = end_buffer_read_sync; -+ submit_bh(READ_META, bh); -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ ext3cow_error(inode->i_sb, "ext3cow_get_inode_loc", -+ "unable to read inode block - " -+ "inode=%lu, block="E3FSBLK, -+ inode->i_ino, block); -+ brelse(bh); -+ return -EIO; -+ } -+ } -+has_buffer: -+ iloc->bh = bh; -+ return 0; -+} -+ -+int ext3cow_get_inode_loc(struct inode *inode, struct ext3cow_iloc *iloc) -+{ -+ /* We have all inode data except xattrs in memory here. */ -+ return __ext3cow_get_inode_loc(inode, iloc, -+ !(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)); -+} -+ -+void ext3cow_set_inode_flags(struct inode *inode) -+{ -+ unsigned int flags = EXT3COW_I(inode)->i_flags; -+ -+ inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); -+ if (flags & EXT3COW_SYNC_FL) -+ inode->i_flags |= S_SYNC; -+ if (flags & EXT3COW_APPEND_FL) -+ inode->i_flags |= S_APPEND; -+ if (flags & EXT3COW_IMMUTABLE_FL) -+ inode->i_flags |= S_IMMUTABLE; -+ if (flags & EXT3COW_NOATIME_FL) -+ inode->i_flags |= S_NOATIME; -+ if (flags & EXT3COW_DIRSYNC_FL) -+ inode->i_flags |= S_DIRSYNC; -+} -+ -+void ext3cow_read_inode(struct inode * inode) -+{ -+ struct ext3cow_iloc iloc; -+ struct ext3cow_inode *raw_inode; -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ struct buffer_head *bh; -+ int block; -+ -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ ei->i_acl = EXT3COW_ACL_NOT_CACHED; -+ ei->i_default_acl = EXT3COW_ACL_NOT_CACHED; -+#endif -+ ei->i_block_alloc_info = NULL; -+ -+ if (__ext3cow_get_inode_loc(inode, &iloc, 0)) -+ goto bad_inode; -+ bh = iloc.bh; -+ raw_inode = ext3cow_raw_inode(&iloc); -+ inode->i_mode = le16_to_cpu(raw_inode->i_mode); -+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); -+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); -+ if(!(test_opt (inode->i_sb, NO_UID32))) { -+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; -+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; -+ } -+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); -+ inode->i_size = le32_to_cpu(raw_inode->i_size); -+ inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); -+ inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); -+ inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); -+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; -+ -+ ei->i_state = 0; -+ ei->i_dir_start_lookup = 0; -+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); -+ /* We now have enough fields to check if the inode was active or not. 
-+ * This is needed because nfsd might try to access dead inodes -+ * the test is that same one that e2fsck uses -+ * NeilBrown 1999oct15 -+ */ -+ if (inode->i_nlink == 0) { -+ if (inode->i_mode == 0 || -+ !(EXT3COW_SB(inode->i_sb)->s_mount_state & EXT3COW_ORPHAN_FS)) { -+ /* this inode is deleted */ -+ brelse (bh); -+ goto bad_inode; -+ } -+ /* The only unlinked inodes we let through here have -+ * valid i_mode and are being read by the orphan -+ * recovery code: that's fine, we're about to complete -+ * the process of deleting those. */ -+ } -+ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); -+ ei->i_flags = le32_to_cpu(raw_inode->i_flags); -+ /* For versioning -znjp */ -+ ei->i_cow_bitmap = le32_to_cpu(raw_inode->i_cowbitmap); -+ ei->i_epoch_number = le32_to_cpu(raw_inode->i_epch_number); -+ ei->i_next_inode = le32_to_cpu(raw_inode->i_nxt_inode); -+ -+#ifdef EXT3COW_FRAGMENTS -+ /* Taken out for versioning -znjp */ -+ //ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); -+ //ei->i_frag_no = raw_inode->i_frag; -+ //ei->i_frag_size = raw_inode->i_fsize; -+#endif -+ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); -+ } else { -+ inode->i_size |= -+ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; -+ } -+ ei->i_disksize = inode->i_size; -+ inode->i_generation = le32_to_cpu(raw_inode->i_generation); -+ ei->i_block_group = iloc.block_group; -+ /* -+ * NOTE! The in-memory inode i_data array is in little-endian order -+ * even on big-endian machines: we do NOT byteswap the block numbers! -+ */ -+ for (block = 0; block < EXT3COW_N_BLOCKS; block++) -+ ei->i_data[block] = raw_inode->i_block[block]; -+ INIT_LIST_HEAD(&ei->i_orphan); -+ -+ if (inode->i_ino >= EXT3COW_FIRST_INO(inode->i_sb) + 1 && -+ EXT3COW_INODE_SIZE(inode->i_sb) > EXT3COW_GOOD_OLD_INODE_SIZE) { -+ /* -+ * When mke2fs creates big inodes it does not zero out -+ * the unused bytes above EXT3COW_GOOD_OLD_INODE_SIZE, -+ * so ignore those first few inodes. -+ */ -+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); -+ if (EXT3COW_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > -+ EXT3COW_INODE_SIZE(inode->i_sb)) -+ goto bad_inode; -+ if (ei->i_extra_isize == 0) { -+ /* The extra space is currently unused. Use it. 
*/ -+ ei->i_extra_isize = sizeof(struct ext3cow_inode) - -+ EXT3COW_GOOD_OLD_INODE_SIZE; -+ } else { -+ __le32 *magic = (void *)raw_inode + -+ EXT3COW_GOOD_OLD_INODE_SIZE + -+ ei->i_extra_isize; -+ if (*magic == cpu_to_le32(EXT3COW_XATTR_MAGIC)) -+ ei->i_state |= EXT3COW_STATE_XATTR; -+ } -+ } else -+ ei->i_extra_isize = 0; -+ -+ if (S_ISREG(inode->i_mode)) { -+ inode->i_op = &ext3cow_file_inode_operations; -+ inode->i_fop = &ext3cow_file_operations; -+ ext3cow_set_aops(inode); -+ } else if (S_ISDIR(inode->i_mode)) { -+ inode->i_op = &ext3cow_dir_inode_operations; -+ inode->i_fop = &ext3cow_dir_operations; -+ } else if (S_ISLNK(inode->i_mode)) { -+ if (ext3cow_inode_is_fast_symlink(inode)) -+ inode->i_op = &ext3cow_fast_symlink_inode_operations; -+ else { -+ inode->i_op = &ext3cow_symlink_inode_operations; -+ ext3cow_set_aops(inode); -+ } -+ } else { -+ inode->i_op = &ext3cow_special_inode_operations; -+ if (raw_inode->i_block[0]) -+ init_special_inode(inode, inode->i_mode, -+ old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); -+ else -+ init_special_inode(inode, inode->i_mode, -+ new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); -+ } -+ brelse (iloc.bh); -+ ext3cow_set_inode_flags(inode); -+ return; -+ -+bad_inode: -+ make_bad_inode(inode); -+ return; -+} -+ -+/* -+ * Post the struct inode info into an on-disk inode location in the -+ * buffer-cache. This gobbles the caller's reference to the -+ * buffer_head in the inode location struct. -+ * -+ * The caller must have write access to iloc->bh. -+ */ -+static int ext3cow_do_update_inode(handle_t *handle, -+ struct inode *inode, -+ struct ext3cow_iloc *iloc) -+{ -+ struct ext3cow_inode *raw_inode = ext3cow_raw_inode(iloc); -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ struct buffer_head *bh = iloc->bh; -+ int err = 0, rc, block; -+ -+ /* For fields not not tracking in the in-memory inode, -+ * initialise them to zero for new inodes. */ -+ if (ei->i_state & EXT3COW_STATE_NEW) -+ memset(raw_inode, 0, EXT3COW_SB(inode->i_sb)->s_inode_size); -+ -+ raw_inode->i_mode = cpu_to_le16(inode->i_mode); -+ -+ -+ if(!(test_opt(inode->i_sb, NO_UID32))) { -+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); -+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); -+ -+ /* Fix up interoperability with old kernels. 
Otherwise, old inodes get -+ * re-used with the upper 16 bits of the uid/gid intact -+ */ -+ -+ if(!ei->i_dtime) { -+ raw_inode->i_uid_high = -+ cpu_to_le16(high_16_bits(inode->i_uid)); -+ raw_inode->i_gid_high = -+ cpu_to_le16(high_16_bits(inode->i_gid)); -+ } else { -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ -+ } else { -+ raw_inode->i_uid_low = -+ cpu_to_le16(fs_high2lowuid(inode->i_uid)); -+ raw_inode->i_gid_low = -+ cpu_to_le16(fs_high2lowgid(inode->i_gid)); -+ raw_inode->i_uid_high = 0; -+ raw_inode->i_gid_high = 0; -+ } -+ -+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); -+ raw_inode->i_size = cpu_to_le32(ei->i_disksize); -+ raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -+ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); -+ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); -+ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); -+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); -+ raw_inode->i_flags = cpu_to_le32(ei->i_flags); -+ /* For versioning -znjp */ -+ raw_inode->i_cowbitmap = cpu_to_le16(EXT3COW_I(inode)->i_cow_bitmap); -+ raw_inode->i_epch_number = cpu_to_le32(EXT3COW_I(inode)->i_epoch_number); -+ raw_inode->i_nxt_inode = cpu_to_le32(EXT3COW_I(inode)->i_next_inode); -+ -+#ifdef EXT3COW_FRAGMENTS -+ /* Taken out for versioning -znjp */ -+ //raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); -+ //raw_inode->i_frag = ei->i_frag_no; -+ //raw_inode->i_fsize = ei->i_frag_size; -+#endif -+ raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); -+ if (!S_ISREG(inode->i_mode)) { -+ raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); -+ } else { -+ raw_inode->i_size_high = -+ cpu_to_le32(ei->i_disksize >> 32); -+ if (ei->i_disksize > 0x7fffffffULL) { -+ struct super_block *sb = inode->i_sb; -+ if (!EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE) || -+ EXT3COW_SB(sb)->s_es->s_rev_level == -+ cpu_to_le32(EXT3COW_GOOD_OLD_REV)) { -+ /* If this is the first large file -+ * created, add a flag to the superblock. -+ */ -+ err = ext3cow_journal_get_write_access(handle, -+ EXT3COW_SB(sb)->s_sbh); -+ if (err) -+ goto out_brelse; -+ ext3cow_update_dynamic_rev(sb); -+ EXT3COW_SET_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE); -+ sb->s_dirt = 1; -+ handle->h_sync = 1; -+ err = ext3cow_journal_dirty_metadata(handle, -+ EXT3COW_SB(sb)->s_sbh); -+ } -+ } -+ } -+ raw_inode->i_generation = cpu_to_le32(inode->i_generation); -+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { -+ if (old_valid_dev(inode->i_rdev)) { -+ raw_inode->i_block[0] = -+ cpu_to_le32(old_encode_dev(inode->i_rdev)); -+ raw_inode->i_block[1] = 0; -+ } else { -+ raw_inode->i_block[0] = 0; -+ raw_inode->i_block[1] = -+ cpu_to_le32(new_encode_dev(inode->i_rdev)); -+ raw_inode->i_block[2] = 0; -+ } -+ } else for (block = 0; block < EXT3COW_N_BLOCKS; block++) -+ raw_inode->i_block[block] = ei->i_data[block]; -+ -+ if (ei->i_extra_isize) -+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); -+ -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ rc = ext3cow_journal_dirty_metadata(handle, bh); -+ if (!err) -+ err = rc; -+ ei->i_state &= ~EXT3COW_STATE_NEW; -+ -+out_brelse: -+ brelse (bh); -+ ext3cow_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3cow_write_inode() -+ * -+ * We are called from a few places: -+ * -+ * - Within generic_file_write() for O_SYNC files. -+ * Here, there will be no transaction running. We wait for any running -+ * trasnaction to commit. 
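A tiny sketch of the 16-bit low/high uid split written into the raw inode above; the uid value is hypothetical, and the same shift reassembles it on the read side in ext3cow_read_inode():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t uid  = 100001;                    /* hypothetical 32-bit uid */
    uint16_t low  = (uint16_t)(uid & 0xffff);  /* what low_16_bits() yields */
    uint16_t high = (uint16_t)(uid >> 16);     /* what high_16_bits() yields */

    printf("uid %u -> low 0x%04x, high 0x%04x\n", uid, low, high);
    printf("reassembled: %u\n", (uint32_t)low | ((uint32_t)high << 16));
    return 0;
}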
-+ * -+ * - Within sys_sync(), kupdate and such. -+ * We wait on commit, if tol to. -+ * -+ * - Within prune_icache() (PF_MEMALLOC == true) -+ * Here we simply return. We can't afford to block kswapd on the -+ * journal commit. -+ * -+ * In all cases it is actually safe for us to return without doing anything, -+ * because the inode has been copied into a raw inode buffer in -+ * ext3cow_mark_inode_dirty(). This is a correctness thing for O_SYNC and for -+ * knfsd. -+ * -+ * Note that we are absolutely dependent upon all inode dirtiers doing the -+ * right thing: they *must* call mark_inode_dirty() after dirtying info in -+ * which we are interested. -+ * -+ * It would be a bug for them to not do this. The code: -+ * -+ * mark_inode_dirty(inode) -+ * stuff(); -+ * inode->i_size = expr; -+ * -+ * is in error because a kswapd-driven write_inode() could occur while -+ * `stuff()' is running, and the new i_size will be lost. Plus the inode -+ * will no longer be on the superblock's dirty inode list. -+ */ -+int ext3cow_write_inode(struct inode *inode, int wait) -+{ -+ if (current->flags & PF_MEMALLOC) -+ return 0; -+ -+ if (ext3cow_journal_current_handle()) { -+ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); -+ dump_stack(); -+ return -EIO; -+ } -+ -+ if (!wait) -+ return 0; -+ -+ return ext3cow_force_commit(inode->i_sb); -+} -+ -+/* -+ * ext3cow_setattr() -+ * -+ * Called from notify_change. -+ * -+ * We want to trap VFS attempts to truncate the file as soon as -+ * possible. In particular, we want to make sure that when the VFS -+ * shrinks i_size, we put the inode on the orphan list and modify -+ * i_disksize immediately, so that during the subsequent flushing of -+ * dirty pages and freeing of disk blocks, we can guarantee that any -+ * commit will leave the blocks being flushed in an unused state on -+ * disk. (On recovery, the inode will get truncated and the blocks will -+ * be freed, so we have a strong guarantee that no future commit will -+ * leave these blocks visible to the user.) -+ * -+ * Called with inode->sem down. -+ */ -+int ext3cow_setattr(struct dentry *dentry, struct iattr *attr) -+{ -+ struct inode *inode = dentry->d_inode; -+ int error, rc = 0; -+ const unsigned int ia_valid = attr->ia_valid; -+ -+ error = inode_change_ok(inode, attr); -+ if (error) -+ return error; -+ -+ /* For versioning -znjp */ -+ if(is_unchangeable(inode, dentry)){ -+ error = -EROFS; -+ goto err_out; -+ } -+ -+ if(EXT3COW_S_EPOCHNUMBER(inode->i_sb) > EXT3COW_I_EPOCHNUMBER(inode)){ -+ error = ext3cow_dup_inode(dentry->d_parent->d_inode, inode); -+ if(error) -+ goto err_out; -+ } -+ -+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || -+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { -+ handle_t *handle; -+ -+ /* (user+group)*(old+new) structure, inode write (sb, -+ * inode block, ? - but truncate inode update has it) */ -+ handle = ext3cow_journal_start(inode, 2*(EXT3COW_QUOTA_INIT_BLOCKS(inode->i_sb)+ -+ EXT3COW_QUOTA_DEL_BLOCKS(inode->i_sb))+3); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ goto err_out; -+ } -+ error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; -+ if (error) { -+ ext3cow_journal_stop(handle); -+ return error; -+ } -+ /* Update corresponding info in inode so that everything is in -+ * one transaction */ -+ if (attr->ia_valid & ATTR_UID) -+ inode->i_uid = attr->ia_uid; -+ if (attr->ia_valid & ATTR_GID) -+ inode->i_gid = attr->ia_gid; -+ error = ext3cow_mark_inode_dirty(handle, inode); -+ ext3cow_journal_stop(handle); -+ } -+ -+ if (S_ISREG(inode->i_mode) && -+ attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { -+ handle_t *handle; -+ -+ handle = ext3cow_journal_start(inode, 3); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ goto err_out; -+ } -+ -+ error = ext3cow_orphan_add(handle, inode); -+ EXT3COW_I(inode)->i_disksize = attr->ia_size; -+ rc = ext3cow_mark_inode_dirty(handle, inode); -+ if (!error) -+ error = rc; -+ ext3cow_journal_stop(handle); -+ } -+ -+ rc = inode_setattr(inode, attr); -+ -+ /* If inode_setattr's call to ext3cow_truncate failed to get a -+ * transaction handle at all, we need to clean up the in-core -+ * orphan list manually. */ -+ if (inode->i_nlink) -+ ext3cow_orphan_del(NULL, inode); -+ -+ if (!rc && (ia_valid & ATTR_MODE)) -+ rc = ext3cow_acl_chmod(inode); -+ -+err_out: -+ ext3cow_std_error(inode->i_sb, error); -+ if (!error) -+ error = rc; -+ return error; -+} -+ -+ -+/* -+ * How many blocks doth make a writepage()? -+ * -+ * With N blocks per page, it may be: -+ * N data blocks -+ * 2 indirect block -+ * 2 dindirect -+ * 1 tindirect -+ * N+5 bitmap blocks (from the above) -+ * N+5 group descriptor summary blocks -+ * 1 inode block -+ * 1 superblock. -+ * 2 * EXT3COW_SINGLEDATA_TRANS_BLOCKS for the quote files -+ * -+ * 3 * (N + 5) + 2 + 2 * EXT3COW_SINGLEDATA_TRANS_BLOCKS -+ * -+ * With ordered or writeback data it's the same, less the N data blocks. -+ * -+ * If the inode's direct blocks can hold an integral number of pages then a -+ * page cannot straddle two indirect blocks, and we can only touch one indirect -+ * and dindirect block, and the "5" above becomes "3". -+ * -+ * This still overestimates under most circumstances. If we were to pass the -+ * start and end offsets in here as well we could do block_to_path() on each -+ * block and work out the exact number of indirects which are touched. Pah. -+ */ -+ -+static int ext3cow_writepage_trans_blocks(struct inode *inode) -+{ -+ int bpp = ext3cow_journal_blocks_per_page(inode); -+ int indirects = (EXT3COW_NDIR_BLOCKS % bpp) ? 5 : 3; -+ int ret; -+ -+ if (ext3cow_should_journal_data(inode)) -+ ret = 3 * (bpp + indirects) + 2; -+ else -+ ret = 2 * (bpp + indirects) + 2; -+ -+#ifdef CONFIG_QUOTA -+ /* We know that structure was already allocated during DQUOT_INIT so -+ * we will be updating only the data blocks + inodes */ -+ ret += 2*EXT3COW_QUOTA_TRANS_BLOCKS(inode->i_sb); -+#endif -+ -+ return ret; -+} -+ -+/* -+ * The caller must have previously called ext3cow_reserve_inode_write(). -+ * Give this, we know that the caller already has write access to iloc->bh. -+ */ -+int ext3cow_mark_iloc_dirty(handle_t *handle, -+ struct inode *inode, struct ext3cow_iloc *iloc) -+{ -+ int err = 0; -+ -+ /* the do_update_inode consumes one bh->b_count */ -+ get_bh(iloc->bh); -+ -+ /* ext3cow_do_update_inode() does journal_dirty_metadata */ -+ err = ext3cow_do_update_inode(handle, inode, iloc); -+ put_bh(iloc->bh); -+ return err; -+} -+ -+/* -+ * On success, We end up with an outstanding reference count against -+ * iloc->bh. This _must_ be cleaned up later. 
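The credit estimate computed by ext3cow_writepage_trans_blocks() above, restated as a stand-alone function; the blocks-per-page value and the NDIR_BLOCKS constant are assumptions, and the quota term is left out:

#include <stdio.h>

#define NDIR_BLOCKS 12   /* stands in for EXT3COW_NDIR_BLOCKS */

/* Mirror of the estimate above: N data blocks per page, a handful of
 * indirect/dindirect/tindirect blocks, their bitmaps and descriptors,
 * plus inode and superblock. */
static int writepage_trans_blocks(int blocks_per_page, int journal_data)
{
    int indirects = (NDIR_BLOCKS % blocks_per_page) ? 5 : 3;

    if (journal_data)
        return 3 * (blocks_per_page + indirects) + 2;
    return 2 * (blocks_per_page + indirects) + 2;
}

int main(void)
{
    /* 4K pages with 1K blocks => 4 blocks per page (assumed geometry). */
    printf("ordered/writeback: %d credits\n", writepage_trans_blocks(4, 0));
    printf("data=journal:      %d credits\n", writepage_trans_blocks(4, 1));
    return 0;
}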
-+ */ -+ -+int -+ext3cow_reserve_inode_write(handle_t *handle, struct inode *inode, -+ struct ext3cow_iloc *iloc) -+{ -+ int err = 0; -+ if (handle) { -+ err = ext3cow_get_inode_loc(inode, iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc->bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, iloc->bh); -+ if (err) { -+ brelse(iloc->bh); -+ iloc->bh = NULL; -+ } -+ } -+ } -+ ext3cow_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * What we do here is to mark the in-core inode as clean with respect to inode -+ * dirtiness (it may still be data-dirty). -+ * This means that the in-core inode may be reaped by prune_icache -+ * without having to perform any I/O. This is a very good thing, -+ * because *any* task may call prune_icache - even ones which -+ * have a transaction open against a different journal. -+ * -+ * Is this cheating? Not really. Sure, we haven't written the -+ * inode out, but prune_icache isn't a user-visible syncing function. -+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) -+ * we start and wait on commits. -+ * -+ * Is this efficient/effective? Well, we're being nice to the system -+ * by cleaning up our inodes proactively so they can be reaped -+ * without I/O. But we are potentially leaving up to five seconds' -+ * worth of inodes floating about which prune_icache wants us to -+ * write out. One way to fix that would be to get prune_icache() -+ * to do a write_super() to free up some memory. It has the desired -+ * effect. -+ */ -+int ext3cow_mark_inode_dirty(handle_t *handle, struct inode *inode) -+{ -+ struct ext3cow_iloc iloc; -+ int err; -+ -+ if(EXT3COW_IS_FAKEINODE(inode)) -+ return 0; -+ -+ might_sleep(); -+ err = ext3cow_reserve_inode_write(handle, inode, &iloc); -+ if (!err) -+ err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ return err; -+} -+ -+/* -+ * ext3cow_dirty_inode() is called from __mark_inode_dirty() -+ * -+ * We're really interested in the case where a file is being extended. -+ * i_size has been changed by generic_commit_write() and we thus need -+ * to include the updated inode in the current transaction. -+ * -+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks -+ * are allocated to the file. -+ * -+ * If the inode is marked synchronous, we don't honour that here - doing -+ * so would cause a commit on atime updates, which we don't bother doing. -+ * We handle synchronous inodes at the highest possible level. -+ */ -+void ext3cow_dirty_inode(struct inode *inode) -+{ -+ handle_t *current_handle = ext3cow_journal_current_handle(); -+ handle_t *handle; -+ -+ handle = ext3cow_journal_start(inode, 2); -+ if (IS_ERR(handle)) -+ goto out; -+ if (current_handle && -+ current_handle->h_transaction != handle->h_transaction) { -+ /* This task has a transaction open against a different fs */ -+ printk(KERN_EMERG "%s: transactions do not match!\n", -+ __FUNCTION__); -+ } else { -+ jbd_debug(5, "marking dirty. outer handle=%p\n", -+ current_handle); -+ ext3cow_mark_inode_dirty(handle, inode); -+ } -+ ext3cow_journal_stop(handle); -+out: -+ return; -+} -+ -+#if 0 -+/* -+ * Bind an inode's backing buffer_head into this transaction, to prevent -+ * it from being flushed to disk early. Unlike -+ * ext3cow_reserve_inode_write, this leaves behind no bh reference and -+ * returns no iloc structure, so the caller needs to repeat the iloc -+ * lookup to mark the inode dirty later. 
-+ */ -+static int ext3cow_pin_inode(handle_t *handle, struct inode *inode) -+{ -+ struct ext3cow_iloc iloc; -+ -+ int err = 0; -+ if (handle) { -+ err = ext3cow_get_inode_loc(inode, &iloc); -+ if (!err) { -+ BUFFER_TRACE(iloc.bh, "get_write_access"); -+ err = journal_get_write_access(handle, iloc.bh); -+ if (!err) -+ err = ext3cow_journal_dirty_metadata(handle, -+ iloc.bh); -+ brelse(iloc.bh); -+ } -+ } -+ ext3cow_std_error(inode->i_sb, err); -+ return err; -+} -+#endif -+ -+int ext3cow_change_inode_journal_flag(struct inode *inode, int val) -+{ -+ journal_t *journal; -+ handle_t *handle; -+ int err; -+ -+ /* -+ * We have to be very careful here: changing a data block's -+ * journaling status dynamically is dangerous. If we write a -+ * data block to the journal, change the status and then delete -+ * that block, we risk forgetting to revoke the old log record -+ * from the journal and so a subsequent replay can corrupt data. -+ * So, first we make sure that the journal is empty and that -+ * nobody is changing anything. -+ */ -+ -+ journal = EXT3COW_JOURNAL(inode); -+ if (is_journal_aborted(journal) || IS_RDONLY(inode)) -+ return -EROFS; -+ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* -+ * OK, there are no updates running now, and all cached data is -+ * synced to disk. We are now in a completely consistent state -+ * which doesn't have anything in the journal, and we know that -+ * no filesystem updates are running, so it is safe to modify -+ * the inode's in-core data-journaling state flag now. -+ */ -+ -+ if (val) -+ EXT3COW_I(inode)->i_flags |= EXT3COW_JOURNAL_DATA_FL; -+ else -+ EXT3COW_I(inode)->i_flags &= ~EXT3COW_JOURNAL_DATA_FL; -+ ext3cow_set_aops(inode); -+ -+ journal_unlock_updates(journal); -+ -+ /* Finally we can mark the inode as dirty. 
*/ -+ -+ handle = ext3cow_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ err = ext3cow_mark_inode_dirty(handle, inode); -+ handle->h_sync = 1; -+ ext3cow_journal_stop(handle); -+ ext3cow_std_error(inode->i_sb, err); -+ -+ return err; -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/ioctl.c linux-2.6.20.3-ext3cow/fs/ext3cow/ioctl.c ---- linux-2.6.20.3/fs/ext3cow/ioctl.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/ioctl.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,312 @@ -+/* -+ * linux/fs/ext3cow/ioctl.c -+ * -+ * Copyright (C) 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+int ext3cow_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, -+ unsigned long arg) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ unsigned int flags; -+ unsigned short rsv_window_size; -+ -+ ext3cow_debug ("cmd = %u, arg = %lu\n", cmd, arg); -+ -+ switch (cmd) { -+ /* Some IOCTLs for version */ -+ case EXT3COW_IOC_TAKESNAPSHOT: -+ return (unsigned int)ext3cow_take_snapshot(inode->i_sb); -+ case EXT3COW_IOC_GETEPOCH: -+ return (unsigned int)EXT3COW_S_EPOCHNUMBER(inode->i_sb); -+ case EXT3COW_IOC_GETFLAGS: -+ flags = ei->i_flags & EXT3COW_FL_USER_VISIBLE; -+ return put_user(flags, (int __user *) arg); -+ case EXT3COW_IOC_SETFLAGS: { -+ handle_t *handle = NULL; -+ int err; -+ struct ext3cow_iloc iloc; -+ unsigned int oldflags; -+ unsigned int jflag; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EACCES; -+ -+ if (get_user(flags, (int __user *) arg)) -+ return -EFAULT; -+ -+ if (!S_ISDIR(inode->i_mode)) -+ flags &= ~EXT3COW_DIRSYNC_FL; -+ -+ mutex_lock(&inode->i_mutex); -+ oldflags = ei->i_flags; -+ -+ /* The JOURNAL_DATA flag is modifiable only by root */ -+ jflag = flags & EXT3COW_JOURNAL_DATA_FL; -+ -+ /* -+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by -+ * the relevant capability. -+ * -+ * This test looks nicer. Thanks to Pauline Middelink -+ */ -+ if ((flags ^ oldflags) & (EXT3COW_APPEND_FL | EXT3COW_IMMUTABLE_FL)) { -+ if (!capable(CAP_LINUX_IMMUTABLE)) { -+ mutex_unlock(&inode->i_mutex); -+ return -EPERM; -+ } -+ } -+ -+ /* -+ * The JOURNAL_DATA flag can only be changed by -+ * the relevant capability. 
-+ */ -+ if ((jflag ^ oldflags) & (EXT3COW_JOURNAL_DATA_FL)) { -+ if (!capable(CAP_SYS_RESOURCE)) { -+ mutex_unlock(&inode->i_mutex); -+ return -EPERM; -+ } -+ } -+ -+ -+ handle = ext3cow_journal_start(inode, 1); -+ if (IS_ERR(handle)) { -+ mutex_unlock(&inode->i_mutex); -+ return PTR_ERR(handle); -+ } -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ err = ext3cow_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto flags_err; -+ -+ flags = flags & EXT3COW_FL_USER_MODIFIABLE; -+ flags |= oldflags & ~EXT3COW_FL_USER_MODIFIABLE; -+ ei->i_flags = flags; -+ -+ ext3cow_set_inode_flags(inode); -+ inode->i_ctime = CURRENT_TIME_SEC; -+ -+ err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+flags_err: -+ ext3cow_journal_stop(handle); -+ if (err) { -+ mutex_unlock(&inode->i_mutex); -+ return err; -+ } -+ -+ if ((jflag ^ oldflags) & (EXT3COW_JOURNAL_DATA_FL)) -+ err = ext3cow_change_inode_journal_flag(inode, jflag); -+ mutex_unlock(&inode->i_mutex); -+ return err; -+ } -+ case EXT3COW_IOC_GETVERSION: -+ case EXT3COW_IOC_GETVERSION_OLD: -+ return put_user(inode->i_generation, (int __user *) arg); -+ case EXT3COW_IOC_SETVERSION: -+ case EXT3COW_IOC_SETVERSION_OLD: { -+ handle_t *handle; -+ struct ext3cow_iloc iloc; -+ __u32 generation; -+ int err; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EPERM; -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (get_user(generation, (int __user *) arg)) -+ return -EFAULT; -+ -+ handle = ext3cow_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ err = ext3cow_reserve_inode_write(handle, inode, &iloc); -+ if (err == 0) { -+ inode->i_ctime = CURRENT_TIME_SEC; -+ inode->i_generation = generation; -+ err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ } -+ ext3cow_journal_stop(handle); -+ return err; -+ } -+#ifdef CONFIG_JBD_DEBUG -+ case EXT3COW_IOC_WAIT_FOR_READONLY: -+ /* -+ * This is racy - by the time we're woken up and running, -+ * the superblock could be released. And the module could -+ * have been unloaded. So sue me. -+ * -+ * Returns 1 if it slept, else zero. 
-+ */ -+ { -+ struct super_block *sb = inode->i_sb; -+ DECLARE_WAITQUEUE(wait, current); -+ int ret = 0; -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(&EXT3COW_SB(sb)->ro_wait_queue, &wait); -+ if (timer_pending(&EXT3COW_SB(sb)->turn_ro_timer)) { -+ schedule(); -+ ret = 1; -+ } -+ remove_wait_queue(&EXT3COW_SB(sb)->ro_wait_queue, &wait); -+ return ret; -+ } -+#endif -+ case EXT3COW_IOC_GETRSVSZ: -+ if (test_opt(inode->i_sb, RESERVATION) -+ && S_ISREG(inode->i_mode) -+ && ei->i_block_alloc_info) { -+ rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; -+ return put_user(rsv_window_size, (int __user *)arg); -+ } -+ return -ENOTTY; -+ case EXT3COW_IOC_SETRSVSZ: { -+ -+ if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) -+ return -ENOTTY; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) -+ return -EACCES; -+ -+ if (get_user(rsv_window_size, (int __user *)arg)) -+ return -EFAULT; -+ -+ if (rsv_window_size > EXT3COW_MAX_RESERVE_BLOCKS) -+ rsv_window_size = EXT3COW_MAX_RESERVE_BLOCKS; -+ -+ /* -+ * need to allocate reservation structure for this inode -+ * before set the window size -+ */ -+ mutex_lock(&ei->truncate_mutex); -+ if (!ei->i_block_alloc_info) -+ ext3cow_init_block_alloc_info(inode); -+ -+ if (ei->i_block_alloc_info){ -+ struct ext3cow_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; -+ rsv->rsv_goal_size = rsv_window_size; -+ } -+ mutex_unlock(&ei->truncate_mutex); -+ return 0; -+ } -+ case EXT3COW_IOC_GROUP_EXTEND: { -+ ext3cow_fsblk_t n_blocks_count; -+ struct super_block *sb = inode->i_sb; -+ int err; -+ -+ if (!capable(CAP_SYS_RESOURCE)) -+ return -EPERM; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if (get_user(n_blocks_count, (__u32 __user *)arg)) -+ return -EFAULT; -+ -+ err = ext3cow_group_extend(sb, EXT3COW_SB(sb)->s_es, n_blocks_count); -+ journal_lock_updates(EXT3COW_SB(sb)->s_journal); -+ journal_flush(EXT3COW_SB(sb)->s_journal); -+ journal_unlock_updates(EXT3COW_SB(sb)->s_journal); -+ -+ return err; -+ } -+ case EXT3COW_IOC_GROUP_ADD: { -+ struct ext3cow_new_group_data input; -+ struct super_block *sb = inode->i_sb; -+ int err; -+ -+ if (!capable(CAP_SYS_RESOURCE)) -+ return -EPERM; -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ -+ if (copy_from_user(&input, (struct ext3cow_new_group_input __user *)arg, -+ sizeof(input))) -+ return -EFAULT; -+ -+ err = ext3cow_group_add(sb, &input); -+ journal_lock_updates(EXT3COW_SB(sb)->s_journal); -+ journal_flush(EXT3COW_SB(sb)->s_journal); -+ journal_unlock_updates(EXT3COW_SB(sb)->s_journal); -+ -+ return err; -+ } -+ -+ -+ default: -+ return -ENOTTY; -+ } -+} -+ -+#ifdef CONFIG_COMPAT -+long ext3cow_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ struct inode *inode = file->f_path.dentry->d_inode; -+ int ret; -+ -+ /* These are just misnamed, they actually get/put from/to user an int */ -+ switch (cmd) { -+ case EXT3COW_IOC32_GETFLAGS: -+ cmd = EXT3COW_IOC_GETFLAGS; -+ break; -+ case EXT3COW_IOC32_SETFLAGS: -+ cmd = EXT3COW_IOC_SETFLAGS; -+ break; -+ case EXT3COW_IOC32_GETVERSION: -+ cmd = EXT3COW_IOC_GETVERSION; -+ break; -+ case EXT3COW_IOC32_SETVERSION: -+ cmd = EXT3COW_IOC_SETVERSION; -+ break; -+ case EXT3COW_IOC32_GROUP_EXTEND: -+ cmd = EXT3COW_IOC_GROUP_EXTEND; -+ break; -+ case EXT3COW_IOC32_GETVERSION_OLD: -+ cmd = EXT3COW_IOC_GETVERSION_OLD; -+ break; -+ case EXT3COW_IOC32_SETVERSION_OLD: -+ cmd = EXT3COW_IOC_SETVERSION_OLD; -+ break; -+#ifdef 
CONFIG_JBD_DEBUG -+ case EXT3COW_IOC32_WAIT_FOR_READONLY: -+ cmd = EXT3COW_IOC_WAIT_FOR_READONLY; -+ break; -+#endif -+ case EXT3COW_IOC32_GETRSVSZ: -+ cmd = EXT3COW_IOC_GETRSVSZ; -+ break; -+ case EXT3COW_IOC32_SETRSVSZ: -+ cmd = EXT3COW_IOC_SETRSVSZ; -+ break; -+ case EXT3COW_IOC_GROUP_ADD: -+ break; -+ default: -+ return -ENOIOCTLCMD; -+ } -+ lock_kernel(); -+ ret = ext3cow_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); -+ unlock_kernel(); -+ return ret; -+} -+#endif -diff -ruN linux-2.6.20.3/fs/ext3cow/Makefile linux-2.6.20.3-ext3cow/fs/ext3cow/Makefile ---- linux-2.6.20.3/fs/ext3cow/Makefile 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/Makefile 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,12 @@ -+# -+# Makefile for the linux ext3cow-filesystem routines. -+# -+ -+obj-$(CONFIG_EXT3COW_FS) += ext3cow.o -+ -+ext3cow-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3cow_jbd.o -+ -+ext3cow-$(CONFIG_EXT3COW_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -+ext3cow-$(CONFIG_EXT3COW_FS_POSIX_ACL) += acl.o -+ext3cow-$(CONFIG_EXT3COW_FS_SECURITY) += xattr_security.o -diff -ruN linux-2.6.20.3/fs/ext3cow/namei.c linux-2.6.20.3-ext3cow/fs/ext3cow/namei.c ---- linux-2.6.20.3/fs/ext3cow/namei.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/namei.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,2979 @@ -+/* -+ * linux/fs/ext3cow/namei.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/namei.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ * Directory entry file type support and forward compatibility hooks -+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 -+ * Hash Tree Directory indexing (c) -+ * Daniel Phillips, 2001 -+ * Hash Tree Directory indexing porting -+ * Christopher Li, 2002 -+ * Hash Tree Directory indexing cleanup -+ * Theodore Ts'o, 2002 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "namei.h" -+#include "xattr.h" -+#include "acl.h" -+ -+/* -+ * define how far ahead to read directories while searching them. 
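-+ *
-+ * As a worked example of the constants below: NAMEI_RA_CHUNKS (2) chunks
-+ * of NAMEI_RA_BLOCKS (4) blocks give a NAMEI_RA_SIZE of 8 buffer_heads in
-+ * the bh_use[] readahead window, and NAMEI_RA_INDEX(c,b) = c * 4 + b
-+ * selects the slot for block b of chunk c.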
-+ */ -+#define NAMEI_RA_CHUNKS 2 -+#define NAMEI_RA_BLOCKS 4 -+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) -+ -+/* is the inode marked unchangeable or does the name -+ contain an epoch less than the current system epoch -znjp */ -+int is_unchangeable(struct inode *inode, struct dentry *dentry){ -+ -+ char *at = NULL; -+ -+ if (inode && (EXT3COW_IS_UNCHANGEABLE(inode) || IS_IMMUTABLE(inode))) -+ return 1; -+ if(dentry) -+ at = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); -+ if(at && (simple_strtol(&at[1], (char **)NULL, 10) > 0)) -+ return 1; -+ -+ return 0; -+} -+ -+static struct buffer_head *ext3cow_append(handle_t *handle, -+ struct inode *inode, -+ u32 *block, int *err) -+{ -+ struct buffer_head *bh; -+ -+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; -+ -+ if ((bh = ext3cow_bread(handle, inode, *block, 1, err))) { -+ inode->i_size += inode->i_sb->s_blocksize; -+ EXT3COW_I(inode)->i_disksize = inode->i_size; -+ ext3cow_journal_get_write_access(handle,bh); -+ } -+ return bh; -+} -+ -+#ifndef assert -+#define assert(test) J_ASSERT(test) -+#endif -+ -+#ifndef swap -+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -+#endif -+ -+#ifdef DX_DEBUG -+#define dxtrace(command) command -+#else -+#define dxtrace(command) -+#endif -+ -+struct fake_dirent -+{ -+ __le32 inode; -+ __le16 rec_len; -+ u8 name_len; -+ u8 file_type; -+}; -+ -+struct dx_countlimit -+{ -+ __le16 limit; -+ __le16 count; -+}; -+ -+struct dx_entry -+{ -+ __le32 hash; -+ __le32 block; -+}; -+ -+/* -+ * dx_root_info is laid out so that if it should somehow get overlaid by a -+ * dirent the two low bits of the hash version will be zero. Therefore, the -+ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
-+ */ -+ -+struct dx_root -+{ -+ struct fake_dirent dot; -+ char dot_name[4]; -+ struct fake_dirent dotdot; -+ char dotdot_name[4]; -+ struct dx_root_info -+ { -+ __le32 reserved_zero; -+ u8 hash_version; -+ u8 info_length; /* 8 */ -+ u8 indirect_levels; -+ u8 unused_flags; -+ } -+ info; -+ struct dx_entry entries[0]; -+}; -+ -+struct dx_node -+{ -+ struct fake_dirent fake; -+ struct dx_entry entries[0]; -+}; -+ -+ -+struct dx_frame -+{ -+ struct buffer_head *bh; -+ struct dx_entry *entries; -+ struct dx_entry *at; -+}; -+ -+struct dx_map_entry -+{ -+ u32 hash; -+ u32 offs; -+}; -+ -+#ifdef CONFIG_EXT3COW_INDEX -+static inline unsigned dx_get_block (struct dx_entry *entry); -+static void dx_set_block (struct dx_entry *entry, unsigned value); -+static inline unsigned dx_get_hash (struct dx_entry *entry); -+static void dx_set_hash (struct dx_entry *entry, unsigned value); -+static unsigned dx_get_count (struct dx_entry *entries); -+static unsigned dx_get_limit (struct dx_entry *entries); -+static void dx_set_count (struct dx_entry *entries, unsigned value); -+static void dx_set_limit (struct dx_entry *entries, unsigned value); -+static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -+static unsigned dx_node_limit (struct inode *dir); -+static struct dx_frame *dx_probe(struct dentry *dentry, -+ struct inode *dir, -+ struct dx_hash_info *hinfo, -+ struct dx_frame *frame, -+ int *err); -+static void dx_release (struct dx_frame *frames); -+static int dx_make_map (struct ext3cow_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry map[]); -+static void dx_sort_map(struct dx_map_entry *map, unsigned count); -+static struct ext3cow_dir_entry_2 *dx_move_dirents (char *from, char *to, -+ struct dx_map_entry *offsets, int count); -+static struct ext3cow_dir_entry_2* dx_pack_dirents (char *base, int size); -+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); -+static int ext3cow_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, -+ __u32 *start_hash); -+static struct buffer_head * ext3cow_dx_find_entry(struct dentry *dentry, -+ struct ext3cow_dir_entry_2 **res_dir, int *err); -+static int ext3cow_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode); -+ -+/* -+ * Future: use high four bits of block for coalesce-on-delete flags -+ * Mask them off for now. 
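-+ *
-+ * Concretely, dx_get_block() below applies the 0x00ffffff mask, so only
-+ * the low 24 bits are used as a block number; a stored value of
-+ * 0x01000005, for example, reads back as block 5.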
-+ */ -+ -+static inline unsigned dx_get_block (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->block) & 0x00ffffff; -+} -+ -+static inline void dx_set_block (struct dx_entry *entry, unsigned value) -+{ -+ entry->block = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_hash (struct dx_entry *entry) -+{ -+ return le32_to_cpu(entry->hash); -+} -+ -+static inline void dx_set_hash (struct dx_entry *entry, unsigned value) -+{ -+ entry->hash = cpu_to_le32(value); -+} -+ -+static inline unsigned dx_get_count (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->count); -+} -+ -+static inline unsigned dx_get_limit (struct dx_entry *entries) -+{ -+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit); -+} -+ -+static inline void dx_set_count (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); -+} -+ -+static inline void dx_set_limit (struct dx_entry *entries, unsigned value) -+{ -+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); -+} -+ -+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3COW_DIR_REC_LEN(1) - -+ EXT3COW_DIR_REC_LEN(2) - infosize; -+ return 0? 20: entry_space / sizeof(struct dx_entry); -+} -+ -+static inline unsigned dx_node_limit (struct inode *dir) -+{ -+ unsigned entry_space = dir->i_sb->s_blocksize - EXT3COW_DIR_REC_LEN(0); -+ return 0? 22: entry_space / sizeof(struct dx_entry); -+} -+ -+/* -+ * Debug -+ */ -+#ifdef DX_DEBUG -+static void dx_show_index (char * label, struct dx_entry *entries) -+{ -+ int i, n = dx_get_count (entries); -+ printk("%s index ", label); -+ for (i = 0; i < n; i++) -+ { -+ printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i)); -+ } -+ printk("\n"); -+} -+ -+struct stats -+{ -+ unsigned names; -+ unsigned space; -+ unsigned bcount; -+}; -+ -+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3cow_dir_entry_2 *de, -+ int size, int show_names) -+{ -+ unsigned names = 0, space = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ printk("names: "); -+ while ((char *) de < base + size) -+ { -+ if (de->inode) -+ { -+ if (show_names) -+ { -+ int len = de->name_len; -+ char *name = de->name; -+ while (len--) printk("%c", *name++); -+ ext3cowfs_dirhash(de->name, de->name_len, &h); -+ printk(":%x.%u ", h.hash, -+ ((char *) de - base)); -+ } -+ space += EXT3COW_DIR_REC_LEN(de->name_len); -+ names++; -+ } -+ de = (struct ext3cow_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ printk("(%i)\n", names); -+ return (struct stats) { names, space, 1 }; -+} -+ -+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, -+ struct dx_entry *entries, int levels) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count = dx_get_count (entries), names = 0, space = 0, i; -+ unsigned bcount = 0; -+ struct buffer_head *bh; -+ int err; -+ printk("%i indexed blocks...\n", count); -+ for (i = 0; i < count; i++, entries++) -+ { -+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; -+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; -+ struct stats stats; -+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); -+ if (!(bh = ext3cow_bread (NULL,dir, block, 0,&err))) continue; -+ stats = levels? 
-+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): -+ dx_show_leaf(hinfo, (struct ext3cow_dir_entry_2 *) bh->b_data, blocksize, 0); -+ names += stats.names; -+ space += stats.space; -+ bcount += stats.bcount; -+ brelse (bh); -+ } -+ if (bcount) -+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", -+ names, space/bcount,(space/bcount)*100/blocksize); -+ return (struct stats) { names, space, bcount}; -+} -+#endif /* DX_DEBUG */ -+ -+/* -+ * Probe for a directory leaf block to search. -+ * -+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format -+ * error in the directory index, and the caller should fall back to -+ * searching the directory normally. The callers of dx_probe **MUST** -+ * check for this error code, and make sure it never gets reflected -+ * back to userspace. -+ */ -+static struct dx_frame * -+dx_probe(struct dentry *dentry, struct inode *dir, -+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -+{ -+ unsigned count, indirect; -+ struct dx_entry *at, *entries, *p, *q, *m; -+ struct dx_root *root; -+ struct buffer_head *bh; -+ struct dx_frame *frame = frame_in; -+ u32 hash; -+ -+ frame->bh = NULL; -+ if (dentry) -+ dir = dentry->d_parent->d_inode; -+ if (!(bh = ext3cow_bread (NULL,dir, 0, 0, err))) -+ goto fail; -+ root = (struct dx_root *) bh->b_data; -+ if (root->info.hash_version != DX_HASH_TEA && -+ root->info.hash_version != DX_HASH_HALF_MD4 && -+ root->info.hash_version != DX_HASH_LEGACY) { -+ ext3cow_warning(dir->i_sb, __FUNCTION__, -+ "Unrecognised inode hash code %d", -+ root->info.hash_version); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ hinfo->hash_version = root->info.hash_version; -+ hinfo->seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; -+ if (dentry) -+ ext3cowfs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); -+ hash = hinfo->hash; -+ -+ if (root->info.unused_flags & 1) { -+ ext3cow_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash flags: %#06x", -+ root->info.unused_flags); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ if ((indirect = root->info.indirect_levels) > 1) { -+ ext3cow_warning(dir->i_sb, __FUNCTION__, -+ "Unimplemented inode hash depth: %#06x", -+ root->info.indirect_levels); -+ brelse(bh); -+ *err = ERR_BAD_DX_DIR; -+ goto fail; -+ } -+ -+ entries = (struct dx_entry *) (((char *)&root->info) + -+ root->info.info_length); -+ assert(dx_get_limit(entries) == dx_root_limit(dir, -+ root->info.info_length)); -+ dxtrace (printk("Look up %x", hash)); -+ while (1) -+ { -+ count = dx_get_count(entries); -+ assert (count && count <= dx_get_limit(entries)); -+ p = entries + 1; -+ q = entries + count - 1; -+ while (p <= q) -+ { -+ m = p + (q - p)/2; -+ dxtrace(printk(".")); -+ if (dx_get_hash(m) > hash) -+ q = m - 1; -+ else -+ p = m + 1; -+ } -+ -+ if (0) // linear search cross check -+ { -+ unsigned n = count - 1; -+ at = entries; -+ while (n--) -+ { -+ dxtrace(printk(",")); -+ if (dx_get_hash(++at) > hash) -+ { -+ at--; -+ break; -+ } -+ } -+ assert (at == p - 1); -+ } -+ -+ at = p - 1; -+ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); -+ frame->bh = bh; -+ frame->entries = entries; -+ frame->at = at; -+ if (!indirect--) return frame; -+ if (!(bh = ext3cow_bread (NULL,dir, dx_get_block(at), 0, err))) -+ goto fail2; -+ at = entries = ((struct dx_node *) bh->b_data)->entries; -+ assert (dx_get_limit(entries) == dx_node_limit (dir)); -+ frame++; -+ } -+fail2: -+ while (frame >= frame_in) { -+ brelse(frame->bh); -+ frame--; -+ } -+fail: -+ return NULL; -+} -+ -+static void dx_release (struct dx_frame *frames) -+{ -+ if (frames[0].bh == NULL) -+ return; -+ -+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) -+ brelse(frames[1].bh); -+ brelse(frames[0].bh); -+} -+ -+/* -+ * This function increments the frame pointer to search the next leaf -+ * block, and reads in the necessary intervening nodes if the search -+ * should be necessary. Whether or not the search is necessary is -+ * controlled by the hash parameter. If the hash value is even, then -+ * the search is only continued if the next block starts with that -+ * hash value. This is used if we are searching for a specific file. -+ * -+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. -+ * -+ * This function returns 1 if the caller should continue to search, -+ * or 0 if it should not. If there is an error reading one of the -+ * index blocks, it will a negative error code. -+ * -+ * If start_hash is non-null, it will be filled in with the starting -+ * hash of the next page. -+ */ -+static int ext3cow_htree_next_block(struct inode *dir, __u32 hash, -+ struct dx_frame *frame, -+ struct dx_frame *frames, -+ __u32 *start_hash) -+{ -+ struct dx_frame *p; -+ struct buffer_head *bh; -+ int err, num_frames = 0; -+ __u32 bhash; -+ -+ p = frame; -+ /* -+ * Find the next leaf page by incrementing the frame pointer. -+ * If we run out of entries in the interior node, loop around and -+ * increment pointer in the parent node. When we break out of -+ * this loop, num_frames indicates the number of interior -+ * nodes need to be read. -+ */ -+ while (1) { -+ if (++(p->at) < p->entries + dx_get_count(p->entries)) -+ break; -+ if (p == frames) -+ return 0; -+ num_frames++; -+ p--; -+ } -+ -+ /* -+ * If the hash is 1, then continue only if the next page has a -+ * continuation hash of any value. This is used for readdir -+ * handling. Otherwise, check to see if the hash matches the -+ * desired contiuation hash. If it doesn't, return since -+ * there's no point to read in the successive index pages. -+ */ -+ bhash = dx_get_hash(p->at); -+ if (start_hash) -+ *start_hash = bhash; -+ if ((hash & 1) == 0) { -+ if ((bhash & ~1) != hash) -+ return 0; -+ } -+ /* -+ * If the hash is HASH_NB_ALWAYS, we always go to the next -+ * block so no check is necessary -+ */ -+ while (num_frames--) { -+ if (!(bh = ext3cow_bread(NULL, dir, dx_get_block(p->at), -+ 0, &err))) -+ return err; /* Failure */ -+ p++; -+ brelse (p->bh); -+ p->bh = bh; -+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; -+ } -+ return 1; -+} -+ -+ -+/* -+ * p is at least 6 bytes before the end of page -+ */ -+static inline struct ext3cow_dir_entry_2 *ext3cow_next_entry(struct ext3cow_dir_entry_2 *p) -+{ -+ return (struct ext3cow_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); -+} -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory block. It returns the number directory entries loaded -+ * into the tree. If there is an error it is returned in err. 
-+ */ -+static int htree_dirblock_to_tree(struct file *dir_file, -+ struct inode *dir, int block, -+ struct dx_hash_info *hinfo, -+ __u32 start_hash, __u32 start_minor_hash) -+{ -+ struct buffer_head *bh; -+ struct ext3cow_dir_entry_2 *de, *top; -+ int err, count = 0; -+ -+ dxtrace(printk("In htree dirblock_to_tree: block %d\n", block)); -+ if (!(bh = ext3cow_bread (NULL, dir, block, 0, &err))) -+ return err; -+ -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ top = (struct ext3cow_dir_entry_2 *) ((char *) de + -+ dir->i_sb->s_blocksize - -+ EXT3COW_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3cow_next_entry(de)) { -+ if (!ext3cow_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, -+ (block<i_sb)) -+ +((char *)de - bh->b_data))) { -+ /* On error, skip the f_pos to the next block. */ -+ dir_file->f_pos = (dir_file->f_pos | -+ (dir->i_sb->s_blocksize - 1)) + 1; -+ brelse (bh); -+ return count; -+ } -+ ext3cowfs_dirhash(de->name, de->name_len, hinfo); -+ if ((hinfo->hash < start_hash) || -+ ((hinfo->hash == start_hash) && -+ (hinfo->minor_hash < start_minor_hash))) -+ continue; -+ if (de->inode == 0) -+ continue; -+ if ((err = ext3cow_htree_store_dirent(dir_file, -+ hinfo->hash, hinfo->minor_hash, de)) != 0) { -+ brelse(bh); -+ return err; -+ } -+ count++; -+ } -+ brelse(bh); -+ return count; -+} -+ -+ -+/* -+ * This function fills a red-black tree with information from a -+ * directory. We start scanning the directory in hash order, starting -+ * at start_hash and start_minor_hash. -+ * -+ * This function returns the number of entries inserted into the tree, -+ * or a negative error code. -+ */ -+int ext3cow_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash) -+{ -+ struct dx_hash_info hinfo; -+ struct ext3cow_dir_entry_2 *de; -+ struct dx_frame frames[2], *frame; -+ struct inode *dir; -+ int block, err; -+ int count = 0; -+ int ret; -+ __u32 hashval; -+ -+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, -+ start_minor_hash)); -+ dir = dir_file->f_path.dentry->d_inode; -+ if (!(EXT3COW_I(dir)->i_flags & EXT3COW_INDEX_FL)) { -+ hinfo.hash_version = EXT3COW_SB(dir->i_sb)->s_def_hash_version; -+ hinfo.seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; -+ count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, -+ start_hash, start_minor_hash); -+ *next_hash = ~0; -+ return count; -+ } -+ hinfo.hash = start_hash; -+ hinfo.minor_hash = 0; -+ frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ -+ /* Add '.' and '..' 
from the htree header */ -+ if (!start_hash && !start_minor_hash) { -+ de = (struct ext3cow_dir_entry_2 *) frames[0].bh->b_data; -+ if ((err = ext3cow_htree_store_dirent(dir_file, 0, 0, de)) != 0) -+ goto errout; -+ count++; -+ } -+ if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { -+ de = (struct ext3cow_dir_entry_2 *) frames[0].bh->b_data; -+ de = ext3cow_next_entry(de); -+ if ((err = ext3cow_htree_store_dirent(dir_file, 2, 0, de)) != 0) -+ goto errout; -+ count++; -+ } -+ -+ while (1) { -+ block = dx_get_block(frame->at); -+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, -+ start_hash, start_minor_hash); -+ if (ret < 0) { -+ err = ret; -+ goto errout; -+ } -+ count += ret; -+ hashval = ~0; -+ ret = ext3cow_htree_next_block(dir, HASH_NB_ALWAYS, -+ frame, frames, &hashval); -+ *next_hash = hashval; -+ if (ret < 0) { -+ err = ret; -+ goto errout; -+ } -+ /* -+ * Stop if: (a) there are no more entries, or -+ * (b) we have inserted at least one entry and the -+ * next hash value is not a continuation -+ */ -+ if ((ret == 0) || -+ (count && ((hashval & 1) == 0))) -+ break; -+ } -+ dx_release(frames); -+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", -+ count, *next_hash)); -+ return count; -+errout: -+ dx_release(frames); -+ return (err); -+} -+ -+ -+/* -+ * Directory block splitting, compacting -+ */ -+ -+static int dx_make_map (struct ext3cow_dir_entry_2 *de, int size, -+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) -+{ -+ int count = 0; -+ char *base = (char *) de; -+ struct dx_hash_info h = *hinfo; -+ -+ while ((char *) de < base + size) -+ { -+ if (de->name_len && de->inode) { -+ ext3cowfs_dirhash(de->name, de->name_len, &h); -+ map_tail--; -+ map_tail->hash = h.hash; -+ map_tail->offs = (u32) ((char *) de - base); -+ count++; -+ cond_resched(); -+ } -+ /* XXX: do we need to check rec_len == 0 case? -Chris */ -+ de = (struct ext3cow_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return count; -+} -+ -+static void dx_sort_map (struct dx_map_entry *map, unsigned count) -+{ -+ struct dx_map_entry *p, *q, *top = map + count - 1; -+ int more; -+ /* Combsort until bubble sort doesn't suck */ -+ while (count > 2) -+ { -+ count = count*10/13; -+ if (count - 9 < 2) /* 9, 10 -> 11 */ -+ count = 11; -+ for (p = top, q = p - count; q >= map; p--, q--) -+ if (p->hash < q->hash) -+ swap(*p, *q); -+ } -+ /* Garden variety bubble sort */ -+ do { -+ more = 0; -+ q = top; -+ while (q-- > map) -+ { -+ if (q[1].hash >= q[0].hash) -+ continue; -+ swap(*(q+1), *q); -+ more = 1; -+ } -+ } while(more); -+} -+ -+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) -+{ -+ struct dx_entry *entries = frame->entries; -+ struct dx_entry *old = frame->at, *new = old + 1; -+ int count = dx_get_count(entries); -+ -+ assert(count < dx_get_limit(entries)); -+ assert(old < entries + count); -+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); -+ dx_set_hash(new, hash); -+ dx_set_block(new, block); -+ dx_set_count(entries, count + 1); -+} -+#endif -+ -+ -+static void ext3cow_update_dx_flag(struct inode *inode) -+{ -+ if (!EXT3COW_HAS_COMPAT_FEATURE(inode->i_sb, -+ EXT3COW_FEATURE_COMPAT_DIR_INDEX)) -+ EXT3COW_I(inode)->i_flags &= ~EXT3COW_INDEX_FL; -+} -+ -+/* -+ * NOTE! unlike strncmp, ext3cow_match returns 1 for success, 0 for failure. -+ * -+ * `len <= EXT3COW_NAME_LEN' is guaranteed by caller. -+ * `de != NULL' is guaranteed by caller. 
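-+ *
-+ * Callers therefore test the result directly, e.g.
-+ *
-+ *	if (ext3cow_match (namelen, name, de) && EXT3COW_IS_DIRENT_ALIVE(de))
-+ *		... found a live entry ...
-+ *
-+ * as add_dirent_to_buf() does; search_dirblock() pairs it with
-+ * EXT3COW_IS_DIRENT_SCOPED() in the same way.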
-+ */ -+static inline int ext3cow_match (int len, const char * const name, -+ struct ext3cow_dir_entry_2 * de) -+{ -+ if (len != de->name_len) -+ return 0; -+ if (!de->inode) -+ return 0; -+ return !memcmp(name, de->name, len); -+} -+ -+/* -+ * Returns 0 if not found, -1 on failure, and 1 on success -+ */ -+/* For versioning - this is the function used when looking for -+ * names. We now handle names which include the flux token, -+ * strip it off and continue looking -znjp */ -+static inline int search_dirblock(struct buffer_head * bh, -+ struct inode *dir, -+ struct dentry *dentry, -+ unsigned long offset, -+ struct ext3cow_dir_entry_2 ** res_dir) -+{ -+ struct ext3cow_dir_entry_2 * de; -+ char * dlimit, * flux = NULL; -+ int de_len; -+ char name[EXT3COW_NAME_LEN]; -+ int namelen = dentry->d_name.len; -+ unsigned int epoch_number = EXT3COW_I_EPOCHNUMBER(dir); -+ -+ /* Get the name for the dentry */ -+ memcpy(name, dentry->d_name.name, namelen); -+ name[namelen] = '\0'; -+ -+ /* Check to see if the flux token is in the name */ -+ flux = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); -+ if(NULL != flux){ -+ /* If we're here, the name we want is in the past. */ -+ int new_namelen = strlen(dentry->d_name.name) - strlen(flux); -+ /* Get the epoch number */ -+ epoch_number = simple_strtol(&flux[1], (char **)NULL, 10) - 1; -+ /* If there's a valid epoch number or if we're version listing -+ * we need the name seperately, otherwise the FLUX_TOKEN exists -+ * in the file name */ -+ if(epoch_number + 1 == 0 && (strlen(flux) > 1)){ -+ /* EXT3COW_FLUX_TOKEN exists in the file name */ -+ epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); -+ }else{ -+ /* Grab the correct name and length */ -+ memcpy(name, dentry->d_name.name, new_namelen); -+ name[new_namelen] = '\0'; -+ namelen = strlen(name); -+ } -+ } -+ -+ -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ dlimit = bh->b_data + dir->i_sb->s_blocksize; -+ while ((char *) de < dlimit) { -+ /* this code is executed quadratically often */ -+ /* do minimal checking `by hand' */ -+ -+ /* Can't just return first entry of something; -+ * may exist twice if died and same name appears again. - znjp -+ */ -+ if ((char *) de + namelen <= dlimit && -+ ext3cow_match (namelen, name, de) && -+ EXT3COW_IS_DIRENT_SCOPED(de, epoch_number)) { -+ /* found a match - just to be sure, do a full check */ -+ if (!ext3cow_check_dir_entry("ext3cow_find_entry", -+ dir, de, bh, offset)) -+ return -1; -+ *res_dir = de; -+ return 1; -+ } -+ /* prevent looping on a bad block */ -+ de_len = le16_to_cpu(de->rec_len); -+ if (de_len <= 0) -+ return -1; -+ offset += de_len; -+ de = (struct ext3cow_dir_entry_2 *) ((char *) de + de_len); -+ } -+ return 0; -+} -+ -+ -+/* -+ * ext3cow_find_entry() -+ * -+ * finds an entry in the specified directory with the wanted name. It -+ * returns the cache buffer in which the entry was found, and the entry -+ * itself (as a parameter - res_dir). It does NOT read the inode of the -+ * entry - you'll have to do that yourself if you want to. -+ * -+ * The returned buffer_head has ->b_count elevated. The caller is expected -+ * to brelse() it when appropriate. 
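-+ *
-+ * A minimal caller sketch (error handling elided), mirroring what
-+ * ext3cow_lookup() below does with the result:
-+ *
-+ *	bh = ext3cow_find_entry(dentry, &de);
-+ *	if (bh) {
-+ *		ino = le32_to_cpu(de->inode);
-+ *		brelse(bh);
-+ *	}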
-+ */ -+static struct buffer_head * ext3cow_find_entry (struct dentry *dentry, -+ struct ext3cow_dir_entry_2 ** res_dir) -+{ -+ struct super_block * sb; -+ struct buffer_head * bh_use[NAMEI_RA_SIZE]; -+ struct buffer_head * bh, *ret = NULL; -+ unsigned long start, block, b; -+ int ra_max = 0; /* Number of bh's in the readahead -+ buffer, bh_use[] */ -+ int ra_ptr = 0; /* Current index into readahead -+ buffer */ -+ int num = 0; -+ int nblocks, i, err; -+ struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; -+ -+ *res_dir = NULL; -+ sb = dir->i_sb; -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3COW_NAME_LEN) -+ return NULL; -+#ifdef CONFIG_EXT3COW_INDEX -+ if (is_dx(dir)) { -+ bh = ext3cow_dx_find_entry(dentry, res_dir, &err); -+ /* -+ * On success, or if the error was file not found, -+ * return. Otherwise, fall back to doing a search the -+ * old fashioned way. -+ */ -+ if (bh || (err != ERR_BAD_DX_DIR)) -+ return bh; -+ dxtrace(printk("ext3cow_find_entry: dx failed, falling back\n")); -+ } -+#endif -+ nblocks = dir->i_size >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ start = EXT3COW_I(dir)->i_dir_start_lookup; -+ if (start >= nblocks) -+ start = 0; -+ block = start; -+restart: -+ do { -+ /* -+ * We deal with the read-ahead logic here. -+ */ -+ if (ra_ptr >= ra_max) { -+ /* Refill the readahead buffer */ -+ ra_ptr = 0; -+ b = block; -+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { -+ /* -+ * Terminate if we reach the end of the -+ * directory and must wrap, or if our -+ * search has finished at this block. -+ */ -+ if (b >= nblocks || (num && block == start)) { -+ bh_use[ra_max] = NULL; -+ break; -+ } -+ num++; -+ bh = ext3cow_getblk(NULL, dir, b++, 0, &err); -+ bh_use[ra_max] = bh; -+ if (bh) -+ ll_rw_block(READ_META, 1, &bh); -+ } -+ } -+ if ((bh = bh_use[ra_ptr++]) == NULL) -+ goto next; -+ wait_on_buffer(bh); -+ if (!buffer_uptodate(bh)) { -+ /* read error, skip block & hope for the best */ -+ ext3cow_error(sb, __FUNCTION__, "reading directory #%lu " -+ "offset %lu", dir->i_ino, block); -+ brelse(bh); -+ goto next; -+ } -+ i = search_dirblock(bh, dir, dentry, -+ block << EXT3COW_BLOCK_SIZE_BITS(sb), res_dir); -+ if (i == 1) { -+ EXT3COW_I(dir)->i_dir_start_lookup = block; -+ ret = bh; -+ goto cleanup_and_exit; -+ } else { -+ brelse(bh); -+ if (i < 0) -+ goto cleanup_and_exit; -+ } -+ next: -+ if (++block >= nblocks) -+ block = 0; -+ } while (block != start); -+ -+ /* -+ * If the directory has grown while we were searching, then -+ * search the last part of the directory before giving up. -+ */ -+ block = nblocks; -+ nblocks = dir->i_size >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ if (block < nblocks) { -+ start = 0; -+ goto restart; -+ } -+ -+cleanup_and_exit: -+ /* Clean up the read-ahead blocks */ -+ for (; ra_ptr < ra_max; ra_ptr++) -+ brelse (bh_use[ra_ptr]); -+ return ret; -+} -+ -+#ifdef CONFIG_EXT3COW_INDEX -+static struct buffer_head * ext3cow_dx_find_entry(struct dentry *dentry, -+ struct ext3cow_dir_entry_2 **res_dir, int *err) -+{ -+ struct super_block * sb; -+ struct dx_hash_info hinfo; -+ u32 hash; -+ struct dx_frame frames[2], *frame; -+ struct ext3cow_dir_entry_2 *de, *top; -+ struct buffer_head *bh; -+ unsigned long block; -+ int retval; -+ int namelen = dentry->d_name.len; -+ const u8 *name = dentry->d_name.name; -+ struct inode *dir = dentry->d_parent->d_inode; -+ -+ sb = dir->i_sb; -+ /* NFS may look up ".." 
- look at dx_root directory block */ -+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ -+ if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) -+ return NULL; -+ } else { -+ frame = frames; -+ frame->bh = NULL; /* for dx_release() */ -+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ -+ dx_set_block(frame->at, 0); /* dx_root block is 0 */ -+ } -+ hash = hinfo.hash; -+ do { -+ block = dx_get_block(frame->at); -+ if (!(bh = ext3cow_bread (NULL,dir, block, 0, err))) -+ goto errout; -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ top = (struct ext3cow_dir_entry_2 *) ((char *) de + sb->s_blocksize - -+ EXT3COW_DIR_REC_LEN(0)); -+ for (; de < top; de = ext3cow_next_entry(de)) -+ if (ext3cow_match (namelen, name, de)) { -+ if (!ext3cow_check_dir_entry("ext3cow_find_entry", -+ dir, de, bh, -+ (block<b_data))) { -+ brelse (bh); -+ goto errout; -+ } -+ *res_dir = de; -+ dx_release (frames); -+ return bh; -+ } -+ brelse (bh); -+ /* Check to see if we should continue to search */ -+ retval = ext3cow_htree_next_block(dir, hash, frame, -+ frames, NULL); -+ if (retval < 0) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "error reading index page in directory #%lu", -+ dir->i_ino); -+ *err = retval; -+ goto errout; -+ } -+ } while (retval == 1); -+ -+ *err = -ENOENT; -+errout: -+ dxtrace(printk("%s not found\n", name)); -+ dx_release (frames); -+ return NULL; -+} -+#endif -+ -+/* ext3cow_lookup: One the key functions of this versioning file sytem, -+ * allowing people to return to the past. -+ * -+ * Two policies for inode chains: -+ * 1) If it's the head of the list, it's the most current inode -+ * and always changable. The inode number is static. -+ * 2) If it's any inode in the chain that's not the head, -+ * than it's an inode in the past and unchangeable. The inode -+ * number may change. -+ */ -+static struct dentry *ext3cow_lookup(struct inode * dir, struct dentry *dentry, -+ struct nameidata *nd) -+{ -+ struct inode * inode = NULL; -+ struct ext3cow_dir_entry_2 * de = NULL; -+ struct buffer_head * bh = NULL; -+ unsigned int epoch_number = 0; -+ char * flux = NULL; -+ -+ if (dentry->d_name.len > EXT3COW_NAME_LEN) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ /* Find the epoch number to scope with -znjp -+ * if the parent is unchangeable, so is the inode -+ */ -+ if(EXT3COW_IS_UNCHANGEABLE(dir)) -+ epoch_number = EXT3COW_I_EPOCHNUMBER(dir); -+ else -+ epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); -+ -+ bh = ext3cow_find_entry(dentry, &de); -+ if (bh) { -+ unsigned long ino = le32_to_cpu(de->inode); -+ brelse (bh); -+ if (!ext3cow_valid_inum(dir->i_sb, ino)) { -+ ext3cow_error(dir->i_sb, "ext3cow_lookup", -+ "bad inode number: %lu", ino); -+ inode = NULL; -+ } else -+ inode = iget(dir->i_sb, ino); -+ -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ -+ /* Is this a version listing ? */ -+ if ((char)dentry->d_name.name[dentry->d_name.len - 1] == -+ EXT3COW_FLUX_TOKEN) { -+ /* prevent going round in circles */ -+ if (dentry->d_parent && -+ dentry->d_parent->d_name.name[dentry->d_parent->d_name.len - 1] == -+ EXT3COW_FLUX_TOKEN) { -+ return NULL; -+ } -+ /* we fake a directory using the directory inode instead of -+ * the file one and subsequently force a call to ext3cow_readdir */ -+ iput(inode); -+ inode = ext3cow_fake_inode(dir, EXT3COW_S_EPOCHNUMBER(dir->i_sb)); -+ EXT3COW_I(inode)->i_next_inode = EXT3COW_I(dir)->i_next_inode; -+ d_splice_alias(inode, dentry); -+ -+ return NULL; -+ } -+ -+ /* Is the user time-shifting to the past? 
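-+	 * A suffix after EXT3COW_FLUX_TOKEN picks the scope: "onehour",
-+	 * "yesterday"/"oneday", "oneweek", "onemonth" and "oneyear" map to
-+	 * get_seconds() minus that interval, while a numeric suffix N scopes
-+	 * to epoch N - 1; a request beyond the current superblock epoch is
-+	 * rejected with -ENOENT.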
*/ -+ flux = strrchr(dentry->d_name.name, EXT3COW_FLUX_TOKEN); -+ if(NULL != flux){ -+ -+ if(strnicmp(&flux[1], "onehour", 8) == 0){ -+ epoch_number = get_seconds() - ONEHOUR; -+ printk(KERN_INFO "ONEHOUR!\n"); -+ }else if(strnicmp(&flux[1], "yesterday", 10) == 0 || -+ strnicmp(&flux[1], "oneday", 7) == 0){ -+ epoch_number = get_seconds() - YESTERDAY; -+ }else if(strnicmp(&flux[1], "oneweek", 8) == 0){ -+ epoch_number = get_seconds() - ONEWEEK; -+ }else if(strnicmp(&flux[1], "onemonth", 9) == 0){ -+ epoch_number = get_seconds() - ONEMONTH; -+ }else if(strnicmp(&flux[1], "oneyear", 8) == 0){ -+ epoch_number = get_seconds() - ONEYEAR; -+ }else -+ epoch_number = simple_strtol(&flux[1], (char **)NULL, 10) - 1; -+ -+ /* No future epochs */ -+ if(epoch_number + 1 > EXT3COW_S_EPOCHNUMBER(dir->i_sb)) -+ return ERR_PTR(-ENOENT); -+ -+ /* Move to present -+ if(epoch_number + 1 == 0) -+ epoch_number = EXT3COW_S_EPOCHNUMBER(dir->i_sb); -+ */ -+ } -+ -+ /* Find correct inode in chain */ -+ while(EXT3COW_I_EPOCHNUMBER(inode) > epoch_number){ -+ -+ printk(KERN_INFO "Looking for %u with epoch %u\n", epoch_number, -+ EXT3COW_I_EPOCHNUMBER(inode)); -+ -+ ino = EXT3COW_I(inode)->i_next_inode; -+ if(ino == 0){ -+ ext3cow_warning(dir->i_sb, "ext3cow_lookup", -+ "Next inode is 0 in lookup."); -+ iput(inode); -+ return ERR_PTR(-ENOENT); -+ } -+ iput(inode); /* for correct usage count (i_count) */ -+ inode = iget(dir->i_sb, ino); -+ -+ if (!inode){ -+ ext3cow_warning(dir->i_sb, "ext3cow_lookup", -+ "Could not access inode number %lu", -+ ino); -+ return ERR_PTR(-EACCES); -+ } -+ } -+ -+ /* If we're in the past, fake the inode for scoping and "unchangability" */ -+ if(flux || (epoch_number != EXT3COW_S_EPOCHNUMBER(dir->i_sb))){ -+ printk(KERN_INFO "Faking %s\n", dentry->d_name.name); -+ inode = ext3cow_fake_inode(inode, epoch_number); -+ } -+ -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ } -+ return d_splice_alias(inode, dentry); -+} -+ -+ -+struct dentry *ext3cow_get_parent(struct dentry *child) -+{ -+ unsigned long ino; -+ struct dentry *parent; -+ struct inode *inode; -+ struct dentry dotdot; -+ struct ext3cow_dir_entry_2 * de; -+ struct buffer_head *bh; -+ -+ dotdot.d_name.name = ".."; -+ dotdot.d_name.len = 2; -+ dotdot.d_parent = child; /* confusing, isn't it! 
*/ -+ -+ bh = ext3cow_find_entry(&dotdot, &de); -+ inode = NULL; -+ if (!bh) -+ return ERR_PTR(-ENOENT); -+ ino = le32_to_cpu(de->inode); -+ brelse(bh); -+ -+ if (!ext3cow_valid_inum(child->d_inode->i_sb, ino)) { -+ ext3cow_error(child->d_inode->i_sb, "ext3cow_get_parent", -+ "bad inode number: %lu", ino); -+ inode = NULL; -+ } else -+ inode = iget(child->d_inode->i_sb, ino); -+ -+ if (!inode) -+ return ERR_PTR(-EACCES); -+ -+ parent = d_alloc_anon(inode); -+ if (!parent) { -+ iput(inode); -+ parent = ERR_PTR(-ENOMEM); -+ } -+ return parent; -+} -+ -+#define S_SHIFT 12 -+static unsigned char ext3cow_type_by_mode[S_IFMT >> S_SHIFT] = { -+ [S_IFREG >> S_SHIFT] = EXT3COW_FT_REG_FILE, -+ [S_IFDIR >> S_SHIFT] = EXT3COW_FT_DIR, -+ [S_IFCHR >> S_SHIFT] = EXT3COW_FT_CHRDEV, -+ [S_IFBLK >> S_SHIFT] = EXT3COW_FT_BLKDEV, -+ [S_IFIFO >> S_SHIFT] = EXT3COW_FT_FIFO, -+ [S_IFSOCK >> S_SHIFT] = EXT3COW_FT_SOCK, -+ [S_IFLNK >> S_SHIFT] = EXT3COW_FT_SYMLINK, -+}; -+ -+static inline void ext3cow_set_de_type(struct super_block *sb, -+ struct ext3cow_dir_entry_2 *de, -+ umode_t mode) { -+ if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_FILETYPE)) -+ de->file_type = ext3cow_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -+} -+ -+#ifdef CONFIG_EXT3COW_INDEX -+static struct ext3cow_dir_entry_2 * -+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) -+{ -+ unsigned rec_len = 0; -+ -+ while (count--) { -+ struct ext3cow_dir_entry_2 *de = (struct ext3cow_dir_entry_2 *) (from + map->offs); -+ rec_len = EXT3COW_DIR_REC_LEN(de->name_len); -+ memcpy (to, de, rec_len); -+ ((struct ext3cow_dir_entry_2 *) to)->rec_len = -+ cpu_to_le16(rec_len); -+ de->inode = 0; -+ map++; -+ to += rec_len; -+ } -+ return (struct ext3cow_dir_entry_2 *) (to - rec_len); -+} -+ -+static struct ext3cow_dir_entry_2* dx_pack_dirents(char *base, int size) -+{ -+ struct ext3cow_dir_entry_2 *next, *to, *prev, *de = (struct ext3cow_dir_entry_2 *) base; -+ unsigned rec_len = 0; -+ -+ prev = to = de; -+ while ((char*)de < base + size) { -+ next = (struct ext3cow_dir_entry_2 *) ((char *) de + -+ le16_to_cpu(de->rec_len)); -+ if (de->inode && de->name_len) { -+ rec_len = EXT3COW_DIR_REC_LEN(de->name_len); -+ if (de > to) -+ memmove(to, de, rec_len); -+ to->rec_len = cpu_to_le16(rec_len); -+ prev = to; -+ to = (struct ext3cow_dir_entry_2 *) (((char *) to) + rec_len); -+ } -+ de = next; -+ } -+ return prev; -+} -+ -+static struct ext3cow_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, -+ struct buffer_head **bh,struct dx_frame *frame, -+ struct dx_hash_info *hinfo, int *error) -+{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; -+ struct buffer_head *bh2; -+ u32 newblock; -+ u32 hash2; -+ struct dx_map_entry *map; -+ char *data1 = (*bh)->b_data, *data2; -+ unsigned split; -+ struct ext3cow_dir_entry_2 *de = NULL, *de2; -+ int err; -+ -+ bh2 = ext3cow_append (handle, dir, &newblock, error); -+ if (!(bh2)) { -+ brelse(*bh); -+ *bh = NULL; -+ goto errout; -+ } -+ -+ BUFFER_TRACE(*bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, *bh); -+ if (err) { -+ journal_error: -+ brelse(*bh); -+ brelse(bh2); -+ *bh = NULL; -+ ext3cow_std_error(dir->i_sb, err); -+ goto errout; -+ } -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ -+ data2 = bh2->b_data; -+ -+ /* create map in the end of data2 block */ -+ map = (struct dx_map_entry *) (data2 + blocksize); -+ count = dx_make_map 
((struct ext3cow_dir_entry_2 *) data1, -+ blocksize, hinfo, map); -+ map -= count; -+ split = count/2; // need to adjust to actual middle -+ dx_sort_map (map, count); -+ hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; -+ dxtrace(printk("Split block %i at %x, %i/%i\n", -+ dx_get_block(frame->at), hash2, split, count-split)); -+ -+ /* Fancy dance to stay within two buffers */ -+ de2 = dx_move_dirents(data1, data2, map + split, count - split); -+ de = dx_pack_dirents(data1,blocksize); -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3cow_dir_entry_2 *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf (hinfo, (struct ext3cow_dir_entry_2 *) data2, blocksize, 1)); -+ -+ /* Which block gets the new entry? */ -+ if (hinfo->hash >= hash2) -+ { -+ swap(*bh, bh2); -+ de = de2; -+ } -+ dx_insert_block (frame, hash2 + continued, newblock); -+ err = ext3cow_journal_dirty_metadata (handle, bh2); -+ if (err) -+ goto journal_error; -+ err = ext3cow_journal_dirty_metadata (handle, frame->bh); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ dxtrace(dx_show_index ("frame", frame->entries)); -+errout: -+ return de; -+} -+#endif -+ -+ -+/* -+ * Add a new entry into a directory (leaf) block. If de is non-NULL, -+ * it points to a directory entry which is guaranteed to be large -+ * enough for new directory entry. If de is NULL, then -+ * add_dirent_to_buf will attempt search the directory block for -+ * space. It will return -ENOSPC if no space is available, and -EIO -+ * and -EEXIST if directory entry already exists. -+ * -+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In -+ * all other cases bh is released. -+ */ -+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct ext3cow_dir_entry_2 *de, -+ struct buffer_head * bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ unsigned long offset = 0; -+ unsigned short reclen; -+ int nlen, rlen, err; -+ char *top; -+ -+ reclen = EXT3COW_DIR_REC_LEN(namelen); -+ if (!de) { -+ de = (struct ext3cow_dir_entry_2 *)bh->b_data; -+ top = bh->b_data + dir->i_sb->s_blocksize - reclen; -+ while ((char *) de <= top) { -+ if (!ext3cow_check_dir_entry("ext3cow_add_entry", dir, de, -+ bh, offset)) { -+ brelse (bh); -+ ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); -+ return -EIO; -+ } -+ /* If name exists and it's still alive, no add. But if it's a new -+ * name in this scope, ok to add. -znjp */ -+ if (ext3cow_match (namelen, name, de) && EXT3COW_IS_DIRENT_ALIVE(de)) { -+ brelse (bh); -+ return -EEXIST; -+ } -+ nlen = EXT3COW_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? 
rlen - nlen: rlen) >= reclen) -+ break; -+ de = (struct ext3cow_dir_entry_2 *)((char *)de + rlen); -+ offset += rlen; -+ } -+ if ((char *) de > top) -+ return -ENOSPC; -+ } -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, bh); -+ if (err) { -+ ext3cow_std_error(dir->i_sb, err); -+ brelse(bh); -+ return err; -+ } -+ -+ /* By now the buffer is marked for journaling */ -+ nlen = EXT3COW_DIR_REC_LEN(de->name_len); -+ rlen = le16_to_cpu(de->rec_len); -+ if (de->inode) { -+ struct ext3cow_dir_entry_2 *de1 = (struct ext3cow_dir_entry_2 *)((char *)de + nlen); -+ de1->rec_len = cpu_to_le16(rlen - nlen); -+ de->rec_len = cpu_to_le16(nlen); -+ de = de1; -+ } -+ de->file_type = EXT3COW_FT_UNKNOWN; -+ if (inode) { -+ de->inode = cpu_to_le32(inode->i_ino); -+ ext3cow_set_de_type(dir->i_sb, de, inode->i_mode); -+ } else -+ de->inode = 0; -+ /* For versioning -znjp */ -+ de->birth_epoch = cpu_to_le32(EXT3COW_S_EPOCHNUMBER(dir->i_sb)); -+ de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); -+ /* -+ * XXX shouldn't update any times until successful -+ * completion of syscall, but too many callers depend -+ * on this. -+ * -+ * XXX similarly, too many callers depend on -+ * ext3cow_new_inode() setting the times, but error -+ * recovery deletes the inode, so the worst that can -+ * happen is that the times are slightly out of date -+ * and/or different from the directory change time. -+ */ -+ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; -+ ext3cow_update_dx_flag(dir); -+ dir->i_version++; -+ ext3cow_mark_inode_dirty(handle, dir); -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ err = ext3cow_journal_dirty_metadata(handle, bh); -+ if (err) -+ ext3cow_std_error(dir->i_sb, err); -+ brelse(bh); -+ return 0; -+} -+ -+#ifdef CONFIG_EXT3COW_INDEX -+/* -+ * This converts a one block unindexed directory to a 3 block indexed -+ * directory, and adds the dentry to the indexed directory. 
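-+ *
-+ * Roughly, per the code below: block 0 is rewritten as the dx_root,
-+ * keeping only "." and ".." plus the index; the existing dirents are
-+ * copied into a newly appended leaf block, which do_split() then divides
-+ * in two, and the new entry is added to whichever half its hash falls in.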
-+ */ -+static int make_indexed_dir(handle_t *handle, struct dentry *dentry, -+ struct inode *inode, struct buffer_head *bh) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ const char *name = dentry->d_name.name; -+ int namelen = dentry->d_name.len; -+ struct buffer_head *bh2; -+ struct dx_root *root; -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries; -+ struct ext3cow_dir_entry_2 *de, *de2; -+ char *data1, *top; -+ unsigned len; -+ int retval; -+ unsigned blocksize; -+ struct dx_hash_info hinfo; -+ u32 block; -+ struct fake_dirent *fde; -+ -+ blocksize = dir->i_sb->s_blocksize; -+ dxtrace(printk("Creating index\n")); -+ retval = ext3cow_journal_get_write_access(handle, bh); -+ if (retval) { -+ ext3cow_std_error(dir->i_sb, retval); -+ brelse(bh); -+ return retval; -+ } -+ root = (struct dx_root *) bh->b_data; -+ -+ bh2 = ext3cow_append (handle, dir, &block, &retval); -+ if (!(bh2)) { -+ brelse(bh); -+ return retval; -+ } -+ EXT3COW_I(dir)->i_flags |= EXT3COW_INDEX_FL; -+ data1 = bh2->b_data; -+ -+ /* The 0th block becomes the root, move the dirents out */ -+ fde = &root->dotdot; -+ de = (struct ext3cow_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len)); -+ len = ((char *) root) + blocksize - (char *) de; -+ memcpy (data1, de, len); -+ de = (struct ext3cow_dir_entry_2 *) data1; -+ top = data1 + len; -+ while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top) -+ de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ /* Initialize the root; the dot dirents already exist */ -+ de = (struct ext3cow_dir_entry_2 *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3COW_DIR_REC_LEN(2)); -+ memset (&root->info, 0, sizeof(root->info)); -+ root->info.info_length = sizeof(root->info); -+ root->info.hash_version = EXT3COW_SB(dir->i_sb)->s_def_hash_version; -+ entries = root->entries; -+ dx_set_block (entries, 1); -+ dx_set_count (entries, 1); -+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); -+ -+ /* Initialize as for dx_probe */ -+ hinfo.hash_version = root->info.hash_version; -+ hinfo.seed = EXT3COW_SB(dir->i_sb)->s_hash_seed; -+ ext3cowfs_dirhash(name, namelen, &hinfo); -+ frame = frames; -+ frame->entries = entries; -+ frame->at = entries; -+ frame->bh = bh; -+ bh = bh2; -+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); -+ dx_release (frames); -+ if (!(de)) -+ return retval; -+ -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+#endif -+ -+/* -+ * ext3cow_add_entry() -+ * -+ * adds a file entry to the specified directory, using the same -+ * semantics as ext3cow_find_entry(). It returns NULL if it failed. -+ * -+ * NOTE!! The inode part of 'de' is left at 0 - which means you -+ * may not sleep between calling this and putting something into -+ * the entry, as someone else might have used it while you slept. 
-+ */ -+static int ext3cow_add_entry (handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct inode *dir = dentry->d_parent->d_inode; -+ unsigned long offset; -+ struct buffer_head * bh; -+ struct ext3cow_dir_entry_2 *de; -+ struct super_block * sb; -+ int retval; -+#ifdef CONFIG_EXT3COW_INDEX -+ int dx_fallback=0; -+#endif -+ unsigned blocksize; -+ u32 block, blocks; -+ -+ sb = dir->i_sb; -+ blocksize = sb->s_blocksize; -+ if (!dentry->d_name.len) -+ return -EINVAL; -+ /* No additions in the past -znjp */ -+ if(is_unchangeable(dir, dentry)) -+ return -EROFS; -+ -+ if(EXT3COW_S_EPOCHNUMBER(sb) > EXT3COW_I_EPOCHNUMBER(dir)){ -+ if(ext3cow_dup_inode(dentry->d_parent->d_parent->d_inode, dir)) -+ return -1; -+ } -+ -+#ifdef CONFIG_EXT3COW_INDEX -+ if (is_dx(dir)) { -+ retval = ext3cow_dx_add_entry(handle, dentry, inode); -+ if (!retval || (retval != ERR_BAD_DX_DIR)){ -+ ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); -+ return retval; -+ } -+ EXT3COW_I(dir)->i_flags &= ~EXT3COW_INDEX_FL; -+ dx_fallback++; -+ ext3cow_mark_inode_dirty(handle, dir); -+ } -+#endif -+ blocks = dir->i_size >> sb->s_blocksize_bits; -+ for (block = 0, offset = 0; block < blocks; block++) { -+ bh = ext3cow_bread(handle, dir, block, 0, &retval); -+ if(!bh){ -+ ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); -+ return retval; -+ } -+ retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); -+ if (retval != -ENOSPC) -+ return retval; -+ -+#ifdef CONFIG_EXT3COW_INDEX -+ if (blocks == 1 && !dx_fallback && -+ EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_DIR_INDEX)) -+ return make_indexed_dir(handle, dentry, inode, bh); -+#endif -+ brelse(bh); -+ } -+ -+ bh = ext3cow_append(handle, dir, &block, &retval); -+ if (!bh){ -+ ext3cow_reclaim_dup_inode(dentry->d_parent->d_parent->d_inode, dir); -+ return retval; -+ } -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ de->inode = 0; -+ de->rec_len = cpu_to_le16(blocksize); -+ return add_dirent_to_buf(handle, dentry, inode, de, bh); -+} -+ -+#ifdef CONFIG_EXT3COW_INDEX -+/* -+ * Returns 0 for success, or a negative error value -+ */ -+static int ext3cow_dx_add_entry(handle_t *handle, struct dentry *dentry, -+ struct inode *inode) -+{ -+ struct dx_frame frames[2], *frame; -+ struct dx_entry *entries, *at; -+ struct dx_hash_info hinfo; -+ struct buffer_head * bh; -+ struct inode *dir = dentry->d_parent->d_inode; -+ struct super_block * sb = dir->i_sb; -+ struct ext3cow_dir_entry_2 *de; -+ int err; -+ -+ frame = dx_probe(dentry, NULL, &hinfo, frames, &err); -+ if (!frame) -+ return err; -+ entries = frame->entries; -+ at = frame->at; -+ -+ if (!(bh = ext3cow_bread(handle,dir, dx_get_block(frame->at), 0, &err))) -+ goto cleanup; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, bh); -+ if (err) -+ goto journal_error; -+ -+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); -+ if (err != -ENOSPC) { -+ bh = NULL; -+ goto cleanup; -+ } -+ -+ /* Block full, should compress but for now just split */ -+ dxtrace(printk("using %u of %u node entries\n", -+ dx_get_count(entries), dx_get_limit(entries))); -+ /* Need to split index? 
*/ -+ if (dx_get_count(entries) == dx_get_limit(entries)) { -+ u32 newblock; -+ unsigned icount = dx_get_count(entries); -+ int levels = frame - frames; -+ struct dx_entry *entries2; -+ struct dx_node *node2; -+ struct buffer_head *bh2; -+ -+ if (levels && (dx_get_count(frames->entries) == -+ dx_get_limit(frames->entries))) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "Directory index full!"); -+ err = -ENOSPC; -+ goto cleanup; -+ } -+ bh2 = ext3cow_append (handle, dir, &newblock, &err); -+ if (!(bh2)) -+ goto cleanup; -+ node2 = (struct dx_node *)(bh2->b_data); -+ entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); -+ node2->fake.inode = 0; -+ BUFFER_TRACE(frame->bh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, frame->bh); -+ if (err) -+ goto journal_error; -+ if (levels) { -+ unsigned icount1 = icount/2, icount2 = icount - icount1; -+ unsigned hash2 = dx_get_hash(entries + icount1); -+ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); -+ -+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ -+ err = ext3cow_journal_get_write_access(handle, -+ frames[0].bh); -+ if (err) -+ goto journal_error; -+ -+ memcpy ((char *) entries2, (char *) (entries + icount1), -+ icount2 * sizeof(struct dx_entry)); -+ dx_set_count (entries, icount1); -+ dx_set_count (entries2, icount2); -+ dx_set_limit (entries2, dx_node_limit(dir)); -+ -+ /* Which index block gets the new entry? */ -+ if (at - entries >= icount1) { -+ frame->at = at = at - entries - icount1 + entries2; -+ frame->entries = entries = entries2; -+ swap(frame->bh, bh2); -+ } -+ dx_insert_block (frames + 0, hash2, newblock); -+ dxtrace(dx_show_index ("node", frames[1].entries)); -+ dxtrace(dx_show_index ("node", -+ ((struct dx_node *) bh2->b_data)->entries)); -+ err = ext3cow_journal_dirty_metadata(handle, bh2); -+ if (err) -+ goto journal_error; -+ brelse (bh2); -+ } else { -+ dxtrace(printk("Creating second level index...\n")); -+ memcpy((char *) entries2, (char *) entries, -+ icount * sizeof(struct dx_entry)); -+ dx_set_limit(entries2, dx_node_limit(dir)); -+ -+ /* Set up root */ -+ dx_set_count(entries, 1); -+ dx_set_block(entries + 0, newblock); -+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; -+ -+ /* Add new access path frame */ -+ frame = frames + 1; -+ frame->at = at = at - entries + entries2; -+ frame->entries = entries = entries2; -+ frame->bh = bh2; -+ err = ext3cow_journal_get_write_access(handle, -+ frame->bh); -+ if (err) -+ goto journal_error; -+ } -+ ext3cow_journal_dirty_metadata(handle, frames[0].bh); -+ } -+ de = do_split(handle, dir, &bh, frame, &hinfo, &err); -+ if (!de) -+ goto cleanup; -+ err = add_dirent_to_buf(handle, dentry, inode, de, bh); -+ bh = NULL; -+ goto cleanup; -+ -+journal_error: -+ ext3cow_std_error(dir->i_sb, err); -+cleanup: -+ if (bh) -+ brelse(bh); -+ dx_release(frames); -+ return err; -+} -+#endif -+ -+/* -+ * ext3cow_delete_entry deletes a directory entry by merging it with the -+ * previous entry -+ */ -+static int ext3cow_delete_entry (handle_t *handle, -+ struct inode * dir, -+ struct ext3cow_dir_entry_2 * de_del, -+ struct buffer_head * bh, -+ struct dentry *dentry) -+{ -+ struct ext3cow_dir_entry_2 * de, * pde; -+ int i; -+ -+ i = 0; -+ pde = NULL; -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ while (i < bh->b_size) { -+ if (!ext3cow_check_dir_entry("ext3cow_delete_entry", dir, de, bh, i)) -+ return -EIO; -+ if (de == de_del) { -+ /* Can't delete an already dead entry - znjp */ -+ 
if(!EXT3COW_IS_DIRENT_ALIVE(de)) -+ return 0; -+ -+ if(EXT3COW_S_EPOCHNUMBER(dir->i_sb) > EXT3COW_I_EPOCHNUMBER(dir)){ -+ if(ext3cow_dup_inode(dentry->d_parent->d_parent->d_inode, dir)) -+ return -1; -+ } -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ ext3cow_journal_get_write_access(handle, bh); -+ /* There used to be code here to adjust the rec_len -+ * but since names really never go away, the code was deleted -+ if (pde) -+ pde->rec_len = -+ cpu_to_le16(le16_to_cpu(pde->rec_len) + -+ le16_to_cpu(de->rec_len)); -+ else -+ de->inode = 0; -+ */ -+ /* Mark it dead - znjp */ -+ de->death_epoch = cpu_to_le32(EXT3COW_I_EPOCHNUMBER(dir)); -+ dir->i_version++; -+ BUFFER_TRACE(bh, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, bh); -+ return 0; -+ } -+ i += le16_to_cpu(de->rec_len); -+ pde = de; -+ de = (struct ext3cow_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ return -ENOENT; -+} -+ -+/* -+ * ext3cow_mark_inode_dirty is somewhat expensive, so unlike ext2 we -+ * do not perform it in these functions. We perform it at the call site, -+ * if it is needed. -+ */ -+static inline void ext3cow_inc_count(handle_t *handle, struct inode *inode) -+{ -+ inc_nlink(inode); -+} -+ -+static inline void ext3cow_dec_count(handle_t *handle, struct inode *inode) -+{ -+ drop_nlink(inode); -+} -+ -+static int ext3cow_add_nondir(handle_t *handle, -+ struct dentry *dentry, struct inode *inode) -+{ -+ int err = ext3cow_add_entry(handle, dentry, inode); -+ if (!err) { -+ ext3cow_mark_inode_dirty(handle, inode); -+ d_instantiate(dentry, inode); -+ return 0; -+ } -+ ext3cow_dec_count(handle, inode); -+ iput(inode); -+ return err; -+} -+ -+/* -+ * By the time this is called, we already have created -+ * the directory cache entry for the new file, but it -+ * is so far negative - it has no inode. -+ * -+ * If the create succeeds, we fill in the inode information -+ * with d_instantiate(). 
-+ */ -+static int ext3cow_create (struct inode * dir, struct dentry * dentry, int mode, -+ struct nameidata *nd) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int err, retries = 0; -+ -+ /* Can't create in the past -znjp */ -+ if(is_unchangeable(dir, dentry)) -+ return -EROFS; -+ -+retry: -+ handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + -+ 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3cow_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ inode->i_op = &ext3cow_file_inode_operations; -+ inode->i_fop = &ext3cow_file_operations; -+ ext3cow_set_aops(inode); -+ err = ext3cow_add_nondir(handle, dentry, inode); -+ } -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+} -+ -+static int ext3cow_mknod (struct inode * dir, struct dentry *dentry, -+ int mode, dev_t rdev) -+{ -+ handle_t *handle; -+ struct inode *inode; -+ int err, retries = 0; -+ -+ if (!new_valid_dev(rdev)) -+ return -EINVAL; -+ -+retry: -+ handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + -+ 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3cow_new_inode (handle, dir, mode); -+ err = PTR_ERR(inode); -+ if (!IS_ERR(inode)) { -+ init_special_inode(inode, inode->i_mode, rdev); -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ inode->i_op = &ext3cow_special_inode_operations; -+#endif -+ err = ext3cow_add_nondir(handle, dentry, inode); -+ } -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+} -+ -+static int ext3cow_mkdir(struct inode * dir, struct dentry * dentry, int mode) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ struct buffer_head * dir_block; -+ struct ext3cow_dir_entry_2 * de; -+ int err, retries = 0; -+ -+ if (dir->i_nlink >= EXT3COW_LINK_MAX) -+ return -EMLINK; -+ /* No mkdirs in the past -znjp */ -+ if(is_unchangeable(dir, dentry)) -+ return -EROFS; -+ -+ -+retry: -+ handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + -+ 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3cow_new_inode (handle, dir, S_IFDIR | mode); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ inode->i_op = &ext3cow_dir_inode_operations; -+ inode->i_fop = &ext3cow_dir_operations; -+ inode->i_size = EXT3COW_I(inode)->i_disksize = inode->i_sb->s_blocksize; -+ dir_block = ext3cow_bread (handle, inode, 0, 1, &err); -+ if (!dir_block) { -+ drop_nlink(inode); /* is this nlink == 0? 
*/ -+ ext3cow_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } -+ BUFFER_TRACE(dir_block, "get_write_access"); -+ ext3cow_journal_get_write_access(handle, dir_block); -+ de = (struct ext3cow_dir_entry_2 *) dir_block->b_data; -+ de->inode = cpu_to_le32(inode->i_ino); -+ de->name_len = 1; -+ de->rec_len = cpu_to_le16(EXT3COW_DIR_REC_LEN(de->name_len)); -+ /* For versioning -znjp */ -+ de->birth_epoch = cpu_to_le32(EXT3COW_S_EPOCHNUMBER(dir->i_sb)); -+ de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); -+ strcpy (de->name, "."); -+ ext3cow_set_de_type(dir->i_sb, de, S_IFDIR); -+ de = (struct ext3cow_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ de->inode = cpu_to_le32(dir->i_ino); -+ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3COW_DIR_REC_LEN(1)); -+ de->name_len = 2; -+ strcpy (de->name, ".."); -+ ext3cow_set_de_type(dir->i_sb, de, S_IFDIR); -+ inode->i_nlink = 2; -+ /* For versioning -znjp */ -+ de->birth_epoch = cpu_to_le32(EXT3COW_I_EPOCHNUMBER(dir)); -+ de->death_epoch = cpu_to_le32(EXT3COW_DIRENT_ALIVE); -+ BUFFER_TRACE(dir_block, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, dir_block); -+ brelse (dir_block); -+ ext3cow_mark_inode_dirty(handle, inode); -+ err = ext3cow_add_entry (handle, dentry, inode); -+ if (err) { -+ inode->i_nlink = 0; -+ ext3cow_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } -+ inc_nlink(dir); -+ ext3cow_update_dx_flag(dir); -+ ext3cow_mark_inode_dirty(handle, dir); -+ d_instantiate(dentry, inode); -+out_stop: -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+} -+ -+/* -+ * routine to check that the specified directory is empty (for rmdir) -+ */ -+static int empty_dir (struct inode * inode) -+{ -+ unsigned long offset; -+ struct buffer_head * bh; -+ struct ext3cow_dir_entry_2 * de, * de1; -+ struct super_block * sb; -+ int err = 0; -+ -+ sb = inode->i_sb; -+ if (inode->i_size < EXT3COW_DIR_REC_LEN(1) + EXT3COW_DIR_REC_LEN(2) || -+ !(bh = ext3cow_bread (NULL, inode, 0, 0, &err))) { -+ if (err) -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "error %d reading directory #%lu offset 0", -+ err, inode->i_ino); -+ else -+ ext3cow_warning(inode->i_sb, __FUNCTION__, -+ "bad directory (dir #%lu) - no data block", -+ inode->i_ino); -+ return 1; -+ } -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ de1 = (struct ext3cow_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ if (le32_to_cpu(de->inode) != inode->i_ino || -+ !le32_to_cpu(de1->inode) || -+ strcmp (".", de->name) || -+ strcmp ("..", de1->name)) { -+ ext3cow_warning (inode->i_sb, "empty_dir", -+ "bad directory (dir #%lu) - no `.' 
or `..'", -+ inode->i_ino); -+ brelse (bh); -+ return 1; -+ } -+ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); -+ de = (struct ext3cow_dir_entry_2 *) -+ ((char *) de1 + le16_to_cpu(de1->rec_len)); -+ while (offset < inode->i_size ) { -+ if (!bh || -+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { -+ err = 0; -+ brelse (bh); -+ bh = ext3cow_bread (NULL, inode, -+ offset >> EXT3COW_BLOCK_SIZE_BITS(sb), 0, &err); -+ if (!bh) { -+ if (err) -+ ext3cow_error(sb, __FUNCTION__, -+ "error %d reading directory" -+ " #%lu offset %lu", -+ err, inode->i_ino, offset); -+ offset += sb->s_blocksize; -+ continue; -+ } -+ de = (struct ext3cow_dir_entry_2 *) bh->b_data; -+ } -+ if (!ext3cow_check_dir_entry("empty_dir", inode, de, bh, offset)) { -+ de = (struct ext3cow_dir_entry_2 *)(bh->b_data + -+ sb->s_blocksize); -+ offset = (offset | (sb->s_blocksize - 1)) + 1; -+ continue; -+ } -+ /* Can remove a dir only if all dirents are out of scope -znjp */ -+ if (le32_to_cpu(de->inode) && -+ EXT3COW_IS_DIRENT_SCOPED(de, EXT3COW_I_EPOCHNUMBER(inode))) { -+ brelse (bh); -+ return 0; -+ } -+ offset += le16_to_cpu(de->rec_len); -+ de = (struct ext3cow_dir_entry_2 *) -+ ((char *) de + le16_to_cpu(de->rec_len)); -+ } -+ brelse (bh); -+ return 1; -+} -+ -+/* ext3cow_orphan_add() links an unlinked or truncated inode into a list of -+ * such inodes, starting at the superblock, in case we crash before the -+ * file is closed/deleted, or in case the inode truncate spans multiple -+ * transactions and the last transaction is not recovered after a crash. -+ * -+ * At filesystem recovery time, we walk this list deleting unlinked -+ * inodes and truncating linked inodes in ext3cow_orphan_cleanup(). -+ */ -+int ext3cow_orphan_add(handle_t *handle, struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct ext3cow_iloc iloc; -+ int err = 0, rc; -+ -+ lock_super(sb); -+ if (!list_empty(&EXT3COW_I(inode)->i_orphan)) -+ goto out_unlock; -+ -+ /* Orphan handling is only valid for files with data blocks -+ * being truncated, or files being unlinked. */ -+ -+ /* @@@ FIXME: Observation from aviro: -+ * I think I can trigger J_ASSERT in ext3cow_orphan_add(). We block -+ * here (on lock_super()), so race with ext3cow_link() which might bump -+ * ->i_nlink. For, say it, character device. Not a regular file, -+ * not a directory, not a symlink and ->i_nlink > 0. -+ */ -+ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -+ -+ BUFFER_TRACE(EXT3COW_SB(sb)->s_sbh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh); -+ if (err) -+ goto out_unlock; -+ -+ err = ext3cow_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_unlock; -+ -+ /* Insert this inode at the head of the on-disk orphan list... */ -+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3COW_SB(sb)->s_es->s_last_orphan); -+ EXT3COW_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); -+ err = ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); -+ rc = ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ if (!err) -+ err = rc; -+ -+ /* Only add to the head of the in-memory list if all the -+ * previous operations succeeded. If the orphan_add is going to -+ * fail (possibly taking the journal offline), we can't risk -+ * leaving the inode on the orphan list: stray orphan-list -+ * entries can cause panics at unmount time. -+ * -+ * This is safe: on error we're going to ignore the orphan list -+ * anyway on the next recovery. 
*/ -+ if (!err) -+ list_add(&EXT3COW_I(inode)->i_orphan, &EXT3COW_SB(sb)->s_orphan); -+ -+ jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); -+ jbd_debug(4, "orphan inode %lu will point to %d\n", -+ inode->i_ino, NEXT_ORPHAN(inode)); -+out_unlock: -+ unlock_super(sb); -+ ext3cow_std_error(inode->i_sb, err); -+ return err; -+} -+ -+/* -+ * ext3cow_orphan_del() removes an unlinked or truncated inode from the list -+ * of such inodes stored on disk, because it is finally being cleaned up. -+ */ -+int ext3cow_orphan_del(handle_t *handle, struct inode *inode) -+{ -+ struct list_head *prev; -+ struct ext3cow_inode_info *ei = EXT3COW_I(inode); -+ struct ext3cow_sb_info *sbi; -+ unsigned long ino_next; -+ struct ext3cow_iloc iloc; -+ int err = 0; -+ -+ lock_super(inode->i_sb); -+ if (list_empty(&ei->i_orphan)) { -+ unlock_super(inode->i_sb); -+ return 0; -+ } -+ -+ ino_next = NEXT_ORPHAN(inode); -+ prev = ei->i_orphan.prev; -+ sbi = EXT3COW_SB(inode->i_sb); -+ -+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); -+ -+ list_del_init(&ei->i_orphan); -+ -+ /* If we're on an error path, we may not have a valid -+ * transaction handle with which to update the orphan list on -+ * disk, but we still need to remove the inode from the linked -+ * list in memory. */ -+ if (!handle) -+ goto out; -+ -+ err = ext3cow_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto out_err; -+ -+ if (prev == &sbi->s_orphan) { -+ jbd_debug(4, "superblock will point to %lu\n", ino_next); -+ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); -+ err = ext3cow_journal_get_write_access(handle, sbi->s_sbh); -+ if (err) -+ goto out_brelse; -+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); -+ err = ext3cow_journal_dirty_metadata(handle, sbi->s_sbh); -+ } else { -+ struct ext3cow_iloc iloc2; -+ struct inode *i_prev = -+ &list_entry(prev, struct ext3cow_inode_info, i_orphan)->vfs_inode; -+ -+ jbd_debug(4, "orphan inode %lu will point to %lu\n", -+ i_prev->i_ino, ino_next); -+ err = ext3cow_reserve_inode_write(handle, i_prev, &iloc2); -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(i_prev) = ino_next; -+ err = ext3cow_mark_iloc_dirty(handle, i_prev, &iloc2); -+ } -+ if (err) -+ goto out_brelse; -+ NEXT_ORPHAN(inode) = 0; -+ err = ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ -+out_err: -+ ext3cow_std_error(inode->i_sb, err); -+out: -+ unlock_super(inode->i_sb); -+ return err; -+ -+out_brelse: -+ brelse(iloc.bh); -+ goto out_err; -+} -+ -+static int ext3cow_rmdir (struct inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3cow_dir_entry_2 * de; -+ handle_t *handle; -+ -+ /* Initialize quotas before so that eventual writes go in -+ * separate transaction */ -+ DQUOT_INIT(dentry->d_inode); -+ handle = ext3cow_journal_start(dir, EXT3COW_DELETE_TRANS_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ retval = -ENOENT; -+ bh = ext3cow_find_entry (dentry, &de); -+ if (!bh) -+ goto end_rmdir; -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = dentry->d_inode; -+ -+ /* Can't rmdir in the past -znjp */ -+ retval = -EROFS; -+ if(is_unchangeable(inode, dentry)) -+ goto end_rmdir; -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_rmdir; -+ -+ retval = -ENOTEMPTY; -+ if (!empty_dir (inode)) -+ goto end_rmdir; -+ -+ retval = ext3cow_delete_entry(handle, dir, de, bh, dentry); -+ if (retval) -+ goto end_rmdir; -+ if (inode->i_nlink != 2) -+ ext3cow_warning (inode->i_sb, 
"ext3cow_rmdir", -+ "empty directory has nlink!=2 (%d)", -+ inode->i_nlink); -+ inode->i_version++; -+ -+ /* We only delete things that were created in the same epoch -znjp */ -+ if(de->birth_epoch == de->death_epoch){ -+ clear_nlink(inode); -+ /* There's no need to set i_disksize: the fact that i_nlink is -+ * zero will ensure that the right thing happens during any -+ * recovery. */ -+ inode->i_size = 0; -+ ext3cow_orphan_add(handle, inode); -+ drop_nlink(dir); -+ } -+ EXT3COW_I(inode)->i_flags |= EXT3COW_UNCHANGEABLE_FL; -+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ ext3cow_mark_inode_dirty(handle, inode); -+ ext3cow_update_dx_flag(dir); -+ ext3cow_mark_inode_dirty(handle, dir); -+ -+end_rmdir: -+ ext3cow_journal_stop(handle); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3cow_unlink(struct inode * dir, struct dentry *dentry) -+{ -+ int retval; -+ struct inode * inode; -+ struct buffer_head * bh; -+ struct ext3cow_dir_entry_2 * de; -+ handle_t *handle; -+ -+ /* Initialize quotas before so that eventual writes go -+ * in separate transaction */ -+ DQUOT_INIT(dentry->d_inode); -+ handle = ext3cow_journal_start(dir, EXT3COW_DELETE_TRANS_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ retval = -ENOENT; -+ bh = ext3cow_find_entry (dentry, &de); -+ if (!bh) -+ goto end_unlink; -+ -+ inode = dentry->d_inode; -+ -+ /* Can't unlink in the past -znjp */ -+ retval = -EROFS; -+ if(is_unchangeable(inode, dentry)) -+ goto end_unlink; -+ -+ retval = -EIO; -+ if (le32_to_cpu(de->inode) != inode->i_ino) -+ goto end_unlink; -+ -+ if (!inode->i_nlink) { -+ ext3cow_warning (inode->i_sb, "ext3cow_unlink", -+ "Deleting nonexistent file (%lu), %d", -+ inode->i_ino, inode->i_nlink); -+ inode->i_nlink = 1; -+ } -+ retval = ext3cow_delete_entry(handle, dir, de, bh, dentry); -+ if (retval) -+ goto end_unlink; -+ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; -+ ext3cow_update_dx_flag(dir); -+ ext3cow_mark_inode_dirty(handle, dir); -+ -+ /* If the file should be deleted here, don't actually delete it -+ * but mark it unchangeable, i.e. it's now in the past. 
-znjp */ -+ -+ /* If file was created in this epoch, then we actually unlink it, -+ * if not, then it belongs to the past, so mark it unchangeable -znjp */ -+ if(de->birth_epoch == de->death_epoch){ -+ drop_nlink(inode); -+ if (!inode->i_nlink){ -+ ext3cow_orphan_add(handle, inode); -+ } -+ }else{ -+ if(!(inode->i_nlink - 1)) -+ EXT3COW_I(inode)->i_flags |= EXT3COW_UNCHANGEABLE_FL; -+ } -+ inode->i_ctime = dir->i_ctime; -+ ext3cow_mark_inode_dirty(handle, inode); -+ retval = 0; -+ -+end_unlink: -+ ext3cow_journal_stop(handle); -+ brelse (bh); -+ return retval; -+} -+ -+static int ext3cow_symlink (struct inode * dir, -+ struct dentry *dentry, const char * symname) -+{ -+ handle_t *handle; -+ struct inode * inode; -+ int l, err, retries = 0; -+ -+ l = strlen(symname)+1; -+ if (l > dir->i_sb->s_blocksize) -+ return -ENAMETOOLONG; -+ -+retry: -+ handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 5 + -+ 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode = ext3cow_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ err = PTR_ERR(inode); -+ if (IS_ERR(inode)) -+ goto out_stop; -+ -+ if (l > sizeof (EXT3COW_I(inode)->i_data)) { -+ inode->i_op = &ext3cow_symlink_inode_operations; -+ ext3cow_set_aops(inode); -+ /* -+ * page_symlink() calls into ext3cow_prepare/commit_write. -+ * We have a transaction open. All is sweetness. It also sets -+ * i_size in generic_commit_write(). -+ */ -+ err = __page_symlink(inode, symname, l, -+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); -+ if (err) { -+ ext3cow_dec_count(handle, inode); -+ ext3cow_mark_inode_dirty(handle, inode); -+ iput (inode); -+ goto out_stop; -+ } -+ } else { -+ inode->i_op = &ext3cow_fast_symlink_inode_operations; -+ memcpy((char*)&EXT3COW_I(inode)->i_data,symname,l); -+ inode->i_size = l-1; -+ } -+ EXT3COW_I(inode)->i_disksize = inode->i_size; -+ err = ext3cow_add_nondir(handle, dentry, inode); -+out_stop: -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+} -+ -+static int ext3cow_link (struct dentry * old_dentry, -+ struct inode * dir, struct dentry *dentry) -+{ -+ handle_t *handle; -+ struct inode *inode = old_dentry->d_inode; -+ int err, retries = 0; -+ -+ if (inode->i_nlink >= EXT3COW_LINK_MAX) -+ return -EMLINK; -+ -+retry: -+ handle = ext3cow_journal_start(dir, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(dir)) -+ handle->h_sync = 1; -+ -+ inode->i_ctime = CURRENT_TIME_SEC; -+ ext3cow_inc_count(handle, inode); -+ atomic_inc(&inode->i_count); -+ -+ err = ext3cow_add_nondir(handle, dentry, inode); -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+} -+ -+#define PARENT_INO(buffer) \ -+ ((struct ext3cow_dir_entry_2 *) ((char *) buffer + \ -+ le16_to_cpu(((struct ext3cow_dir_entry_2 *) buffer)->rec_len)))->inode -+ -+/* -+ * Anybody can rename anything with this: the permission checks are left to the -+ * higher-level routines. 
-+ */ -+static int ext3cow_rename (struct inode * old_dir, struct dentry *old_dentry, -+ struct inode * new_dir,struct dentry *new_dentry) -+{ -+ handle_t *handle; -+ struct inode * old_inode, * new_inode; -+ struct buffer_head * old_bh, * new_bh, * dir_bh; -+ struct ext3cow_dir_entry_2 * old_de, * new_de; -+ int retval; -+ -+ old_bh = new_bh = dir_bh = NULL; -+ -+ /* Initialize quotas before so that eventual writes go -+ * in separate transaction */ -+ if (new_dentry->d_inode) -+ DQUOT_INIT(new_dentry->d_inode); -+ handle = ext3cow_journal_start(old_dir, 2 * -+ EXT3COW_DATA_TRANS_BLOCKS(old_dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 2); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) -+ handle->h_sync = 1; -+ -+ old_bh = ext3cow_find_entry (old_dentry, &old_de); -+ /* -+ * Check for inode number is _not_ due to possible IO errors. -+ * We might rmdir the source, keep it as pwd of some process -+ * and merrily kill the link to whatever was created under the -+ * same name. Goodbye sticky bit ;-< -+ */ -+ old_inode = old_dentry->d_inode; -+ retval = -ENOENT; -+ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) -+ goto end_rename; -+ -+ new_inode = new_dentry->d_inode; -+ new_bh = ext3cow_find_entry (new_dentry, &new_de); -+ if (new_bh) { -+ if (!new_inode) { -+ brelse (new_bh); -+ new_bh = NULL; -+ } -+ } -+ -+ /* can't move something into the past -znjp */ -+ retval = -EROFS; -+ if(is_unchangeable(new_inode, new_dentry)) -+ goto end_rename; -+ /* can't some move from the past -znjp */ -+ if(is_unchangeable(old_inode, old_dentry)) -+ goto end_rename; -+ -+ if (S_ISDIR(old_inode->i_mode)) { -+ if (new_inode) { -+ retval = -ENOTEMPTY; -+ if (!empty_dir (new_inode)) -+ goto end_rename; -+ } -+ retval = -EIO; -+ dir_bh = ext3cow_bread (handle, old_inode, 0, 0, &retval); -+ if (!dir_bh) -+ goto end_rename; -+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) -+ goto end_rename; -+ retval = -EMLINK; -+ if (!new_inode && new_dir!=old_dir && -+ new_dir->i_nlink >= EXT3COW_LINK_MAX) -+ goto end_rename; -+ } -+ if (!new_bh) { -+ retval = ext3cow_add_entry (handle, new_dentry, old_inode); -+ if (retval) -+ goto end_rename; -+ } else { -+ BUFFER_TRACE(new_bh, "get write access"); -+ ext3cow_journal_get_write_access(handle, new_bh); -+ new_de->inode = cpu_to_le32(old_inode->i_ino); -+ if (EXT3COW_HAS_INCOMPAT_FEATURE(new_dir->i_sb, -+ EXT3COW_FEATURE_INCOMPAT_FILETYPE)) -+ new_de->file_type = old_de->file_type; -+ new_dir->i_version++; -+ BUFFER_TRACE(new_bh, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, new_bh); -+ brelse(new_bh); -+ new_bh = NULL; -+ } -+ -+ /* -+ * Like most other Unix systems, set the ctime for inodes on a -+ * rename. -+ */ -+ old_inode->i_ctime = CURRENT_TIME_SEC; -+ ext3cow_mark_inode_dirty(handle, old_inode); -+ -+ /* -+ * ok, that's it -+ */ -+ if (le32_to_cpu(old_de->inode) != old_inode->i_ino || -+ old_de->name_len != old_dentry->d_name.len || -+ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || -+ (retval = ext3cow_delete_entry(handle, old_dir, -+ old_de, old_bh, new_dentry)) == -ENOENT) { -+ /* old_de could have moved from under us during htree split, so -+ * make sure that we are deleting the right entry. We might -+ * also be pointing to a stale entry in the unused part of -+ * old_bh so just checking inum and the name isn't enough. 
*/ -+ struct buffer_head *old_bh2; -+ struct ext3cow_dir_entry_2 *old_de2; -+ -+ old_bh2 = ext3cow_find_entry(old_dentry, &old_de2); -+ if (old_bh2) { -+ retval = ext3cow_delete_entry(handle, old_dir, -+ old_de2, old_bh2, new_dentry); -+ brelse(old_bh2); -+ } -+ } -+ if (retval) { -+ ext3cow_warning(old_dir->i_sb, "ext3cow_rename", -+ "Deleting old file (%lu), %d, error=%d", -+ old_dir->i_ino, old_dir->i_nlink, retval); -+ } -+ -+ if (new_inode) { -+ new_inode->i_ctime = CURRENT_TIME_SEC; -+ } -+ if(!is_unchangeable(old_inode, old_dentry)) -+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; -+ ext3cow_update_dx_flag(old_dir); -+ if (dir_bh) { -+ BUFFER_TRACE(dir_bh, "get_write_access"); -+ ext3cow_journal_get_write_access(handle, dir_bh); -+ PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); -+ BUFFER_TRACE(dir_bh, "call ext3cow_journal_dirty_metadata"); -+ ext3cow_journal_dirty_metadata(handle, dir_bh); -+ if (!new_inode) { -+ inc_nlink(new_dir); -+ ext3cow_update_dx_flag(new_dir); -+ ext3cow_mark_inode_dirty(handle, new_dir); -+ } -+ } -+ ext3cow_mark_inode_dirty(handle, old_dir); -+ if (new_inode) { -+ ext3cow_mark_inode_dirty(handle, new_inode); -+ if (!new_inode->i_nlink) -+ ext3cow_orphan_add(handle, new_inode); -+ } -+ retval = 0; -+ -+end_rename: -+ brelse (dir_bh); -+ brelse (old_bh); -+ brelse (new_bh); -+ ext3cow_journal_stop(handle); -+ return retval; -+} -+ -+/* ext3cow_fake_inode: This function creates a VFS-only inode -+ * used for properly scoping views into the past file system - znjp -+ */ -+struct inode *ext3cow_fake_inode(struct inode *inode, -+ unsigned int epoch_number) -+{ -+ struct inode * fake_inode = NULL; -+ struct ext3cow_inode_info * ini = NULL; -+ struct ext3cow_inode_info * fake_ini = NULL; -+ static unsigned int last_ino = UINT_MAX; -+ int err = 0; -+ int block = -1; -+ -+ if(NULL == inode){ -+ printk(KERN_ERR "Trying to duplicate a NULL inode.\n"); -+ return NULL; -+ } -+ -+ if(EXT3COW_IS_FAKEINODE(inode)){ -+ printk(KERN_ERR "Trying to fake a fake inode.\n"); -+ return inode; -+ } -+ -+ printk(KERN_INFO "** faking inode %lu\n", inode->i_ino); -+ -+ ini = EXT3COW_I(inode); -+ -+ /* Create a new VFS-only inode */ -+ fake_inode = new_inode(inode->i_sb); -+ err = PTR_ERR(fake_inode); -+ if(!IS_ERR(fake_inode)){ -+ -+ fake_ini = EXT3COW_I(fake_inode); -+ -+ printk(KERN_INFO "** got inode %lu setting with %u\n", fake_inode->i_ino, -+ last_ino); -+ -+ /* When inode is a directory, we can fake the inode number */ -+ //if(S_ISDIR(inode->i_mode)) -+ fake_inode->i_ino = --last_ino; -+ -+ fake_inode->i_mode = inode->i_mode; -+ fake_inode->i_uid = inode->i_uid; -+ fake_inode->i_gid = inode->i_gid; -+ -+ atomic_set(&fake_inode->i_count, 1); -+ -+ fake_inode->i_nlink = inode->i_nlink; -+ fake_inode->i_size = inode->i_size; -+ fake_inode->i_atime.tv_sec = inode->i_atime.tv_sec; -+ fake_inode->i_ctime.tv_sec = inode->i_ctime.tv_sec; -+ fake_inode->i_mtime.tv_sec = inode->i_mtime.tv_sec; -+ fake_inode->i_atime.tv_nsec = inode->i_atime.tv_nsec; -+ fake_inode->i_ctime.tv_nsec = inode->i_ctime.tv_nsec; -+ fake_inode->i_mtime.tv_nsec = inode->i_mtime.tv_nsec; -+ -+ fake_ini->i_state = ini->i_state; -+ fake_ini->i_dir_start_lookup = ini->i_dir_start_lookup; -+ fake_ini->i_dtime = ini->i_dtime; -+ -+ fake_inode->i_blocks = inode->i_blocks; -+ fake_ini->i_flags = ini->i_flags; -+#ifdef EXT3COW_FRAGMENTS -+ /* Taken out for versioning -znjp */ -+ //fake_ini->i_faddr = ini->i_faddr; -+ //fake_ini->i_frag_no = ini->i_frag_no; -+ //fake_ini->i_frag_size = 
ini->i_frag_size; -+#endif -+ fake_ini->i_file_acl = ini->i_file_acl; -+ if (!S_ISREG(fake_inode->i_mode)) { -+ fake_ini->i_dir_acl = ini->i_dir_acl; -+ } -+ fake_ini->i_disksize = inode->i_size; -+ fake_inode->i_generation = inode->i_generation; -+ //TODO: This could be wrong. -+ //fake_ini->i_block_group = ini->i_block_group; //iloc.block_group; -+ -+ for (block = 0; block < EXT3COW_N_BLOCKS; block++) -+ fake_ini->i_data[block] = ini->i_data[block]; -+ -+ fake_ini->i_extra_isize = ini->i_extra_isize; -+ -+ /* set copy-on-write bitmap to 0 */ -+ fake_ini->i_cow_bitmap = 0x0000; -+ -+ /* Mark fake inode unchangeable, etc. */ -+ fake_ini->i_flags |= EXT3COW_UNCHANGEABLE_FL; -+ fake_ini->i_flags |= EXT3COW_UNVERSIONABLE_FL; -+ fake_ini->i_flags |= EXT3COW_FAKEINODE_FL; -+ fake_ini->i_flags |= EXT3COW_IMMUTABLE_FL; -+ -+ /* Make sure we get the right operations */ -+ if (S_ISREG(fake_inode->i_mode)) { -+ fake_inode->i_op = &ext3cow_file_inode_operations; -+ fake_inode->i_fop = &ext3cow_file_operations; -+ ext3cow_set_aops(fake_inode); -+ } else if (S_ISDIR(fake_inode->i_mode)) { -+ fake_inode->i_op = &ext3cow_dir_inode_operations; -+ fake_inode->i_fop = &ext3cow_dir_operations; -+ } else if (S_ISLNK(fake_inode->i_mode)) { -+ //if (ext3cow_inode_is_fast_symlink(cow_inode)) -+ if((S_ISLNK(fake_inode->i_mode) && fake_inode->i_blocks - -+ (EXT3COW_I(fake_inode)->i_file_acl ? -+ (fake_inode->i_sb->s_blocksize >> 9) : 0))) -+ fake_inode->i_op = &ext3cow_fast_symlink_inode_operations; -+ else { -+ fake_inode->i_op = &ext3cow_symlink_inode_operations; -+ ext3cow_set_aops(fake_inode); -+ } -+ } else { -+ fake_inode->i_op = &ext3cow_special_inode_operations; -+ } -+ -+ fake_ini->i_epoch_number = epoch_number; -+ fake_ini->i_next_inode = 0; -+ -+ iput(inode); /* dec i_count */ -+ -+ return fake_inode; -+ }else -+ ext3cow_warning(inode->i_sb, "ext3cow_fake_inode", -+ "Could not create fake inode."); -+ -+ return NULL; -+} -+ -+/* -+ * ext3cow_dup_inode: This function creates a new inode, -+ * copies all the metadata from the passed in inode, -+ * and adds it to the version chain, creating a new version. -+ * The head of the chain never changes; it is always the most current version. -+ * Similar in nature to ext3cow_creat and ext3cow_read_inode. -znjp -+ */ -+int ext3cow_dup_inode(struct inode *dir, struct inode *inode){ -+ -+ struct inode *cow_inode = NULL; -+ struct inode *parent = NULL; -+ struct ext3cow_inode_info *ini = NULL; -+ struct ext3cow_inode_info *cow_ini = NULL; -+ handle_t *handle = NULL; -+ int err = 0; -+ int block = -1; -+ unsigned int epoch_number_temp = 0; -+ int retries = 0; -+ -+ printk(KERN_INFO "** duping inode %lu\n", inode->i_ino); -+ -+ if(EXT3COW_IS_UNVERSIONABLE(inode)) -+ return 0; -+ -+ if(NULL == inode){ -+ printk(KERN_ERR "Trying to duplicate a NULL inode.\n"); -+ return -1; -+ } -+ -+ if (inode->i_nlink == 0) { -+ if (inode->i_mode == 0 || -+ !(EXT3COW_SB(inode->i_sb)->s_mount_state & EXT3COW_ORPHAN_FS)) { -+ /* this inode is deleted */ -+ return -1; -+ } -+ /* The only unlinked inodes we let through here have -+ * valid i_mode and are being read by the orphan -+ * recovery code: that's fine, we're about to complete -+ * the process of deleting those. 
*/ -+ } -+ -+ ini = EXT3COW_I(inode); -+ -+ /* This is for truncate, which can't pass in a parent */ -+ if(NULL == dir) -+ parent = inode; -+ else -+ parent = dir; -+ -+ retry: -+ handle = ext3cow_journal_start(parent, EXT3COW_DATA_TRANS_BLOCKS(dir->i_sb) + -+ EXT3COW_INDEX_EXTRA_TRANS_BLOCKS + 3 + -+ 2*EXT3COW_QUOTA_INIT_BLOCKS(dir->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if (IS_DIRSYNC(parent)) -+ handle->h_sync = 1; -+ -+ cow_inode = ext3cow_new_inode (handle, parent, inode->i_mode); -+ err = PTR_ERR(cow_inode); -+ if (!IS_ERR(cow_inode)) { -+ -+ printk(KERN_INFO " ** Allocated new inode %lu\n", cow_inode->i_ino); -+ -+ cow_ini = EXT3COW_I(cow_inode); -+ -+ cow_inode->i_mode = inode->i_mode; -+ cow_inode->i_uid = inode->i_uid; -+ cow_inode->i_gid = inode->i_gid; -+ -+ cow_inode->i_nlink = inode->i_nlink; -+ cow_inode->i_size = inode->i_size; -+ cow_inode->i_atime.tv_sec = inode->i_atime.tv_sec; -+ cow_inode->i_ctime.tv_sec = inode->i_ctime.tv_sec; -+ cow_inode->i_mtime.tv_sec = inode->i_mtime.tv_sec; -+ cow_inode->i_atime.tv_nsec = inode->i_atime.tv_nsec; -+ cow_inode->i_ctime.tv_nsec = inode->i_ctime.tv_nsec; -+ cow_inode->i_mtime.tv_nsec = inode->i_mtime.tv_nsec; -+ -+ cow_ini->i_state = ini->i_state; -+ cow_ini->i_dir_start_lookup = ini->i_dir_start_lookup; -+ cow_ini->i_dtime = ini->i_dtime; -+ -+ cow_inode->i_blocks = inode->i_blocks; -+ cow_ini->i_flags = ini->i_flags; -+#ifdef EXT3COW_FRAGMENTS -+ /* Taken out for versioning -znjp */ -+ //cow_ini->i_faddr = ini->i_faddr; -+ //cow_ini->i_frag_no = ini->i_frag_no; -+ //cow_ini->i_frag_size = ini->i_frag_size; -+#endif -+ cow_ini->i_file_acl = ini->i_file_acl; -+ if (!S_ISREG(cow_inode->i_mode)) { -+ cow_ini->i_dir_acl = ini->i_dir_acl; -+ } -+ cow_ini->i_disksize = inode->i_size; -+ cow_inode->i_generation = inode->i_generation; -+ //TODO: This could be wrong. -+ cow_ini->i_block_group = ini->i_block_group; //iloc.block_group; -+ -+ for (block = 0; block < EXT3COW_N_BLOCKS; block++) -+ cow_ini->i_data[block] = ini->i_data[block]; -+ -+ //TODO: This could be wrong -+ //cow_ini->i_orphan = NULL; //INIT_LIST_HEAD(&ei->i_orphan); -+ -+ cow_ini->i_extra_isize = ini->i_extra_isize; -+ -+ /* Make sure we get the right operations */ -+ if (S_ISREG(cow_inode->i_mode)) { -+ cow_inode->i_op = &ext3cow_file_inode_operations; -+ cow_inode->i_fop = &ext3cow_file_operations; -+ ext3cow_set_aops(cow_inode); -+ } else if (S_ISDIR(cow_inode->i_mode)) { -+ cow_inode->i_op = &ext3cow_dir_inode_operations; -+ cow_inode->i_fop = &ext3cow_dir_operations; -+ } else if (S_ISLNK(cow_inode->i_mode)) { -+ //if (ext3cow_inode_is_fast_symlink(cow_inode)) -+ if((S_ISLNK(cow_inode->i_mode) && cow_inode->i_blocks - -+ (EXT3COW_I(cow_inode)->i_file_acl ? 
-+ (cow_inode->i_sb->s_blocksize >> 9) : 0))) -+ cow_inode->i_op = &ext3cow_fast_symlink_inode_operations; -+ else { -+ cow_inode->i_op = &ext3cow_symlink_inode_operations; -+ ext3cow_set_aops(cow_inode); -+ } -+ } else { -+ cow_inode->i_op = &ext3cow_special_inode_operations; -+ /* -+ if (raw_inode->i_block[0]) -+ init_special_inode(inode, inode->i_mode, -+ old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); -+ else -+ init_special_inode(inode, inode->i_mode, -+ new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); -+ */ -+ } -+ -+ /* Dup in the direct cow bitmap */ -+ cow_ini->i_cow_bitmap = ini->i_cow_bitmap; -+ ini->i_cow_bitmap = 0x0000; -+ /* Mark new inode unchangeable */ -+ cow_ini->i_flags |= EXT3COW_UNCHANGEABLE_FL; -+ /* Switch epoch numbers */ -+ epoch_number_temp = ini->i_epoch_number; -+ ini->i_epoch_number = cow_ini->i_epoch_number; -+ cow_ini->i_epoch_number = epoch_number_temp; -+ /* Chain Inodes together */ -+ cow_ini->i_next_inode = ini->i_next_inode; -+ ini->i_next_inode = cow_inode->i_ino; -+ -+ ext3cow_mark_inode_dirty(handle, cow_inode); -+ ext3cow_mark_inode_dirty(handle, inode); -+ -+ iput(cow_inode); /* dec i_count */ -+ -+ err = 0; -+ } -+ ext3cow_journal_stop(handle); -+ if (err == -ENOSPC && ext3cow_should_retry_alloc(dir->i_sb, &retries)) -+ goto retry; -+ return err; -+ -+} -+ -+/* ext3cow_reclaim_dup_inode: rolls back a recently dup'd inode -+ * on error, including epoch number and bitmaps. Should not -+ * be used for removing versions. */ -+int ext3cow_reclaim_dup_inode(struct inode *dir, struct inode *inode) -+{ -+ handle_t *handle = NULL; -+ int err = 0; -+ struct inode *old_inode = NULL; -+ struct inode *parent = dir; -+ -+ if(!parent) -+ parent = inode; -+ -+ if(is_bad_inode(inode)) -+ return -1; -+ -+ handle = ext3cow_journal_start(parent, -+ EXT3COW_DELETE_TRANS_BLOCKS(parent->i_sb)); -+ if(IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ if(IS_DIRSYNC(parent)) -+ handle->h_sync = 1; -+ -+ old_inode = iget(parent->i_sb, EXT3COW_I_NEXT_INODE(inode)); -+ err = PTR_ERR(old_inode); -+ if (!IS_ERR(old_inode)){ -+ -+ EXT3COW_I(inode)->i_epoch_number = EXT3COW_I_EPOCHNUMBER(old_inode); -+ EXT3COW_I(inode)->i_cow_bitmap = EXT3COW_I(old_inode)->i_cow_bitmap; -+ EXT3COW_I(inode)->i_next_inode = EXT3COW_I(old_inode)->i_next_inode; -+ old_inode->i_nlink = 0; -+ -+ iput(old_inode); -+ ext3cow_mark_inode_dirty(handle, inode); -+ }else -+ ext3cow_error(inode->i_sb, "ext3cow_reclaim_dup_inode", -+ "Couldn't remove dup'd inode."); -+ -+ ext3cow_journal_stop(handle); -+ -+ return 0; -+} -+ -+/* -+ * directories can handle most operations... 
-+ */ -+struct inode_operations ext3cow_dir_inode_operations = { -+ .create = ext3cow_create, -+ .lookup = ext3cow_lookup, -+ .link = ext3cow_link, -+ .unlink = ext3cow_unlink, -+ .symlink = ext3cow_symlink, -+ .mkdir = ext3cow_mkdir, -+ .rmdir = ext3cow_rmdir, -+ .mknod = ext3cow_mknod, -+ .rename = ext3cow_rename, -+ .setattr = ext3cow_setattr, -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ .setxattr = generic_setxattr, -+ .getxattr = generic_getxattr, -+ .listxattr = ext3cow_listxattr, -+ .removexattr = generic_removexattr, -+#endif -+ .permission = ext3cow_permission, -+}; -+ -+struct inode_operations ext3cow_special_inode_operations = { -+ .setattr = ext3cow_setattr, -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ .setxattr = generic_setxattr, -+ .getxattr = generic_getxattr, -+ .listxattr = ext3cow_listxattr, -+ .removexattr = generic_removexattr, -+#endif -+ .permission = ext3cow_permission, -+}; -diff -ruN linux-2.6.20.3/fs/ext3cow/namei.h linux-2.6.20.3-ext3cow/fs/ext3cow/namei.h ---- linux-2.6.20.3/fs/ext3cow/namei.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/namei.h 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,8 @@ -+/* linux/fs/ext3cow/namei.h -+ * -+ * Copyright (C) 2005 Simtec Electronics -+ * Ben Dooks -+ * -+*/ -+ -+extern struct dentry *ext3cow_get_parent(struct dentry *child); -diff -ruN linux-2.6.20.3/fs/ext3cow/resize.c linux-2.6.20.3-ext3cow/fs/ext3cow/resize.c ---- linux-2.6.20.3/fs/ext3cow/resize.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/resize.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,1042 @@ -+/* -+ * linux/fs/ext3cow/resize.c -+ * -+ * Support for resizing an ext3cow filesystem while it is mounted. -+ * -+ * Copyright (C) 2001, 2002 Andreas Dilger -+ * -+ * This could probably be made into a module, because it is not often in use. -+ */ -+ -+ -+#define EXT3COWFS_DEBUG -+ -+#include -+#include -+#include -+ -+#include -+#include -+ -+ -+#define outside(b, first, last) ((b) < (first) || (b) >= (last)) -+#define inside(b, first, last) ((b) >= (first) && (b) < (last)) -+ -+static int verify_group_input(struct super_block *sb, -+ struct ext3cow_new_group_data *input) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ struct ext3cow_super_block *es = sbi->s_es; -+ ext3cow_fsblk_t start = le32_to_cpu(es->s_blocks_count); -+ ext3cow_fsblk_t end = start + input->blocks_count; -+ unsigned group = input->group; -+ ext3cow_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; -+ unsigned overhead = ext3cow_bg_has_super(sb, group) ? -+ (1 + ext3cow_bg_num_gdb(sb, group) + -+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; -+ ext3cow_fsblk_t metaend = start + overhead; -+ struct buffer_head *bh = NULL; -+ ext3cow_grpblk_t free_blocks_count; -+ int err = -EINVAL; -+ -+ input->free_blocks_count = free_blocks_count = -+ input->blocks_count - 2 - overhead - sbi->s_itb_per_group; -+ -+ if (test_opt(sb, DEBUG)) -+ printk(KERN_DEBUG "EXT3COW-fs: adding %s group %u: %u blocks " -+ "(%d free, %u reserved)\n", -+ ext3cow_bg_has_super(sb, input->group) ? 
"normal" : -+ "no-super", input->group, input->blocks_count, -+ free_blocks_count, input->reserved_blocks); -+ -+ if (group != sbi->s_groups_count) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Cannot add at group %u (only %lu groups)", -+ input->group, sbi->s_groups_count); -+ else if ((start - le32_to_cpu(es->s_first_data_block)) % -+ EXT3COW_BLOCKS_PER_GROUP(sb)) -+ ext3cow_warning(sb, __FUNCTION__, "Last group not full"); -+ else if (input->reserved_blocks > input->blocks_count / 5) -+ ext3cow_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", -+ input->reserved_blocks); -+ else if (free_blocks_count < 0) -+ ext3cow_warning(sb, __FUNCTION__, "Bad blocks count %u", -+ input->blocks_count); -+ else if (!(bh = sb_bread(sb, end - 1))) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Cannot read last block ("E3FSBLK")", -+ end - 1); -+ else if (outside(input->block_bitmap, start, end)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Block bitmap not in group (block %u)", -+ input->block_bitmap); -+ else if (outside(input->inode_bitmap, start, end)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Inode bitmap not in group (block %u)", -+ input->inode_bitmap); -+ else if (outside(input->inode_table, start, end) || -+ outside(itend - 1, start, end)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Inode table not in group (blocks %u-"E3FSBLK")", -+ input->inode_table, itend - 1); -+ else if (input->inode_bitmap == input->block_bitmap) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Block bitmap same as inode bitmap (%u)", -+ input->block_bitmap); -+ else if (inside(input->block_bitmap, input->inode_table, itend)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Block bitmap (%u) in inode table (%u-"E3FSBLK")", -+ input->block_bitmap, input->inode_table, itend-1); -+ else if (inside(input->inode_bitmap, input->inode_table, itend)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", -+ input->inode_bitmap, input->inode_table, itend-1); -+ else if (inside(input->block_bitmap, start, metaend)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Block bitmap (%u) in GDT table" -+ " ("E3FSBLK"-"E3FSBLK")", -+ input->block_bitmap, start, metaend - 1); -+ else if (inside(input->inode_bitmap, start, metaend)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Inode bitmap (%u) in GDT table" -+ " ("E3FSBLK"-"E3FSBLK")", -+ input->inode_bitmap, start, metaend - 1); -+ else if (inside(input->inode_table, start, metaend) || -+ inside(itend - 1, start, metaend)) -+ ext3cow_warning(sb, __FUNCTION__, -+ "Inode table (%u-"E3FSBLK") overlaps" -+ "GDT table ("E3FSBLK"-"E3FSBLK")", -+ input->inode_table, itend - 1, start, metaend - 1); -+ else -+ err = 0; -+ brelse(bh); -+ -+ return err; -+} -+ -+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, -+ ext3cow_fsblk_t blk) -+{ -+ struct buffer_head *bh; -+ int err; -+ -+ bh = sb_getblk(sb, blk); -+ if (!bh) -+ return ERR_PTR(-EIO); -+ if ((err = ext3cow_journal_get_write_access(handle, bh))) { -+ brelse(bh); -+ bh = ERR_PTR(err); -+ } else { -+ lock_buffer(bh); -+ memset(bh->b_data, 0, sb->s_blocksize); -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ } -+ -+ return bh; -+} -+ -+/* -+ * To avoid calling the atomic setbit hundreds or thousands of times, we only -+ * need to use it within a single byte (to ensure we get endianness right). -+ * We can use memset for the rest of the bitmap as there are no other users. 
-+ */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) -+{ -+ int i; -+ -+ if (start_bit >= end_bit) -+ return; -+ -+ ext3cow_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); -+ for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) -+ ext3cow_set_bit(i, bitmap); -+ if (i < end_bit) -+ memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); -+} -+ -+/* -+ * Set up the block and inode bitmaps, and the inode table for the new group. -+ * This doesn't need to be part of the main transaction, since we are only -+ * changing blocks outside the actual filesystem. We still do journaling to -+ * ensure the recovery is correct in case of a failure just after resize. -+ * If any part of this fails, we simply abort the resize. -+ */ -+static int setup_new_group_blocks(struct super_block *sb, -+ struct ext3cow_new_group_data *input) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ ext3cow_fsblk_t start = ext3cow_group_first_block_no(sb, input->group); -+ int reserved_gdb = ext3cow_bg_has_super(sb, input->group) ? -+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; -+ unsigned long gdblocks = ext3cow_bg_num_gdb(sb, input->group); -+ struct buffer_head *bh; -+ handle_t *handle; -+ ext3cow_fsblk_t block; -+ ext3cow_grpblk_t bit; -+ int i; -+ int err = 0, err2; -+ -+ handle = ext3cow_journal_start_sb(sb, reserved_gdb + gdblocks + -+ 2 + sbi->s_itb_per_group); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ -+ lock_super(sb); -+ if (input->group != sbi->s_groups_count) { -+ err = -EBUSY; -+ goto exit_journal; -+ } -+ -+ if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { -+ err = PTR_ERR(bh); -+ goto exit_journal; -+ } -+ -+ if (ext3cow_bg_has_super(sb, input->group)) { -+ ext3cow_debug("mark backup superblock %#04lx (+0)\n", start); -+ ext3cow_set_bit(0, bh->b_data); -+ } -+ -+ /* Copy all of the GDT blocks into the backup in this group */ -+ for (i = 0, bit = 1, block = start + 1; -+ i < gdblocks; i++, block++, bit++) { -+ struct buffer_head *gdb; -+ -+ ext3cow_debug("update backup group %#04lx (+%d)\n", block, bit); -+ -+ gdb = sb_getblk(sb, block); -+ if (!gdb) { -+ err = -EIO; -+ goto exit_bh; -+ } -+ if ((err = ext3cow_journal_get_write_access(handle, gdb))) { -+ brelse(gdb); -+ goto exit_bh; -+ } -+ lock_buffer(bh); -+ memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); -+ set_buffer_uptodate(gdb); -+ unlock_buffer(bh); -+ ext3cow_journal_dirty_metadata(handle, gdb); -+ ext3cow_set_bit(bit, bh->b_data); -+ brelse(gdb); -+ } -+ -+ /* Zero out all of the reserved backup group descriptor table blocks */ -+ for (i = 0, bit = gdblocks + 1, block = start + bit; -+ i < reserved_gdb; i++, block++, bit++) { -+ struct buffer_head *gdb; -+ -+ ext3cow_debug("clear reserved block %#04lx (+%d)\n", block, bit); -+ -+ if (IS_ERR(gdb = bclean(handle, sb, block))) { -+ err = PTR_ERR(bh); -+ goto exit_bh; -+ } -+ ext3cow_journal_dirty_metadata(handle, gdb); -+ ext3cow_set_bit(bit, bh->b_data); -+ brelse(gdb); -+ } -+ ext3cow_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, -+ input->block_bitmap - start); -+ ext3cow_set_bit(input->block_bitmap - start, bh->b_data); -+ ext3cow_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, -+ input->inode_bitmap - start); -+ ext3cow_set_bit(input->inode_bitmap - start, bh->b_data); -+ -+ /* Zero out all of the inode table blocks */ -+ for (i = 0, block = input->inode_table, bit = block - start; -+ i < sbi->s_itb_per_group; i++, bit++, block++) { -+ struct buffer_head *it; -+ -+ 
ext3cow_debug("clear inode block %#04lx (+%d)\n", block, bit); -+ if (IS_ERR(it = bclean(handle, sb, block))) { -+ err = PTR_ERR(it); -+ goto exit_bh; -+ } -+ ext3cow_journal_dirty_metadata(handle, it); -+ brelse(it); -+ ext3cow_set_bit(bit, bh->b_data); -+ } -+ mark_bitmap_end(input->blocks_count, EXT3COW_BLOCKS_PER_GROUP(sb), -+ bh->b_data); -+ ext3cow_journal_dirty_metadata(handle, bh); -+ brelse(bh); -+ -+ /* Mark unused entries in inode bitmap used */ -+ ext3cow_debug("clear inode bitmap %#04x (+%ld)\n", -+ input->inode_bitmap, input->inode_bitmap - start); -+ if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { -+ err = PTR_ERR(bh); -+ goto exit_journal; -+ } -+ -+ mark_bitmap_end(EXT3COW_INODES_PER_GROUP(sb), EXT3COW_BLOCKS_PER_GROUP(sb), -+ bh->b_data); -+ ext3cow_journal_dirty_metadata(handle, bh); -+exit_bh: -+ brelse(bh); -+ -+exit_journal: -+ unlock_super(sb); -+ if ((err2 = ext3cow_journal_stop(handle)) && !err) -+ err = err2; -+ -+ return err; -+} -+ -+/* -+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an -+ * ext3cow filesystem. The counters should be initialized to 1, 5, and 7 before -+ * calling this for the first time. In a sparse filesystem it will be the -+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... -+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... -+ */ -+static unsigned ext3cow_list_backups(struct super_block *sb, unsigned *three, -+ unsigned *five, unsigned *seven) -+{ -+ unsigned *min = three; -+ int mult = 3; -+ unsigned ret; -+ -+ if (!EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER)) { -+ ret = *min; -+ *min += 1; -+ return ret; -+ } -+ -+ if (*five < *min) { -+ min = five; -+ mult = 5; -+ } -+ if (*seven < *min) { -+ min = seven; -+ mult = 7; -+ } -+ -+ ret = *min; -+ *min *= mult; -+ -+ return ret; -+} -+ -+/* -+ * Check that all of the backup GDT blocks are held in the primary GDT block. -+ * It is assumed that they are stored in group order. Returns the number of -+ * groups in current filesystem that have BACKUPS, or -ve error code. -+ */ -+static int verify_reserved_gdb(struct super_block *sb, -+ struct buffer_head *primary) -+{ -+ const ext3cow_fsblk_t blk = primary->b_blocknr; -+ const unsigned long end = EXT3COW_SB(sb)->s_groups_count; -+ unsigned three = 1; -+ unsigned five = 5; -+ unsigned seven = 7; -+ unsigned grp; -+ __le32 *p = (__le32 *)primary->b_data; -+ int gdbackups = 0; -+ -+ while ((grp = ext3cow_list_backups(sb, &three, &five, &seven)) < end) { -+ if (le32_to_cpu(*p++) != grp * EXT3COW_BLOCKS_PER_GROUP(sb) + blk){ -+ ext3cow_warning(sb, __FUNCTION__, -+ "reserved GDT "E3FSBLK -+ " missing grp %d ("E3FSBLK")", -+ blk, grp, -+ grp * EXT3COW_BLOCKS_PER_GROUP(sb) + blk); -+ return -EINVAL; -+ } -+ if (++gdbackups > EXT3COW_ADDR_PER_BLOCK(sb)) -+ return -EFBIG; -+ } -+ -+ return gdbackups; -+} -+ -+/* -+ * Called when we need to bring a reserved group descriptor table block into -+ * use from the resize inode. The primary copy of the new GDT block currently -+ * is an indirect block (under the double indirect block in the resize inode). -+ * The new backup GDT blocks will be stored as leaf blocks in this indirect -+ * block, in group order. Even though we know all the block numbers we need, -+ * we check to ensure that the resize inode has actually reserved these blocks. -+ * -+ * Don't need to update the block bitmaps because the blocks are still in use. 
-+ * -+ * We get all of the error cases out of the way, so that we are sure to not -+ * fail once we start modifying the data on disk, because JBD has no rollback. -+ */ -+static int add_new_gdb(handle_t *handle, struct inode *inode, -+ struct ext3cow_new_group_data *input, -+ struct buffer_head **primary) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; -+ unsigned long gdb_num = input->group / EXT3COW_DESC_PER_BLOCK(sb); -+ ext3cow_fsblk_t gdblock = EXT3COW_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; -+ struct buffer_head **o_group_desc, **n_group_desc; -+ struct buffer_head *dind; -+ int gdbackups; -+ struct ext3cow_iloc iloc; -+ __le32 *data; -+ int err; -+ -+ if (test_opt(sb, DEBUG)) -+ printk(KERN_DEBUG -+ "EXT3COW-fs: ext3cow_add_new_gdb: adding group block %lu\n", -+ gdb_num); -+ -+ /* -+ * If we are not using the primary superblock/GDT copy don't resize, -+ * because the user tools have no way of handling this. Probably a -+ * bad time to do it anyways. -+ */ -+ if (EXT3COW_SB(sb)->s_sbh->b_blocknr != -+ le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block)) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "won't resize using backup superblock at %llu", -+ (unsigned long long)EXT3COW_SB(sb)->s_sbh->b_blocknr); -+ return -EPERM; -+ } -+ -+ *primary = sb_bread(sb, gdblock); -+ if (!*primary) -+ return -EIO; -+ -+ if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { -+ err = gdbackups; -+ goto exit_bh; -+ } -+ -+ data = EXT3COW_I(inode)->i_data + EXT3COW_DIND_BLOCK; -+ dind = sb_bread(sb, le32_to_cpu(*data)); -+ if (!dind) { -+ err = -EIO; -+ goto exit_bh; -+ } -+ -+ data = (__le32 *)dind->b_data; -+ if (le32_to_cpu(data[gdb_num % EXT3COW_ADDR_PER_BLOCK(sb)]) != gdblock) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "new group %u GDT block "E3FSBLK" not reserved", -+ input->group, gdblock); -+ err = -EINVAL; -+ goto exit_dind; -+ } -+ -+ if ((err = ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh))) -+ goto exit_dind; -+ -+ if ((err = ext3cow_journal_get_write_access(handle, *primary))) -+ goto exit_sbh; -+ -+ if ((err = ext3cow_journal_get_write_access(handle, dind))) -+ goto exit_primary; -+ -+ /* ext3cow_reserve_inode_write() gets a reference on the iloc */ -+ if ((err = ext3cow_reserve_inode_write(handle, inode, &iloc))) -+ goto exit_dindj; -+ -+ n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), -+ GFP_KERNEL); -+ if (!n_group_desc) { -+ err = -ENOMEM; -+ ext3cow_warning (sb, __FUNCTION__, -+ "not enough memory for %lu groups", gdb_num + 1); -+ goto exit_inode; -+ } -+ -+ /* -+ * Finally, we have all of the possible failures behind us... -+ * -+ * Remove new GDT block from inode double-indirect block and clear out -+ * the new GDT block for use (which also "frees" the backup GDT blocks -+ * from the reserved inode). We don't need to change the bitmaps for -+ * these blocks, because they are marked as in-use from being in the -+ * reserved inode, and will become GDT blocks (primary and backup). 
-+ */ -+ data[gdb_num % EXT3COW_ADDR_PER_BLOCK(sb)] = 0; -+ ext3cow_journal_dirty_metadata(handle, dind); -+ brelse(dind); -+ inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; -+ ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ memset((*primary)->b_data, 0, sb->s_blocksize); -+ ext3cow_journal_dirty_metadata(handle, *primary); -+ -+ o_group_desc = EXT3COW_SB(sb)->s_group_desc; -+ memcpy(n_group_desc, o_group_desc, -+ EXT3COW_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); -+ n_group_desc[gdb_num] = *primary; -+ EXT3COW_SB(sb)->s_group_desc = n_group_desc; -+ EXT3COW_SB(sb)->s_gdb_count++; -+ kfree(o_group_desc); -+ -+ es->s_reserved_gdt_blocks = -+ cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); -+ ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); -+ -+ return 0; -+ -+exit_inode: -+ //ext3cow_journal_release_buffer(handle, iloc.bh); -+ brelse(iloc.bh); -+exit_dindj: -+ //ext3cow_journal_release_buffer(handle, dind); -+exit_primary: -+ //ext3cow_journal_release_buffer(handle, *primary); -+exit_sbh: -+ //ext3cow_journal_release_buffer(handle, *primary); -+exit_dind: -+ brelse(dind); -+exit_bh: -+ brelse(*primary); -+ -+ ext3cow_debug("leaving with error %d\n", err); -+ return err; -+} -+ -+/* -+ * Called when we are adding a new group which has a backup copy of each of -+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. -+ * We need to add these reserved backup GDT blocks to the resize inode, so -+ * that they are kept for future resizing and not allocated to files. -+ * -+ * Each reserved backup GDT block will go into a different indirect block. -+ * The indirect blocks are actually the primary reserved GDT blocks, -+ * so we know in advance what their block numbers are. We only get the -+ * double-indirect block to verify it is pointing to the primary reserved -+ * GDT blocks so we don't overwrite a data block by accident. The reserved -+ * backup GDT blocks are stored in their reserved primary GDT block. 
-+ */ -+static int reserve_backup_gdb(handle_t *handle, struct inode *inode, -+ struct ext3cow_new_group_data *input) -+{ -+ struct super_block *sb = inode->i_sb; -+ int reserved_gdb =le16_to_cpu(EXT3COW_SB(sb)->s_es->s_reserved_gdt_blocks); -+ struct buffer_head **primary; -+ struct buffer_head *dind; -+ struct ext3cow_iloc iloc; -+ ext3cow_fsblk_t blk; -+ __le32 *data, *end; -+ int gdbackups = 0; -+ int res, i; -+ int err; -+ -+ primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); -+ if (!primary) -+ return -ENOMEM; -+ -+ data = EXT3COW_I(inode)->i_data + EXT3COW_DIND_BLOCK; -+ dind = sb_bread(sb, le32_to_cpu(*data)); -+ if (!dind) { -+ err = -EIO; -+ goto exit_free; -+ } -+ -+ blk = EXT3COW_SB(sb)->s_sbh->b_blocknr + 1 + EXT3COW_SB(sb)->s_gdb_count; -+ data = (__le32 *)dind->b_data + EXT3COW_SB(sb)->s_gdb_count; -+ end = (__le32 *)dind->b_data + EXT3COW_ADDR_PER_BLOCK(sb); -+ -+ /* Get each reserved primary GDT block and verify it holds backups */ -+ for (res = 0; res < reserved_gdb; res++, blk++) { -+ if (le32_to_cpu(*data) != blk) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "reserved block "E3FSBLK -+ " not at offset %ld", -+ blk, -+ (long)(data - (__le32 *)dind->b_data)); -+ err = -EINVAL; -+ goto exit_bh; -+ } -+ primary[res] = sb_bread(sb, blk); -+ if (!primary[res]) { -+ err = -EIO; -+ goto exit_bh; -+ } -+ if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { -+ brelse(primary[res]); -+ err = gdbackups; -+ goto exit_bh; -+ } -+ if (++data >= end) -+ data = (__le32 *)dind->b_data; -+ } -+ -+ for (i = 0; i < reserved_gdb; i++) { -+ if ((err = ext3cow_journal_get_write_access(handle, primary[i]))) { -+ /* -+ int j; -+ for (j = 0; j < i; j++) -+ ext3cow_journal_release_buffer(handle, primary[j]); -+ */ -+ goto exit_bh; -+ } -+ } -+ -+ if ((err = ext3cow_reserve_inode_write(handle, inode, &iloc))) -+ goto exit_bh; -+ -+ /* -+ * Finally we can add each of the reserved backup GDT blocks from -+ * the new group to its reserved primary GDT block. -+ */ -+ blk = input->group * EXT3COW_BLOCKS_PER_GROUP(sb); -+ for (i = 0; i < reserved_gdb; i++) { -+ int err2; -+ data = (__le32 *)primary[i]->b_data; -+ /* printk("reserving backup %lu[%u] = %lu\n", -+ primary[i]->b_blocknr, gdbackups, -+ blk + primary[i]->b_blocknr); */ -+ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); -+ err2 = ext3cow_journal_dirty_metadata(handle, primary[i]); -+ if (!err) -+ err = err2; -+ } -+ inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; -+ ext3cow_mark_iloc_dirty(handle, inode, &iloc); -+ -+exit_bh: -+ while (--res >= 0) -+ brelse(primary[res]); -+ brelse(dind); -+ -+exit_free: -+ kfree(primary); -+ -+ return err; -+} -+ -+/* -+ * Update the backup copies of the ext3cow metadata. These don't need to be part -+ * of the main resize transaction, because e2fsck will re-write them if there -+ * is a problem (basically only OOM will cause a problem). However, we -+ * _should_ update the backups if possible, in case the primary gets trashed -+ * for some reason and we need to run e2fsck from a backup superblock. The -+ * important part is that the new block and inode counts are in the backup -+ * superblocks, and the location of the new group metadata in the GDT backups. -+ * -+ * We do not need lock_super() for this, because these blocks are not -+ * otherwise touched by the filesystem code when it is mounted. 
We don't -+ * need to worry about last changing from sbi->s_groups_count, because the -+ * worst that can happen is that we do not copy the full number of backups -+ * at this time. The resize which changed s_groups_count will backup again. -+ */ -+static void update_backups(struct super_block *sb, -+ int blk_off, char *data, int size) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ const unsigned long last = sbi->s_groups_count; -+ const int bpg = EXT3COW_BLOCKS_PER_GROUP(sb); -+ unsigned three = 1; -+ unsigned five = 5; -+ unsigned seven = 7; -+ unsigned group; -+ int rest = sb->s_blocksize - size; -+ handle_t *handle; -+ int err = 0, err2; -+ -+ handle = ext3cow_journal_start_sb(sb, EXT3COW_MAX_TRANS_DATA); -+ if (IS_ERR(handle)) { -+ group = 1; -+ err = PTR_ERR(handle); -+ goto exit_err; -+ } -+ -+ while ((group = ext3cow_list_backups(sb, &three, &five, &seven)) < last) { -+ struct buffer_head *bh; -+ -+ /* Out of journal space, and can't get more - abort - so sad */ -+ if (handle->h_buffer_credits == 0 && -+ ext3cow_journal_extend(handle, EXT3COW_MAX_TRANS_DATA) && -+ (err = ext3cow_journal_restart(handle, EXT3COW_MAX_TRANS_DATA))) -+ break; -+ -+ bh = sb_getblk(sb, group * bpg + blk_off); -+ if (!bh) { -+ err = -EIO; -+ break; -+ } -+ ext3cow_debug("update metadata backup %#04lx\n", -+ (unsigned long)bh->b_blocknr); -+ if ((err = ext3cow_journal_get_write_access(handle, bh))) -+ break; -+ lock_buffer(bh); -+ memcpy(bh->b_data, data, size); -+ if (rest) -+ memset(bh->b_data + size, 0, rest); -+ set_buffer_uptodate(bh); -+ unlock_buffer(bh); -+ ext3cow_journal_dirty_metadata(handle, bh); -+ brelse(bh); -+ } -+ if ((err2 = ext3cow_journal_stop(handle)) && !err) -+ err = err2; -+ -+ /* -+ * Ugh! Need to have e2fsck write the backup copies. It is too -+ * late to revert the resize, we shouldn't fail just because of -+ * the backup copies (they are only needed in case of corruption). -+ * -+ * However, if we got here we have a journal problem too, so we -+ * can't really start a transaction to mark the superblock. -+ * Chicken out and just set the flag on the hope it will be written -+ * to disk, and if not - we will simply wait until next fsck. -+ */ -+exit_err: -+ if (err) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "can't update backup for group %d (err %d), " -+ "forcing fsck on next reboot", group, err); -+ sbi->s_mount_state &= ~EXT3COW_VALID_FS; -+ sbi->s_es->s_state &= cpu_to_le16(~EXT3COW_VALID_FS); -+ mark_buffer_dirty(sbi->s_sbh); -+ } -+} -+ -+/* Add group descriptor data to an existing or new group descriptor block. -+ * Ensure we handle all possible error conditions _before_ we start modifying -+ * the filesystem, because we cannot abort the transaction and not have it -+ * write the data to disk. -+ * -+ * If we are on a GDT block boundary, we need to get the reserved GDT block. -+ * Otherwise, we may need to add backup GDT blocks for a sparse group. -+ * -+ * We only need to hold the superblock lock while we are actually adding -+ * in the new group's counts to the superblock. Prior to that we have -+ * not really "added" the group at all. We re-check that we are still -+ * adding in the last group in case things have changed since verifying. -+ */ -+int ext3cow_group_add(struct super_block *sb, struct ext3cow_new_group_data *input) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ struct ext3cow_super_block *es = sbi->s_es; -+ int reserved_gdb = ext3cow_bg_has_super(sb, input->group) ? 
-+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0; -+ struct buffer_head *primary = NULL; -+ struct ext3cow_group_desc *gdp; -+ struct inode *inode = NULL; -+ handle_t *handle; -+ int gdb_off, gdb_num; -+ int err, err2; -+ -+ gdb_num = input->group / EXT3COW_DESC_PER_BLOCK(sb); -+ gdb_off = input->group % EXT3COW_DESC_PER_BLOCK(sb); -+ -+ if (gdb_off == 0 && !EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER)) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "Can't resize non-sparse filesystem further"); -+ return -EPERM; -+ } -+ -+ if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < -+ le32_to_cpu(es->s_blocks_count)) { -+ ext3cow_warning(sb, __FUNCTION__, "blocks_count overflow\n"); -+ return -EINVAL; -+ } -+ -+ if (le32_to_cpu(es->s_inodes_count) + EXT3COW_INODES_PER_GROUP(sb) < -+ le32_to_cpu(es->s_inodes_count)) { -+ ext3cow_warning(sb, __FUNCTION__, "inodes_count overflow\n"); -+ return -EINVAL; -+ } -+ -+ if (reserved_gdb || gdb_off == 0) { -+ if (!EXT3COW_HAS_COMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_COMPAT_RESIZE_INODE)){ -+ ext3cow_warning(sb, __FUNCTION__, -+ "No reserved GDT blocks, can't resize"); -+ return -EPERM; -+ } -+ inode = iget(sb, EXT3COW_RESIZE_INO); -+ if (!inode || is_bad_inode(inode)) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "Error opening resize inode"); -+ iput(inode); -+ return -ENOENT; -+ } -+ } -+ -+ if ((err = verify_group_input(sb, input))) -+ goto exit_put; -+ -+ if ((err = setup_new_group_blocks(sb, input))) -+ goto exit_put; -+ -+ /* -+ * We will always be modifying at least the superblock and a GDT -+ * block. If we are adding a group past the last current GDT block, -+ * we will also modify the inode and the dindirect block. If we -+ * are adding a group with superblock/GDT backups we will also -+ * modify each of the reserved GDT dindirect blocks. -+ */ -+ handle = ext3cow_journal_start_sb(sb, -+ ext3cow_bg_has_super(sb, input->group) ? -+ 3 + reserved_gdb : 4); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ goto exit_put; -+ } -+ -+ lock_super(sb); -+ if (input->group != sbi->s_groups_count) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "multiple resizers run on filesystem!"); -+ err = -EBUSY; -+ goto exit_journal; -+ } -+ -+ if ((err = ext3cow_journal_get_write_access(handle, sbi->s_sbh))) -+ goto exit_journal; -+ -+ /* -+ * We will only either add reserved group blocks to a backup group -+ * or remove reserved blocks for the first group in a new group block. -+ * Doing both would be mean more complex code, and sane people don't -+ * use non-sparse filesystems anymore. This is already checked above. -+ */ -+ if (gdb_off) { -+ primary = sbi->s_group_desc[gdb_num]; -+ if ((err = ext3cow_journal_get_write_access(handle, primary))) -+ goto exit_journal; -+ -+ if (reserved_gdb && ext3cow_bg_num_gdb(sb, input->group) && -+ (err = reserve_backup_gdb(handle, inode, input))) -+ goto exit_journal; -+ } else if ((err = add_new_gdb(handle, inode, input, &primary))) -+ goto exit_journal; -+ -+ /* -+ * OK, now we've set up the new group. Time to make it active. -+ * -+ * Current kernels don't lock all allocations via lock_super(), -+ * so we have to be safe wrt. concurrent accesses the group -+ * data. So we need to be careful to set all of the relevant -+ * group descriptor data etc. *before* we enable the group. -+ * -+ * The key field here is sbi->s_groups_count: as long as -+ * that retains its old value, nobody is going to access the new -+ * group. 
-+ * -+ * So first we update all the descriptor metadata for the new -+ * group; then we update the total disk blocks count; then we -+ * update the groups count to enable the group; then finally we -+ * update the free space counts so that the system can start -+ * using the new disk blocks. -+ */ -+ -+ /* Update group descriptor block for new group */ -+ gdp = (struct ext3cow_group_desc *)primary->b_data + gdb_off; -+ -+ gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); -+ gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); -+ gdp->bg_inode_table = cpu_to_le32(input->inode_table); -+ gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); -+ gdp->bg_free_inodes_count = cpu_to_le16(EXT3COW_INODES_PER_GROUP(sb)); -+ -+ /* -+ * Make the new blocks and inodes valid next. We do this before -+ * increasing the group count so that once the group is enabled, -+ * all of its blocks and inodes are already valid. -+ * -+ * We always allocate group-by-group, then block-by-block or -+ * inode-by-inode within a group, so enabling these -+ * blocks/inodes before the group is live won't actually let us -+ * allocate the new space yet. -+ */ -+ es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) + -+ input->blocks_count); -+ es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + -+ EXT3COW_INODES_PER_GROUP(sb)); -+ -+ /* -+ * We need to protect s_groups_count against other CPUs seeing -+ * inconsistent state in the superblock. -+ * -+ * The precise rules we use are: -+ * -+ * * Writers of s_groups_count *must* hold lock_super -+ * AND -+ * * Writers must perform a smp_wmb() after updating all dependent -+ * data and before modifying the groups count -+ * -+ * * Readers must hold lock_super() over the access -+ * OR -+ * * Readers must perform an smp_rmb() after reading the groups count -+ * and before reading any dependent data. -+ * -+ * NB. These rules can be relaxed when checking the group count -+ * while freeing data, as we can only allocate from a block -+ * group after serialising against the group count, and we can -+ * only then free after serialising in turn against that -+ * allocation. -+ */ -+ smp_wmb(); -+ -+ /* Update the global fs size fields */ -+ sbi->s_groups_count++; -+ -+ ext3cow_journal_dirty_metadata(handle, primary); -+ -+ /* Update the reserved block counts only once the new group is -+ * active. */ -+ es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) + -+ input->reserved_blocks); -+ -+ /* Update the free space counts */ -+ percpu_counter_mod(&sbi->s_freeblocks_counter, -+ input->free_blocks_count); -+ percpu_counter_mod(&sbi->s_freeinodes_counter, -+ EXT3COW_INODES_PER_GROUP(sb)); -+ -+ ext3cow_journal_dirty_metadata(handle, sbi->s_sbh); -+ sb->s_dirt = 1; -+ -+exit_journal: -+ unlock_super(sb); -+ if ((err2 = ext3cow_journal_stop(handle)) && !err) -+ err = err2; -+ if (!err) { -+ update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, -+ sizeof(struct ext3cow_super_block)); -+ update_backups(sb, primary->b_blocknr, primary->b_data, -+ primary->b_size); -+ } -+exit_put: -+ iput(inode); -+ return err; -+} /* ext3cow_group_add */ -+ -+/* Extend the filesystem to the new number of blocks specified. This entry -+ * point is only used to extend the current filesystem to the end of the last -+ * existing group. It can be accessed via ioctl, or by "remount,resize=" -+ * for emergencies (because it has no dependencies on reserved blocks). 
-+ * -+ * If we _really_ wanted, we could use default values to call ext3cow_group_add() -+ * allow the "remount" trick to work for arbitrary resizing, assuming enough -+ * GDT blocks are reserved to grow to the desired size. -+ */ -+int ext3cow_group_extend(struct super_block *sb, struct ext3cow_super_block *es, -+ ext3cow_fsblk_t n_blocks_count) -+{ -+ ext3cow_fsblk_t o_blocks_count; -+ unsigned long o_groups_count; -+ ext3cow_grpblk_t last; -+ ext3cow_grpblk_t add; -+ struct buffer_head * bh; -+ handle_t *handle; -+ int err; -+ unsigned long freed_blocks; -+ -+ /* We don't need to worry about locking wrt other resizers just -+ * yet: we're going to revalidate es->s_blocks_count after -+ * taking lock_super() below. */ -+ o_blocks_count = le32_to_cpu(es->s_blocks_count); -+ o_groups_count = EXT3COW_SB(sb)->s_groups_count; -+ -+ if (test_opt(sb, DEBUG)) -+ printk(KERN_DEBUG "EXT3COW-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", -+ o_blocks_count, n_blocks_count); -+ -+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) -+ return 0; -+ -+ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3COW-fs: filesystem on %s:" -+ " too large to resize to %lu blocks safely\n", -+ sb->s_id, n_blocks_count); -+ if (sizeof(sector_t) < 8) -+ ext3cow_warning(sb, __FUNCTION__, -+ "CONFIG_LBD not enabled\n"); -+ return -EINVAL; -+ } -+ -+ if (n_blocks_count < o_blocks_count) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "can't shrink FS - resize aborted"); -+ return -EBUSY; -+ } -+ -+ /* Handle the remaining blocks in the last group only. */ -+ last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % -+ EXT3COW_BLOCKS_PER_GROUP(sb); -+ -+ if (last == 0) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "need to use ext2online to resize further"); -+ return -EPERM; -+ } -+ -+ add = EXT3COW_BLOCKS_PER_GROUP(sb) - last; -+ -+ if (o_blocks_count + add < o_blocks_count) { -+ ext3cow_warning(sb, __FUNCTION__, "blocks_count overflow"); -+ return -EINVAL; -+ } -+ -+ if (o_blocks_count + add > n_blocks_count) -+ add = n_blocks_count - o_blocks_count; -+ -+ if (o_blocks_count + add < n_blocks_count) -+ ext3cow_warning(sb, __FUNCTION__, -+ "will only finish group ("E3FSBLK -+ " blocks, %u new)", -+ o_blocks_count + add, add); -+ -+ /* See if the device is actually as big as what was requested */ -+ bh = sb_bread(sb, o_blocks_count + add -1); -+ if (!bh) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "can't read last block, resize aborted"); -+ return -ENOSPC; -+ } -+ brelse(bh); -+ -+ /* We will update the superblock, one block bitmap, and -+ * one group descriptor via ext3cow_free_blocks(). 
-+ */ -+ handle = ext3cow_journal_start_sb(sb, 3); -+ if (IS_ERR(handle)) { -+ err = PTR_ERR(handle); -+ ext3cow_warning(sb, __FUNCTION__, "error %d on journal start",err); -+ goto exit_put; -+ } -+ -+ lock_super(sb); -+ if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "multiple resizers run on filesystem!"); -+ unlock_super(sb); -+ err = -EBUSY; -+ goto exit_put; -+ } -+ -+ if ((err = ext3cow_journal_get_write_access(handle, -+ EXT3COW_SB(sb)->s_sbh))) { -+ ext3cow_warning(sb, __FUNCTION__, -+ "error %d on journal write access", err); -+ unlock_super(sb); -+ ext3cow_journal_stop(handle); -+ goto exit_put; -+ } -+ es->s_blocks_count = cpu_to_le32(o_blocks_count + add); -+ ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); -+ sb->s_dirt = 1; -+ unlock_super(sb); -+ ext3cow_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, -+ o_blocks_count + add); -+ ext3cow_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); -+ ext3cow_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, -+ o_blocks_count + add); -+ if ((err = ext3cow_journal_stop(handle))) -+ goto exit_put; -+ if (test_opt(sb, DEBUG)) -+ printk(KERN_DEBUG "EXT3COW-fs: extended group to %u blocks\n", -+ le32_to_cpu(es->s_blocks_count)); -+ update_backups(sb, EXT3COW_SB(sb)->s_sbh->b_blocknr, (char *)es, -+ sizeof(struct ext3cow_super_block)); -+exit_put: -+ return err; -+} /* ext3cow_group_extend */ -diff -ruN linux-2.6.20.3/fs/ext3cow/super.c linux-2.6.20.3-ext3cow/fs/ext3cow/super.c ---- linux-2.6.20.3/fs/ext3cow/super.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/super.c 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,2808 @@ -+/* -+ * linux/fs/ext3cow/super.c -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/inode.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * Big-endian to little-endian byte-swapping/bitmaps by -+ * David S. Miller (davem@caip.rutgers.edu), 1995 -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "xattr.h" -+#include "acl.h" -+#include "namei.h" -+ -+static int ext3cow_load_journal(struct super_block *, struct ext3cow_super_block *, -+ unsigned long journal_devnum); -+static int ext3cow_create_journal(struct super_block *, struct ext3cow_super_block *, -+ unsigned int); -+static void ext3cow_commit_super (struct super_block * sb, -+ struct ext3cow_super_block * es, -+ int sync); -+static void ext3cow_mark_recovery_complete(struct super_block * sb, -+ struct ext3cow_super_block * es); -+static void ext3cow_clear_journal_err(struct super_block * sb, -+ struct ext3cow_super_block * es); -+static int ext3cow_sync_fs(struct super_block *sb, int wait); -+static const char *ext3cow_decode_error(struct super_block * sb, int errno, -+ char nbuf[16]); -+static int ext3cow_remount (struct super_block * sb, int * flags, char * data); -+static int ext3cow_statfs (struct dentry * dentry, struct kstatfs * buf); -+static void ext3cow_unlockfs(struct super_block *sb); -+static void ext3cow_write_super (struct super_block * sb); -+static void ext3cow_write_super_lockfs(struct super_block *sb); -+ -+/* -+ * Wrappers for journal_start/end. 
-+ * -+ * The only special thing we need to do here is to make sure that all -+ * journal_end calls result in the superblock being marked dirty, so -+ * that sync() will call the filesystem's write_super callback if -+ * appropriate. -+ */ -+handle_t *ext3cow_journal_start_sb(struct super_block *sb, int nblocks) -+{ -+ journal_t *journal; -+ -+ if (sb->s_flags & MS_RDONLY) -+ return ERR_PTR(-EROFS); -+ -+ /* Special case here: if the journal has aborted behind our -+ * backs (eg. EIO in the commit thread), then we still need to -+ * take the FS itself readonly cleanly. */ -+ journal = EXT3COW_SB(sb)->s_journal; -+ if (is_journal_aborted(journal)) { -+ ext3cow_abort(sb, __FUNCTION__, -+ "Detected aborted journal"); -+ return ERR_PTR(-EROFS); -+ } -+ -+ return journal_start(journal, nblocks); -+} -+ -+/* -+ * The only special thing we need to do here is to make sure that all -+ * journal_stop calls result in the superblock being marked dirty, so -+ * that sync() will call the filesystem's write_super callback if -+ * appropriate. -+ */ -+int __ext3cow_journal_stop(const char *where, handle_t *handle) -+{ -+ struct super_block *sb; -+ int err; -+ int rc; -+ -+ sb = handle->h_transaction->t_journal->j_private; -+ err = handle->h_err; -+ rc = journal_stop(handle); -+ -+ if (!err) -+ err = rc; -+ if (err) -+ __ext3cow_std_error(sb, where, err); -+ return err; -+} -+ -+void ext3cow_journal_abort_handle(const char *caller, const char *err_fn, -+ struct buffer_head *bh, handle_t *handle, int err) -+{ -+ char nbuf[16]; -+ const char *errstr = ext3cow_decode_error(NULL, err, nbuf); -+ -+ if (bh) -+ BUFFER_TRACE(bh, "abort"); -+ -+ if (!handle->h_err) -+ handle->h_err = err; -+ -+ if (is_handle_aborted(handle)) -+ return; -+ -+ printk(KERN_ERR "%s: aborting transaction: %s in %s\n", -+ caller, errstr, err_fn); -+ -+ journal_abort_handle(handle); -+} -+ -+/* Deal with the reporting of failure conditions on a filesystem such as -+ * inconsistencies detected or read IO failures. -+ * -+ * On ext2, we can store the error state of the filesystem in the -+ * superblock. That is not possible on ext3cow, because we may have other -+ * write ordering constraints on the superblock which prevent us from -+ * writing it out straight away; and given that the journal is about to -+ * be aborted, we can't rely on the current, or future, transactions to -+ * write out the superblock safely. -+ * -+ * We'll just use the journal_abort() error code to record an error in -+ * the journal instead. On recovery, the journal will compain about -+ * that error until we've noted it down and cleared it. -+ */ -+ -+static void ext3cow_handle_error(struct super_block *sb) -+{ -+ struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; -+ -+ EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3COW_ERROR_FS); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ if (!test_opt (sb, ERRORS_CONT)) { -+ journal_t *journal = EXT3COW_SB(sb)->s_journal; -+ -+ EXT3COW_SB(sb)->s_mount_opt |= EXT3COW_MOUNT_ABORT; -+ if (journal) -+ journal_abort(journal, -EIO); -+ } -+ if (test_opt (sb, ERRORS_RO)) { -+ printk (KERN_CRIT "Remounting filesystem read-only\n"); -+ sb->s_flags |= MS_RDONLY; -+ } -+ ext3cow_commit_super(sb, es, 1); -+ if (test_opt(sb, ERRORS_PANIC)) -+ panic("EXT3COW-fs (device %s): panic forced after error\n", -+ sb->s_id); -+} -+ -+void ext3cow_error (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start(args, fmt); -+ printk(KERN_CRIT "EXT3COW-fs error (device %s): %s: ",sb->s_id, function); -+ vprintk(fmt, args); -+ printk("\n"); -+ va_end(args); -+ -+ ext3cow_handle_error(sb); -+} -+ -+static const char *ext3cow_decode_error(struct super_block * sb, int errno, -+ char nbuf[16]) -+{ -+ char *errstr = NULL; -+ -+ switch (errno) { -+ case -EIO: -+ errstr = "IO failure"; -+ break; -+ case -ENOMEM: -+ errstr = "Out of memory"; -+ break; -+ case -EROFS: -+ if (!sb || EXT3COW_SB(sb)->s_journal->j_flags & JFS_ABORT) -+ errstr = "Journal has aborted"; -+ else -+ errstr = "Readonly filesystem"; -+ break; -+ default: -+ /* If the caller passed in an extra buffer for unknown -+ * errors, textualise them now. Else we just return -+ * NULL. */ -+ if (nbuf) { -+ /* Check for truncated error codes... */ -+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) -+ errstr = nbuf; -+ } -+ break; -+ } -+ -+ return errstr; -+} -+ -+/* __ext3cow_std_error decodes expected errors from journaling functions -+ * automatically and invokes the appropriate error response. */ -+ -+void __ext3cow_std_error (struct super_block * sb, const char * function, -+ int errno) -+{ -+ char nbuf[16]; -+ const char *errstr; -+ -+ /* Special case: if the error is EROFS, and we're not already -+ * inside a transaction, then there's really no point in logging -+ * an error. */ -+ if (errno == -EROFS && journal_current_handle() == NULL && -+ (sb->s_flags & MS_RDONLY)) -+ return; -+ -+ errstr = ext3cow_decode_error(sb, errno, nbuf); -+ printk (KERN_CRIT "EXT3COW-fs error (device %s) in %s: %s\n", -+ sb->s_id, function, errstr); -+ -+ ext3cow_handle_error(sb); -+} -+ -+/* -+ * ext3cow_abort is a much stronger failure handler than ext3cow_error. The -+ * abort function may be used to deal with unrecoverable failures such -+ * as journal IO errors or ENOMEM at a critical moment in log management. -+ * -+ * We unconditionally force the filesystem into an ABORT|READONLY state, -+ * unless the error response on the fs has been set to panic in which -+ * case we take the easy way out and panic immediately. -+ */ -+ -+void ext3cow_abort (struct super_block * sb, const char * function, -+ const char * fmt, ...) -+{ -+ va_list args; -+ -+ printk (KERN_CRIT "ext3cow_abort called.\n"); -+ -+ va_start(args, fmt); -+ printk(KERN_CRIT "EXT3COW-fs error (device %s): %s: ",sb->s_id, function); -+ vprintk(fmt, args); -+ printk("\n"); -+ va_end(args); -+ -+ if (test_opt(sb, ERRORS_PANIC)) -+ panic("EXT3COW-fs panic from previous error\n"); -+ -+ if (sb->s_flags & MS_RDONLY) -+ return; -+ -+ printk(KERN_CRIT "Remounting filesystem read-only\n"); -+ EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; -+ sb->s_flags |= MS_RDONLY; -+ EXT3COW_SB(sb)->s_mount_opt |= EXT3COW_MOUNT_ABORT; -+ journal_abort(EXT3COW_SB(sb)->s_journal, -EIO); -+} -+ -+void ext3cow_warning (struct super_block * sb, const char * function, -+ const char * fmt, ...) 
-+{ -+ va_list args; -+ -+ va_start(args, fmt); -+ printk(KERN_WARNING "EXT3COW-fs warning (device %s): %s: ", -+ sb->s_id, function); -+ vprintk(fmt, args); -+ printk("\n"); -+ va_end(args); -+} -+ -+void ext3cow_update_dynamic_rev(struct super_block *sb) -+{ -+ struct ext3cow_super_block *es = EXT3COW_SB(sb)->s_es; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3COW_GOOD_OLD_REV) -+ return; -+ -+ ext3cow_warning(sb, __FUNCTION__, -+ "updating to rev %d because of new feature flag, " -+ "running e2fsck is recommended", -+ EXT3COW_DYNAMIC_REV); -+ -+ es->s_first_ino = cpu_to_le32(EXT3COW_GOOD_OLD_FIRST_INO); -+ es->s_inode_size = cpu_to_le16(EXT3COW_GOOD_OLD_INODE_SIZE); -+ es->s_rev_level = cpu_to_le32(EXT3COW_DYNAMIC_REV); -+ /* leave es->s_feature_*compat flags alone */ -+ /* es->s_uuid will be set by e2fsck if empty */ -+ -+ /* -+ * The rest of the superblock fields should be zero, and if not it -+ * means they are likely already in use, so leave them alone. We -+ * can leave it up to e2fsck to clean up any inconsistencies there. -+ */ -+} -+ -+/* -+ * Open the external journal device -+ */ -+static struct block_device *ext3cow_blkdev_get(dev_t dev) -+{ -+ struct block_device *bdev; -+ char b[BDEVNAME_SIZE]; -+ -+ bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); -+ if (IS_ERR(bdev)) -+ goto fail; -+ return bdev; -+ -+fail: -+ printk(KERN_ERR "EXT3COW: failed to open journal device %s: %ld\n", -+ __bdevname(dev, b), PTR_ERR(bdev)); -+ return NULL; -+} -+ -+/* -+ * Release the journal device -+ */ -+static int ext3cow_blkdev_put(struct block_device *bdev) -+{ -+ bd_release(bdev); -+ return blkdev_put(bdev); -+} -+ -+static int ext3cow_blkdev_remove(struct ext3cow_sb_info *sbi) -+{ -+ struct block_device *bdev; -+ int ret = -ENODEV; -+ -+ bdev = sbi->journal_bdev; -+ if (bdev) { -+ ret = ext3cow_blkdev_put(bdev); -+ sbi->journal_bdev = NULL; -+ } -+ return ret; -+} -+ -+static inline struct inode *orphan_list_entry(struct list_head *l) -+{ -+ return &list_entry(l, struct ext3cow_inode_info, i_orphan)->vfs_inode; -+} -+ -+static void dump_orphan_list(struct super_block *sb, struct ext3cow_sb_info *sbi) -+{ -+ struct list_head *l; -+ -+ printk(KERN_ERR "sb orphan head is %d\n", -+ le32_to_cpu(sbi->s_es->s_last_orphan)); -+ -+ printk(KERN_ERR "sb_info orphan list:\n"); -+ list_for_each(l, &sbi->s_orphan) { -+ struct inode *inode = orphan_list_entry(l); -+ printk(KERN_ERR " " -+ "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", -+ inode->i_sb->s_id, inode->i_ino, inode, -+ inode->i_mode, inode->i_nlink, -+ NEXT_ORPHAN(inode)); -+ } -+} -+ -+static void ext3cow_put_super (struct super_block * sb) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ struct ext3cow_super_block *es = sbi->s_es; -+ int i; -+ -+ ext3cow_xattr_put_super(sb); -+ journal_destroy(sbi->s_journal); -+ if (!(sb->s_flags & MS_RDONLY)) { -+ EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ es->s_state = cpu_to_le16(sbi->s_mount_state); -+ BUFFER_TRACE(sbi->s_sbh, "marking dirty"); -+ mark_buffer_dirty(sbi->s_sbh); -+ ext3cow_commit_super(sb, es, 1); -+ } -+ -+ for (i = 0; i < sbi->s_gdb_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+ percpu_counter_destroy(&sbi->s_freeblocks_counter); -+ percpu_counter_destroy(&sbi->s_freeinodes_counter); -+ percpu_counter_destroy(&sbi->s_dirs_counter); -+ brelse(sbi->s_sbh); -+#ifdef CONFIG_QUOTA -+ for (i = 0; i < MAXQUOTAS; i++) -+ kfree(sbi->s_qf_names[i]); -+#endif -+ -+ /* Debugging code just in case the in-memory inode orphan 
list -+ * isn't empty. The on-disk one can be non-empty if we've -+ * detected an error and taken the fs readonly, but the -+ * in-memory list had better be clean by this point. */ -+ if (!list_empty(&sbi->s_orphan)) -+ dump_orphan_list(sb, sbi); -+ J_ASSERT(list_empty(&sbi->s_orphan)); -+ -+ invalidate_bdev(sb->s_bdev, 0); -+ if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { -+ /* -+ * Invalidate the journal device's buffers. We don't want them -+ * floating about in memory - the physical journal device may -+ * hotswapped, and it breaks the `ro-after' testing code. -+ */ -+ sync_blockdev(sbi->journal_bdev); -+ invalidate_bdev(sbi->journal_bdev, 0); -+ ext3cow_blkdev_remove(sbi); -+ } -+ sb->s_fs_info = NULL; -+ kfree(sbi); -+ return; -+} -+ -+static struct kmem_cache *ext3cow_inode_cachep; -+ -+/* -+ * Called inside transaction, so use GFP_NOFS -+ */ -+static struct inode *ext3cow_alloc_inode(struct super_block *sb) -+{ -+ struct ext3cow_inode_info *ei; -+ -+ ei = kmem_cache_alloc(ext3cow_inode_cachep, GFP_NOFS); -+ if (!ei) -+ return NULL; -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ ei->i_acl = EXT3COW_ACL_NOT_CACHED; -+ ei->i_default_acl = EXT3COW_ACL_NOT_CACHED; -+#endif -+ ei->i_block_alloc_info = NULL; -+ ei->vfs_inode.i_version = 1; -+ return &ei->vfs_inode; -+} -+ -+static void ext3cow_destroy_inode(struct inode *inode) -+{ -+ kmem_cache_free(ext3cow_inode_cachep, EXT3COW_I(inode)); -+} -+ -+static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags) -+{ -+ struct ext3cow_inode_info *ei = (struct ext3cow_inode_info *) foo; -+ -+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == -+ SLAB_CTOR_CONSTRUCTOR) { -+ INIT_LIST_HEAD(&ei->i_orphan); -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ init_rwsem(&ei->xattr_sem); -+#endif -+ mutex_init(&ei->truncate_mutex); -+ inode_init_once(&ei->vfs_inode); -+ } -+} -+ -+static int init_inodecache(void) -+{ -+ ext3cow_inode_cachep = kmem_cache_create("ext3cow_inode_cache", -+ sizeof(struct ext3cow_inode_info), -+ 0, (SLAB_RECLAIM_ACCOUNT| -+ SLAB_MEM_SPREAD), -+ init_once, NULL); -+ if (ext3cow_inode_cachep == NULL) -+ return -ENOMEM; -+ return 0; -+} -+ -+static void destroy_inodecache(void) -+{ -+ kmem_cache_destroy(ext3cow_inode_cachep); -+} -+ -+static void ext3cow_clear_inode(struct inode *inode) -+{ -+ struct ext3cow_block_alloc_info *rsv = EXT3COW_I(inode)->i_block_alloc_info; -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ if (EXT3COW_I(inode)->i_acl && -+ EXT3COW_I(inode)->i_acl != EXT3COW_ACL_NOT_CACHED) { -+ posix_acl_release(EXT3COW_I(inode)->i_acl); -+ EXT3COW_I(inode)->i_acl = EXT3COW_ACL_NOT_CACHED; -+ } -+ if (EXT3COW_I(inode)->i_default_acl && -+ EXT3COW_I(inode)->i_default_acl != EXT3COW_ACL_NOT_CACHED) { -+ posix_acl_release(EXT3COW_I(inode)->i_default_acl); -+ EXT3COW_I(inode)->i_default_acl = EXT3COW_ACL_NOT_CACHED; -+ } -+#endif -+ ext3cow_discard_reservation(inode); -+ EXT3COW_I(inode)->i_block_alloc_info = NULL; -+ if (unlikely(rsv)) -+ kfree(rsv); -+} -+ -+static inline void ext3cow_show_quota_options(struct seq_file *seq, struct super_block *sb) -+{ -+#if defined(CONFIG_QUOTA) -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ -+ if (sbi->s_jquota_fmt) -+ seq_printf(seq, ",jqfmt=%s", -+ (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? 
"vfsold": "vfsv0"); -+ -+ if (sbi->s_qf_names[USRQUOTA]) -+ seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); -+ -+ if (sbi->s_qf_names[GRPQUOTA]) -+ seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); -+ -+ if (sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA) -+ seq_puts(seq, ",usrquota"); -+ -+ if (sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA) -+ seq_puts(seq, ",grpquota"); -+#endif -+} -+ -+static int ext3cow_show_options(struct seq_file *seq, struct vfsmount *vfs) -+{ -+ struct super_block *sb = vfs->mnt_sb; -+ -+ if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA) -+ seq_puts(seq, ",data=journal"); -+ else if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA) -+ seq_puts(seq, ",data=ordered"); -+ else if (test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA) -+ seq_puts(seq, ",data=writeback"); -+ -+ ext3cow_show_quota_options(seq, sb); -+ -+ return 0; -+} -+ -+ -+static struct dentry *ext3cow_get_dentry(struct super_block *sb, void *vobjp) -+{ -+ __u32 *objp = vobjp; -+ unsigned long ino = objp[0]; -+ __u32 generation = objp[1]; -+ struct inode *inode; -+ struct dentry *result; -+ -+ if (ino < EXT3COW_FIRST_INO(sb) && ino != EXT3COW_ROOT_INO) -+ return ERR_PTR(-ESTALE); -+ if (ino > le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count)) -+ return ERR_PTR(-ESTALE); -+ -+ /* iget isn't really right if the inode is currently unallocated!! -+ * -+ * ext3cow_read_inode will return a bad_inode if the inode had been -+ * deleted, so we should be safe. -+ * -+ * Currently we don't know the generation for parent directory, so -+ * a generation of 0 means "accept any" -+ */ -+ inode = iget(sb, ino); -+ if (inode == NULL) -+ return ERR_PTR(-ENOMEM); -+ if (is_bad_inode(inode) || -+ (generation && inode->i_generation != generation)) { -+ iput(inode); -+ return ERR_PTR(-ESTALE); -+ } -+ /* now to find a dentry. 
-+ * If possible, get a well-connected one -+ */ -+ result = d_alloc_anon(inode); -+ if (!result) { -+ iput(inode); -+ return ERR_PTR(-ENOMEM); -+ } -+ return result; -+} -+ -+#ifdef CONFIG_QUOTA -+#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -+#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) -+ -+static int ext3cow_dquot_initialize(struct inode *inode, int type); -+static int ext3cow_dquot_drop(struct inode *inode); -+static int ext3cow_write_dquot(struct dquot *dquot); -+static int ext3cow_acquire_dquot(struct dquot *dquot); -+static int ext3cow_release_dquot(struct dquot *dquot); -+static int ext3cow_mark_dquot_dirty(struct dquot *dquot); -+static int ext3cow_write_info(struct super_block *sb, int type); -+static int ext3cow_quota_on(struct super_block *sb, int type, int format_id, char *path); -+static int ext3cow_quota_on_mount(struct super_block *sb, int type); -+static ssize_t ext3cow_quota_read(struct super_block *sb, int type, char *data, -+ size_t len, loff_t off); -+static ssize_t ext3cow_quota_write(struct super_block *sb, int type, -+ const char *data, size_t len, loff_t off); -+ -+static struct dquot_operations ext3cow_quota_operations = { -+ .initialize = ext3cow_dquot_initialize, -+ .drop = ext3cow_dquot_drop, -+ .alloc_space = dquot_alloc_space, -+ .alloc_inode = dquot_alloc_inode, -+ .free_space = dquot_free_space, -+ .free_inode = dquot_free_inode, -+ .transfer = dquot_transfer, -+ .write_dquot = ext3cow_write_dquot, -+ .acquire_dquot = ext3cow_acquire_dquot, -+ .release_dquot = ext3cow_release_dquot, -+ .mark_dirty = ext3cow_mark_dquot_dirty, -+ .write_info = ext3cow_write_info -+}; -+ -+static struct quotactl_ops ext3cow_qctl_operations = { -+ .quota_on = ext3cow_quota_on, -+ .quota_off = vfs_quota_off, -+ .quota_sync = vfs_quota_sync, -+ .get_info = vfs_get_dqinfo, -+ .set_info = vfs_set_dqinfo, -+ .get_dqblk = vfs_get_dqblk, -+ .set_dqblk = vfs_set_dqblk -+}; -+#endif -+ -+static struct super_operations ext3cow_sops = { -+ .alloc_inode = ext3cow_alloc_inode, -+ .destroy_inode = ext3cow_destroy_inode, -+ .read_inode = ext3cow_read_inode, -+ .write_inode = ext3cow_write_inode, -+ .dirty_inode = ext3cow_dirty_inode, -+ .delete_inode = ext3cow_delete_inode, -+ .put_super = ext3cow_put_super, -+ .write_super = ext3cow_write_super, -+ .sync_fs = ext3cow_sync_fs, -+ .write_super_lockfs = ext3cow_write_super_lockfs, -+ .unlockfs = ext3cow_unlockfs, -+ .statfs = ext3cow_statfs, -+ .remount_fs = ext3cow_remount, -+ .clear_inode = ext3cow_clear_inode, -+ .show_options = ext3cow_show_options, -+#ifdef CONFIG_QUOTA -+ .quota_read = ext3cow_quota_read, -+ .quota_write = ext3cow_quota_write, -+#endif -+}; -+ -+static struct export_operations ext3cow_export_ops = { -+ .get_parent = ext3cow_get_parent, -+ .get_dentry = ext3cow_get_dentry, -+}; -+ -+enum { -+ Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, -+ Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, -+ Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, -+ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, -+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, -+ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, -+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, -+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, -+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, -+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, -+ Opt_grpquota -+}; -+ -+static match_table_t 
tokens = { -+ {Opt_bsd_df, "bsddf"}, -+ {Opt_minix_df, "minixdf"}, -+ {Opt_grpid, "grpid"}, -+ {Opt_grpid, "bsdgroups"}, -+ {Opt_nogrpid, "nogrpid"}, -+ {Opt_nogrpid, "sysvgroups"}, -+ {Opt_resgid, "resgid=%u"}, -+ {Opt_resuid, "resuid=%u"}, -+ {Opt_sb, "sb=%u"}, -+ {Opt_err_cont, "errors=continue"}, -+ {Opt_err_panic, "errors=panic"}, -+ {Opt_err_ro, "errors=remount-ro"}, -+ {Opt_nouid32, "nouid32"}, -+ {Opt_nocheck, "nocheck"}, -+ {Opt_nocheck, "check=none"}, -+ {Opt_debug, "debug"}, -+ {Opt_oldalloc, "oldalloc"}, -+ {Opt_orlov, "orlov"}, -+ {Opt_user_xattr, "user_xattr"}, -+ {Opt_nouser_xattr, "nouser_xattr"}, -+ {Opt_acl, "acl"}, -+ {Opt_noacl, "noacl"}, -+ {Opt_reservation, "reservation"}, -+ {Opt_noreservation, "noreservation"}, -+ {Opt_noload, "noload"}, -+ {Opt_nobh, "nobh"}, -+ {Opt_bh, "bh"}, -+ {Opt_commit, "commit=%u"}, -+ {Opt_journal_update, "journal=update"}, -+ {Opt_journal_inum, "journal=%u"}, -+ {Opt_journal_dev, "journal_dev=%u"}, -+ {Opt_abort, "abort"}, -+ {Opt_data_journal, "data=journal"}, -+ {Opt_data_ordered, "data=ordered"}, -+ {Opt_data_writeback, "data=writeback"}, -+ {Opt_offusrjquota, "usrjquota="}, -+ {Opt_usrjquota, "usrjquota=%s"}, -+ {Opt_offgrpjquota, "grpjquota="}, -+ {Opt_grpjquota, "grpjquota=%s"}, -+ {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, -+ {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, -+ {Opt_grpquota, "grpquota"}, -+ {Opt_noquota, "noquota"}, -+ {Opt_quota, "quota"}, -+ {Opt_usrquota, "usrquota"}, -+ {Opt_barrier, "barrier=%u"}, -+ {Opt_err, NULL}, -+ {Opt_resize, "resize"}, -+}; -+ -+static ext3cow_fsblk_t get_sb_block(void **data) -+{ -+ ext3cow_fsblk_t sb_block; -+ char *options = (char *) *data; -+ -+ if (!options || strncmp(options, "sb=", 3) != 0) -+ return 1; /* Default location */ -+ options += 3; -+ /*todo: use simple_strtoll with >32bit ext3cow */ -+ sb_block = simple_strtoul(options, &options, 0); -+ if (*options && *options != ',') { -+ printk("EXT3COW-fs: Invalid sb specification: %s\n", -+ (char *) *data); -+ return 1; -+ } -+ if (*options == ',') -+ options++; -+ *data = (void *) options; -+ return sb_block; -+} -+ -+static int parse_options (char *options, struct super_block *sb, -+ unsigned int *inum, unsigned long *journal_devnum, -+ ext3cow_fsblk_t *n_blocks_count, int is_remount) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ char * p; -+ substring_t args[MAX_OPT_ARGS]; -+ int data_opt = 0; -+ int option; -+#ifdef CONFIG_QUOTA -+ int qtype; -+ char *qname; -+#endif -+ -+ if (!options) -+ return 1; -+ -+ while ((p = strsep (&options, ",")) != NULL) { -+ int token; -+ if (!*p) -+ continue; -+ -+ token = match_token(p, tokens, args); -+ switch (token) { -+ case Opt_bsd_df: -+ clear_opt (sbi->s_mount_opt, MINIX_DF); -+ break; -+ case Opt_minix_df: -+ set_opt (sbi->s_mount_opt, MINIX_DF); -+ break; -+ case Opt_grpid: -+ set_opt (sbi->s_mount_opt, GRPID); -+ break; -+ case Opt_nogrpid: -+ clear_opt (sbi->s_mount_opt, GRPID); -+ break; -+ case Opt_resuid: -+ if (match_int(&args[0], &option)) -+ return 0; -+ sbi->s_resuid = option; -+ break; -+ case Opt_resgid: -+ if (match_int(&args[0], &option)) -+ return 0; -+ sbi->s_resgid = option; -+ break; -+ case Opt_sb: -+ /* handled by get_sb_block() instead of here */ -+ /* *sb_block = match_int(&args[0]); */ -+ break; -+ case Opt_err_panic: -+ clear_opt (sbi->s_mount_opt, ERRORS_CONT); -+ clear_opt (sbi->s_mount_opt, ERRORS_RO); -+ set_opt (sbi->s_mount_opt, ERRORS_PANIC); -+ break; -+ case Opt_err_ro: -+ clear_opt (sbi->s_mount_opt, ERRORS_CONT); -+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC); -+ 
set_opt (sbi->s_mount_opt, ERRORS_RO); -+ break; -+ case Opt_err_cont: -+ clear_opt (sbi->s_mount_opt, ERRORS_RO); -+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC); -+ set_opt (sbi->s_mount_opt, ERRORS_CONT); -+ break; -+ case Opt_nouid32: -+ set_opt (sbi->s_mount_opt, NO_UID32); -+ break; -+ case Opt_nocheck: -+ clear_opt (sbi->s_mount_opt, CHECK); -+ break; -+ case Opt_debug: -+ set_opt (sbi->s_mount_opt, DEBUG); -+ break; -+ case Opt_oldalloc: -+ set_opt (sbi->s_mount_opt, OLDALLOC); -+ break; -+ case Opt_orlov: -+ clear_opt (sbi->s_mount_opt, OLDALLOC); -+ break; -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ case Opt_user_xattr: -+ set_opt (sbi->s_mount_opt, XATTR_USER); -+ break; -+ case Opt_nouser_xattr: -+ clear_opt (sbi->s_mount_opt, XATTR_USER); -+ break; -+#else -+ case Opt_user_xattr: -+ case Opt_nouser_xattr: -+ printk("EXT3COW (no)user_xattr options not supported\n"); -+ break; -+#endif -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ case Opt_acl: -+ set_opt(sbi->s_mount_opt, POSIX_ACL); -+ break; -+ case Opt_noacl: -+ clear_opt(sbi->s_mount_opt, POSIX_ACL); -+ break; -+#else -+ case Opt_acl: -+ case Opt_noacl: -+ printk("EXT3COW (no)acl options not supported\n"); -+ break; -+#endif -+ case Opt_reservation: -+ set_opt(sbi->s_mount_opt, RESERVATION); -+ break; -+ case Opt_noreservation: -+ clear_opt(sbi->s_mount_opt, RESERVATION); -+ break; -+ case Opt_journal_update: -+ /* @@@ FIXME */ -+ /* Eventually we will want to be able to create -+ a journal file here. For now, only allow the -+ user to specify an existing inode to be the -+ journal file. */ -+ if (is_remount) { -+ printk(KERN_ERR "EXT3COW-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); -+ break; -+ case Opt_journal_inum: -+ if (is_remount) { -+ printk(KERN_ERR "EXT3COW-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ if (match_int(&args[0], &option)) -+ return 0; -+ *inum = option; -+ break; -+ case Opt_journal_dev: -+ if (is_remount) { -+ printk(KERN_ERR "EXT3COW-fs: cannot specify " -+ "journal on remount\n"); -+ return 0; -+ } -+ if (match_int(&args[0], &option)) -+ return 0; -+ *journal_devnum = option; -+ break; -+ case Opt_noload: -+ set_opt (sbi->s_mount_opt, NOLOAD); -+ break; -+ case Opt_commit: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option < 0) -+ return 0; -+ if (option == 0) -+ option = JBD_DEFAULT_MAX_COMMIT_AGE; -+ sbi->s_commit_interval = HZ * option; -+ break; -+ case Opt_data_journal: -+ data_opt = EXT3COW_MOUNT_JOURNAL_DATA; -+ goto datacheck; -+ case Opt_data_ordered: -+ data_opt = EXT3COW_MOUNT_ORDERED_DATA; -+ goto datacheck; -+ case Opt_data_writeback: -+ data_opt = EXT3COW_MOUNT_WRITEBACK_DATA; -+ datacheck: -+ if (is_remount) { -+ if ((sbi->s_mount_opt & EXT3COW_MOUNT_DATA_FLAGS) -+ != data_opt) { -+ printk(KERN_ERR -+ "EXT3COW-fs: cannot change data " -+ "mode on remount\n"); -+ return 0; -+ } -+ } else { -+ sbi->s_mount_opt &= ~EXT3COW_MOUNT_DATA_FLAGS; -+ sbi->s_mount_opt |= data_opt; -+ } -+ break; -+#ifdef CONFIG_QUOTA -+ case Opt_usrjquota: -+ qtype = USRQUOTA; -+ goto set_qf_name; -+ case Opt_grpjquota: -+ qtype = GRPQUOTA; -+set_qf_name: -+ if (sb_any_quota_enabled(sb)) { -+ printk(KERN_ERR -+ "EXT3COW-fs: Cannot change journalled " -+ "quota options when quota turned on.\n"); -+ return 0; -+ } -+ qname = match_strdup(&args[0]); -+ if (!qname) { -+ printk(KERN_ERR -+ "EXT3COW-fs: not enough memory for " -+ "storing quotafile name.\n"); -+ return 0; -+ } -+ if (sbi->s_qf_names[qtype] && -+ 
strcmp(sbi->s_qf_names[qtype], qname)) { -+ printk(KERN_ERR -+ "EXT3COW-fs: %s quota file already " -+ "specified.\n", QTYPE2NAME(qtype)); -+ kfree(qname); -+ return 0; -+ } -+ sbi->s_qf_names[qtype] = qname; -+ if (strchr(sbi->s_qf_names[qtype], '/')) { -+ printk(KERN_ERR -+ "EXT3COW-fs: quotafile must be on " -+ "filesystem root.\n"); -+ kfree(sbi->s_qf_names[qtype]); -+ sbi->s_qf_names[qtype] = NULL; -+ return 0; -+ } -+ set_opt(sbi->s_mount_opt, QUOTA); -+ break; -+ case Opt_offusrjquota: -+ qtype = USRQUOTA; -+ goto clear_qf_name; -+ case Opt_offgrpjquota: -+ qtype = GRPQUOTA; -+clear_qf_name: -+ if (sb_any_quota_enabled(sb)) { -+ printk(KERN_ERR "EXT3COW-fs: Cannot change " -+ "journalled quota options when " -+ "quota turned on.\n"); -+ return 0; -+ } -+ /* -+ * The space will be released later when all options -+ * are confirmed to be correct -+ */ -+ sbi->s_qf_names[qtype] = NULL; -+ break; -+ case Opt_jqfmt_vfsold: -+ sbi->s_jquota_fmt = QFMT_VFS_OLD; -+ break; -+ case Opt_jqfmt_vfsv0: -+ sbi->s_jquota_fmt = QFMT_VFS_V0; -+ break; -+ case Opt_quota: -+ case Opt_usrquota: -+ set_opt(sbi->s_mount_opt, QUOTA); -+ set_opt(sbi->s_mount_opt, USRQUOTA); -+ break; -+ case Opt_grpquota: -+ set_opt(sbi->s_mount_opt, QUOTA); -+ set_opt(sbi->s_mount_opt, GRPQUOTA); -+ break; -+ case Opt_noquota: -+ if (sb_any_quota_enabled(sb)) { -+ printk(KERN_ERR "EXT3COW-fs: Cannot change quota " -+ "options when quota turned on.\n"); -+ return 0; -+ } -+ clear_opt(sbi->s_mount_opt, QUOTA); -+ clear_opt(sbi->s_mount_opt, USRQUOTA); -+ clear_opt(sbi->s_mount_opt, GRPQUOTA); -+ break; -+#else -+ case Opt_quota: -+ case Opt_usrquota: -+ case Opt_grpquota: -+ case Opt_usrjquota: -+ case Opt_grpjquota: -+ case Opt_offusrjquota: -+ case Opt_offgrpjquota: -+ case Opt_jqfmt_vfsold: -+ case Opt_jqfmt_vfsv0: -+ printk(KERN_ERR -+ "EXT3COW-fs: journalled quota options not " -+ "supported.\n"); -+ break; -+ case Opt_noquota: -+ break; -+#endif -+ case Opt_abort: -+ set_opt(sbi->s_mount_opt, ABORT); -+ break; -+ case Opt_barrier: -+ if (match_int(&args[0], &option)) -+ return 0; -+ if (option) -+ set_opt(sbi->s_mount_opt, BARRIER); -+ else -+ clear_opt(sbi->s_mount_opt, BARRIER); -+ break; -+ case Opt_ignore: -+ break; -+ case Opt_resize: -+ if (!is_remount) { -+ printk("EXT3COW-fs: resize option only available " -+ "for remount\n"); -+ return 0; -+ } -+ if (match_int(&args[0], &option) != 0) -+ return 0; -+ *n_blocks_count = option; -+ break; -+ case Opt_nobh: -+ set_opt(sbi->s_mount_opt, NOBH); -+ break; -+ case Opt_bh: -+ clear_opt(sbi->s_mount_opt, NOBH); -+ break; -+ default: -+ printk (KERN_ERR -+ "EXT3COW-fs: Unrecognized mount option \"%s\" " -+ "or missing value\n", p); -+ return 0; -+ } -+ } -+#ifdef CONFIG_QUOTA -+ if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { -+ if ((sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA) && -+ sbi->s_qf_names[USRQUOTA]) -+ clear_opt(sbi->s_mount_opt, USRQUOTA); -+ -+ if ((sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA) && -+ sbi->s_qf_names[GRPQUOTA]) -+ clear_opt(sbi->s_mount_opt, GRPQUOTA); -+ -+ if ((sbi->s_qf_names[USRQUOTA] && -+ (sbi->s_mount_opt & EXT3COW_MOUNT_GRPQUOTA)) || -+ (sbi->s_qf_names[GRPQUOTA] && -+ (sbi->s_mount_opt & EXT3COW_MOUNT_USRQUOTA))) { -+ printk(KERN_ERR "EXT3COW-fs: old and new quota " -+ "format mixing.\n"); -+ return 0; -+ } -+ -+ if (!sbi->s_jquota_fmt) { -+ printk(KERN_ERR "EXT3COW-fs: journalled quota format " -+ "not specified.\n"); -+ return 0; -+ } -+ } else { -+ if (sbi->s_jquota_fmt) { -+ printk(KERN_ERR "EXT3COW-fs: journalled quota 
format " -+ "specified with no journalling " -+ "enabled.\n"); -+ return 0; -+ } -+ } -+#endif -+ return 1; -+} -+ -+static int ext3cow_setup_super(struct super_block *sb, struct ext3cow_super_block *es, -+ int read_only) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ int res = 0; -+ -+ if (le32_to_cpu(es->s_rev_level) > EXT3COW_MAX_SUPP_REV) { -+ printk (KERN_ERR "EXT3COW-fs warning: revision level too high, " -+ "forcing read-only mode\n"); -+ res = MS_RDONLY; -+ } -+ if (read_only) -+ return res; -+ if (!(sbi->s_mount_state & EXT3COW_VALID_FS)) -+ printk (KERN_WARNING "EXT3COW-fs warning: mounting unchecked fs, " -+ "running e2fsck is recommended\n"); -+ else if ((sbi->s_mount_state & EXT3COW_ERROR_FS)) -+ printk (KERN_WARNING -+ "EXT3COW-fs warning: mounting fs with errors, " -+ "running e2fsck is recommended\n"); -+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && -+ le16_to_cpu(es->s_mnt_count) >= -+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) -+ printk (KERN_WARNING -+ "EXT3COW-fs warning: maximal mount count reached, " -+ "running e2fsck is recommended\n"); -+ else if (le32_to_cpu(es->s_checkinterval) && -+ (le32_to_cpu(es->s_lastcheck) + -+ le32_to_cpu(es->s_checkinterval) <= get_seconds())) -+ printk (KERN_WARNING -+ "EXT3COW-fs warning: checktime reached, " -+ "running e2fsck is recommended\n"); -+#if 0 -+ /* @@@ We _will_ want to clear the valid bit if we find -+ inconsistencies, to force a fsck at reboot. But for -+ a plain journaled filesystem we can keep it set as -+ valid forever! :) */ -+ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3COW_VALID_FS); -+#endif -+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) -+ es->s_max_mnt_count = cpu_to_le16(EXT3COW_DFL_MAX_MNT_COUNT); -+ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); -+ es->s_mtime = cpu_to_le32(get_seconds()); -+ ext3cow_update_dynamic_rev(sb); -+ EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ -+ ext3cow_commit_super(sb, es, 1); -+ if (test_opt(sb, DEBUG)) -+ printk(KERN_INFO "[EXT3COW FS bs=%lu, gc=%lu, " -+ "bpg=%lu, ipg=%lu, mo=%04lx]\n", -+ sb->s_blocksize, -+ sbi->s_groups_count, -+ EXT3COW_BLOCKS_PER_GROUP(sb), -+ EXT3COW_INODES_PER_GROUP(sb), -+ sbi->s_mount_opt); -+ -+ printk(KERN_INFO "EXT3COW FS on %s, ", sb->s_id); -+ if (EXT3COW_SB(sb)->s_journal->j_inode == NULL) { -+ char b[BDEVNAME_SIZE]; -+ -+ printk("external journal on %s\n", -+ bdevname(EXT3COW_SB(sb)->s_journal->j_dev, b)); -+ } else { -+ printk("internal journal\n"); -+ } -+ return res; -+} -+ -+/* Called at mount-time, super-block is locked */ -+static int ext3cow_check_descriptors (struct super_block * sb) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ ext3cow_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); -+ ext3cow_fsblk_t last_block; -+ struct ext3cow_group_desc * gdp = NULL; -+ int desc_block = 0; -+ int i; -+ -+ ext3cow_debug ("Checking group descriptors"); -+ -+ for (i = 0; i < sbi->s_groups_count; i++) -+ { -+ if (i == sbi->s_groups_count - 1) -+ last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; -+ else -+ last_block = first_block + -+ (EXT3COW_BLOCKS_PER_GROUP(sb) - 1); -+ -+ if ((i % EXT3COW_DESC_PER_BLOCK(sb)) == 0) -+ gdp = (struct ext3cow_group_desc *) -+ sbi->s_group_desc[desc_block++]->b_data; -+ if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || -+ le32_to_cpu(gdp->bg_block_bitmap) > last_block) -+ { -+ ext3cow_error (sb, "ext3cow_check_descriptors", -+ "Block bitmap for group %d" -+ " not in group (block 
%lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_block_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || -+ le32_to_cpu(gdp->bg_inode_bitmap) > last_block) -+ { -+ ext3cow_error (sb, "ext3cow_check_descriptors", -+ "Inode bitmap for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_bitmap)); -+ return 0; -+ } -+ if (le32_to_cpu(gdp->bg_inode_table) < first_block || -+ le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group > -+ last_block) -+ { -+ ext3cow_error (sb, "ext3cow_check_descriptors", -+ "Inode table for group %d" -+ " not in group (block %lu)!", -+ i, (unsigned long) -+ le32_to_cpu(gdp->bg_inode_table)); -+ return 0; -+ } -+ first_block += EXT3COW_BLOCKS_PER_GROUP(sb); -+ gdp++; -+ } -+ -+ sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3cow_count_free_blocks(sb)); -+ sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3cow_count_free_inodes(sb)); -+ return 1; -+} -+ -+ -+/* ext3cow_orphan_cleanup() walks a singly-linked list of inodes (starting at -+ * the superblock) which were deleted from all directories, but held open by -+ * a process at the time of a crash. We walk the list and try to delete these -+ * inodes at recovery time (only with a read-write filesystem). -+ * -+ * In order to keep the orphan inode chain consistent during traversal (in -+ * case of crash during recovery), we link each inode into the superblock -+ * orphan list_head and handle it the same way as an inode deletion during -+ * normal operation (which journals the operations for us). -+ * -+ * We only do an iget() and an iput() on each inode, which is very safe if we -+ * accidentally point at an in-use or already deleted inode. The worst that -+ * can happen in this case is that we get a "bit already cleared" message from -+ * ext3cow_free_inode(). The only reason we would point at a wrong inode is if -+ * e2fsck was run on this filesystem, and it must have already done the orphan -+ * inode cleanup for us, so we can safely abort without any further action. 
-+ */ -+static void ext3cow_orphan_cleanup (struct super_block * sb, -+ struct ext3cow_super_block * es) -+{ -+ unsigned int s_flags = sb->s_flags; -+ int nr_orphans = 0, nr_truncates = 0; -+#ifdef CONFIG_QUOTA -+ int i; -+#endif -+ if (!es->s_last_orphan) { -+ jbd_debug(4, "no orphan inodes to clean up\n"); -+ return; -+ } -+ -+ if (bdev_read_only(sb->s_bdev)) { -+ printk(KERN_ERR "EXT3COW-fs: write access " -+ "unavailable, skipping orphan cleanup.\n"); -+ return; -+ } -+ -+ if (EXT3COW_SB(sb)->s_mount_state & EXT3COW_ERROR_FS) { -+ if (es->s_last_orphan) -+ jbd_debug(1, "Errors on filesystem, " -+ "clearing orphan list.\n"); -+ es->s_last_orphan = 0; -+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); -+ return; -+ } -+ -+ if (s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3COW-fs: %s: orphan cleanup on readonly fs\n", -+ sb->s_id); -+ sb->s_flags &= ~MS_RDONLY; -+ } -+#ifdef CONFIG_QUOTA -+ /* Needed for iput() to work correctly and not trash data */ -+ sb->s_flags |= MS_ACTIVE; -+ /* Turn on quotas so that they are updated correctly */ -+ for (i = 0; i < MAXQUOTAS; i++) { -+ if (EXT3COW_SB(sb)->s_qf_names[i]) { -+ int ret = ext3cow_quota_on_mount(sb, i); -+ if (ret < 0) -+ printk(KERN_ERR -+ "EXT3COW-fs: Cannot turn on journalled " -+ "quota: error %d\n", ret); -+ } -+ } -+#endif -+ -+ while (es->s_last_orphan) { -+ struct inode *inode; -+ -+ if (!(inode = -+ ext3cow_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { -+ es->s_last_orphan = 0; -+ break; -+ } -+ -+ list_add(&EXT3COW_I(inode)->i_orphan, &EXT3COW_SB(sb)->s_orphan); -+ DQUOT_INIT(inode); -+ if (inode->i_nlink) { -+ printk(KERN_DEBUG -+ "%s: truncating inode %lu to %Ld bytes\n", -+ __FUNCTION__, inode->i_ino, inode->i_size); -+ jbd_debug(2, "truncating inode %lu to %Ld bytes\n", -+ inode->i_ino, inode->i_size); -+ ext3cow_truncate(inode); -+ nr_truncates++; -+ } else { -+ printk(KERN_DEBUG -+ "%s: deleting unreferenced inode %lu\n", -+ __FUNCTION__, inode->i_ino); -+ jbd_debug(2, "deleting unreferenced inode %lu\n", -+ inode->i_ino); -+ nr_orphans++; -+ } -+ iput(inode); /* The delete magic happens here! */ -+ } -+ -+#define PLURAL(x) (x), ((x)==1) ? "" : "s" -+ -+ if (nr_orphans) -+ printk(KERN_INFO "EXT3COW-fs: %s: %d orphan inode%s deleted\n", -+ sb->s_id, PLURAL(nr_orphans)); -+ if (nr_truncates) -+ printk(KERN_INFO "EXT3COW-fs: %s: %d truncate%s cleaned up\n", -+ sb->s_id, PLURAL(nr_truncates)); -+#ifdef CONFIG_QUOTA -+ /* Turn quotas off */ -+ for (i = 0; i < MAXQUOTAS; i++) { -+ if (sb_dqopt(sb)->files[i]) -+ vfs_quota_off(sb, i); -+ } -+#endif -+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */ -+} -+ -+/* -+ * Maximal file size. There is a direct, and {,double-,triple-}indirect -+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. -+ * We need to be 1 filesystem block less than the 2^32 sector limit. -+ */ -+static loff_t ext3cow_max_size(int bits) -+{ -+ loff_t res = EXT3COW_NDIR_BLOCKS; -+ /* This constant is calculated to be the largest file size for a -+ * dense, 4k-blocksize file such that the total number of -+ * sectors in the file, including data and all indirect blocks, -+ * does not exceed 2^32. 
*/ -+ const loff_t upper_limit = 0x1ff7fffd000LL; -+ -+ res += 1LL << (bits-2); -+ res += 1LL << (2*(bits-2)); -+ res += 1LL << (3*(bits-2)); -+ res <<= bits; -+ if (res > upper_limit) -+ res = upper_limit; -+ return res; -+} -+ -+static ext3cow_fsblk_t descriptor_loc(struct super_block *sb, -+ ext3cow_fsblk_t logic_sb_block, -+ int nr) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ unsigned long bg, first_meta_bg; -+ int has_super = 0; -+ -+ first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); -+ -+ if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_META_BG) || -+ nr < first_meta_bg) -+ return (logic_sb_block + nr + 1); -+ bg = sbi->s_desc_per_block * nr; -+ if (ext3cow_bg_has_super(sb, bg)) -+ has_super = 1; -+ return (has_super + ext3cow_group_first_block_no(sb, bg)); -+} -+ -+ -+static int ext3cow_fill_super (struct super_block *sb, void *data, int silent) -+{ -+ struct buffer_head * bh; -+ struct ext3cow_super_block *es = NULL; -+ struct ext3cow_sb_info *sbi; -+ ext3cow_fsblk_t block; -+ ext3cow_fsblk_t sb_block = get_sb_block(&data); -+ ext3cow_fsblk_t logic_sb_block; -+ unsigned long offset = 0; -+ unsigned int journal_inum = 0; -+ unsigned long journal_devnum = 0; -+ unsigned long def_mount_opts; -+ struct inode *root; -+ int blocksize; -+ int hblock; -+ int db_count; -+ int i; -+ int needs_recovery; -+ __le32 features; -+ -+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); -+ if (!sbi) -+ return -ENOMEM; -+ sb->s_fs_info = sbi; -+ sbi->s_mount_opt = 0; -+ sbi->s_resuid = EXT3COW_DEF_RESUID; -+ sbi->s_resgid = EXT3COW_DEF_RESGID; -+ -+ unlock_kernel(); -+ -+ blocksize = sb_min_blocksize(sb, EXT3COW_MIN_BLOCK_SIZE); -+ if (!blocksize) { -+ printk(KERN_ERR "EXT3COW-fs: unable to set blocksize\n"); -+ goto out_fail; -+ } -+ -+ /* -+ * The ext3cow superblock will not be buffer aligned for other than 1kB -+ * block sizes. We need to calculate the offset from buffer start. 
-+ */ -+ if (blocksize != EXT3COW_MIN_BLOCK_SIZE) { -+ logic_sb_block = (sb_block * EXT3COW_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3COW_MIN_BLOCK_SIZE) % blocksize; -+ } else { -+ logic_sb_block = sb_block; -+ } -+ -+ if (!(bh = sb_bread(sb, logic_sb_block))) { -+ printk (KERN_ERR "EXT3COW-fs: unable to read superblock\n"); -+ goto out_fail; -+ } -+ /* -+ * Note: s_es must be initialized as soon as possible because -+ * some ext3cow macro-instructions depend on its value -+ */ -+ es = (struct ext3cow_super_block *) (((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ sb->s_magic = le16_to_cpu(es->s_magic); -+ if (sb->s_magic != EXT3COW_SUPER_MAGIC) -+ goto cantfind_ext3cow; -+ -+ /* Set defaults before we parse the mount options */ -+ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); -+ if (def_mount_opts & EXT3COW_DEFM_DEBUG) -+ set_opt(sbi->s_mount_opt, DEBUG); -+ if (def_mount_opts & EXT3COW_DEFM_BSDGROUPS) -+ set_opt(sbi->s_mount_opt, GRPID); -+ if (def_mount_opts & EXT3COW_DEFM_UID16) -+ set_opt(sbi->s_mount_opt, NO_UID32); -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ if (def_mount_opts & EXT3COW_DEFM_XATTR_USER) -+ set_opt(sbi->s_mount_opt, XATTR_USER); -+#endif -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ if (def_mount_opts & EXT3COW_DEFM_ACL) -+ set_opt(sbi->s_mount_opt, POSIX_ACL); -+#endif -+ if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_DATA) -+ sbi->s_mount_opt |= EXT3COW_MOUNT_JOURNAL_DATA; -+ else if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_ORDERED) -+ sbi->s_mount_opt |= EXT3COW_MOUNT_ORDERED_DATA; -+ else if ((def_mount_opts & EXT3COW_DEFM_JMODE) == EXT3COW_DEFM_JMODE_WBACK) -+ sbi->s_mount_opt |= EXT3COW_MOUNT_WRITEBACK_DATA; -+ -+ if (le16_to_cpu(sbi->s_es->s_errors) == EXT3COW_ERRORS_PANIC) -+ set_opt(sbi->s_mount_opt, ERRORS_PANIC); -+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3COW_ERRORS_RO) -+ set_opt(sbi->s_mount_opt, ERRORS_RO); -+ else -+ set_opt(sbi->s_mount_opt, ERRORS_CONT); -+ -+ sbi->s_resuid = le16_to_cpu(es->s_def_resuid); -+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid); -+ -+ set_opt(sbi->s_mount_opt, RESERVATION); -+ -+ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, -+ NULL, 0)) -+ goto failed_mount; -+ -+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | -+ ((sbi->s_mount_opt & EXT3COW_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); -+ -+ if (le32_to_cpu(es->s_rev_level) == EXT3COW_GOOD_OLD_REV && -+ (EXT3COW_HAS_COMPAT_FEATURE(sb, ~0U) || -+ EXT3COW_HAS_RO_COMPAT_FEATURE(sb, ~0U) || -+ EXT3COW_HAS_INCOMPAT_FEATURE(sb, ~0U))) -+ printk(KERN_WARNING -+ "EXT3COW-fs warning: feature flags set on rev 0 fs, " -+ "running e2fsck is recommended\n"); -+ /* -+ * Check feature flags regardless of the revision level, since we -+ * previously didn't change the revision level when setting the flags, -+ * so there is a chance incompat flags are set on a rev 0 filesystem. 
-+ */ -+ features = EXT3COW_HAS_INCOMPAT_FEATURE(sb, ~EXT3COW_FEATURE_INCOMPAT_SUPP); -+ if (features) { -+ printk(KERN_ERR "EXT3COW-fs: %s: couldn't mount because of " -+ "unsupported optional features (%x).\n", -+ sb->s_id, le32_to_cpu(features)); -+ goto failed_mount; -+ } -+ features = EXT3COW_HAS_RO_COMPAT_FEATURE(sb, ~EXT3COW_FEATURE_RO_COMPAT_SUPP); -+ if (!(sb->s_flags & MS_RDONLY) && features) { -+ printk(KERN_ERR "EXT3COW-fs: %s: couldn't mount RDWR because of " -+ "unsupported optional features (%x).\n", -+ sb->s_id, le32_to_cpu(features)); -+ goto failed_mount; -+ } -+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); -+ -+ if (blocksize < EXT3COW_MIN_BLOCK_SIZE || -+ blocksize > EXT3COW_MAX_BLOCK_SIZE) { -+ printk(KERN_ERR -+ "EXT3COW-fs: Unsupported filesystem blocksize %d on %s.\n", -+ blocksize, sb->s_id); -+ goto failed_mount; -+ } -+ -+ hblock = bdev_hardsect_size(sb->s_bdev); -+ if (sb->s_blocksize != blocksize) { -+ /* -+ * Make sure the blocksize for the filesystem is larger -+ * than the hardware sectorsize for the machine. -+ */ -+ if (blocksize < hblock) { -+ printk(KERN_ERR "EXT3COW-fs: blocksize %d too small for " -+ "device blocksize %d.\n", blocksize, hblock); -+ goto failed_mount; -+ } -+ -+ brelse (bh); -+ sb_set_blocksize(sb, blocksize); -+ logic_sb_block = (sb_block * EXT3COW_MIN_BLOCK_SIZE) / blocksize; -+ offset = (sb_block * EXT3COW_MIN_BLOCK_SIZE) % blocksize; -+ bh = sb_bread(sb, logic_sb_block); -+ if (!bh) { -+ printk(KERN_ERR -+ "EXT3COW-fs: Can't read superblock on 2nd try.\n"); -+ goto failed_mount; -+ } -+ es = (struct ext3cow_super_block *)(((char *)bh->b_data) + offset); -+ sbi->s_es = es; -+ if (es->s_magic != cpu_to_le16(EXT3COW_SUPER_MAGIC)) { -+ printk (KERN_ERR -+ "EXT3COW-fs: Magic mismatch, very weird !\n"); -+ goto failed_mount; -+ } -+ } -+ -+ sb->s_maxbytes = ext3cow_max_size(sb->s_blocksize_bits); -+ -+ if (le32_to_cpu(es->s_rev_level) == EXT3COW_GOOD_OLD_REV) { -+ sbi->s_inode_size = EXT3COW_GOOD_OLD_INODE_SIZE; -+ sbi->s_first_ino = EXT3COW_GOOD_OLD_FIRST_INO; -+ } else { -+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size); -+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino); -+ if ((sbi->s_inode_size < EXT3COW_GOOD_OLD_INODE_SIZE) || -+ (sbi->s_inode_size & (sbi->s_inode_size - 1)) || -+ (sbi->s_inode_size > blocksize)) { -+ printk (KERN_ERR -+ "EXT3COW-fs: unsupported inode size: %d\n", -+ sbi->s_inode_size); -+ goto failed_mount; -+ } -+ } -+ sbi->s_frag_size = EXT3COW_MIN_FRAG_SIZE << -+ le32_to_cpu(es->s_log_frag_size); -+ if (blocksize != sbi->s_frag_size) { -+ printk(KERN_ERR -+ "EXT3COW-fs: fragsize %lu != blocksize %u (unsupported)\n", -+ sbi->s_frag_size, blocksize); -+ goto failed_mount; -+ } -+ sbi->s_frags_per_block = 1; -+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); -+ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); -+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); -+ if (EXT3COW_INODE_SIZE(sb) == 0) -+ goto cantfind_ext3cow; -+ sbi->s_inodes_per_block = blocksize / EXT3COW_INODE_SIZE(sb); -+ if (sbi->s_inodes_per_block == 0) -+ goto cantfind_ext3cow; -+ sbi->s_itb_per_group = sbi->s_inodes_per_group / -+ sbi->s_inodes_per_block; -+ sbi->s_desc_per_block = blocksize / sizeof(struct ext3cow_group_desc); -+ sbi->s_sbh = bh; -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ sbi->s_addr_per_block_bits = ilog2(EXT3COW_ADDR_PER_BLOCK(sb)); -+ sbi->s_desc_per_block_bits = ilog2(EXT3COW_DESC_PER_BLOCK(sb)); -+ for (i=0; i < 4; i++) -+ sbi->s_hash_seed[i] = 
le32_to_cpu(es->s_hash_seed[i]); -+ sbi->s_def_hash_version = es->s_def_hash_version; -+ -+ /* Epoch number for versioning -znjp */ -+ sbi->s_epoch_number = le32_to_cpu(es->s_epoch_number); -+ printk(KERN_INFO "EXT3COW-fs: System epoch number: %u\n", -+ sbi->s_epoch_number); -+ -+ if (sbi->s_blocks_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3COW-fs: #blocks per group too big: %lu\n", -+ sbi->s_blocks_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_frags_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3COW-fs: #fragments per group too big: %lu\n", -+ sbi->s_frags_per_group); -+ goto failed_mount; -+ } -+ if (sbi->s_inodes_per_group > blocksize * 8) { -+ printk (KERN_ERR -+ "EXT3COW-fs: #inodes per group too big: %lu\n", -+ sbi->s_inodes_per_group); -+ goto failed_mount; -+ } -+ -+ if (le32_to_cpu(es->s_blocks_count) > -+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -+ printk(KERN_ERR "EXT3COW-fs: filesystem on %s:" -+ " too large to mount safely\n", sb->s_id); -+ if (sizeof(sector_t) < 8) -+ printk(KERN_WARNING "EXT3COW-fs: CONFIG_LBD not " -+ "enabled\n"); -+ goto failed_mount; -+ } -+ -+ if (EXT3COW_BLOCKS_PER_GROUP(sb) == 0) -+ goto cantfind_ext3cow; -+ sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - -+ le32_to_cpu(es->s_first_data_block) - 1) -+ / EXT3COW_BLOCKS_PER_GROUP(sb)) + 1; -+ db_count = (sbi->s_groups_count + EXT3COW_DESC_PER_BLOCK(sb) - 1) / -+ EXT3COW_DESC_PER_BLOCK(sb); -+ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), -+ GFP_KERNEL); -+ if (sbi->s_group_desc == NULL) { -+ printk (KERN_ERR "EXT3COW-fs: not enough memory\n"); -+ goto failed_mount; -+ } -+ -+ bgl_lock_init(&sbi->s_blockgroup_lock); -+ -+ for (i = 0; i < db_count; i++) { -+ block = descriptor_loc(sb, logic_sb_block, i); -+ sbi->s_group_desc[i] = sb_bread(sb, block); -+ if (!sbi->s_group_desc[i]) { -+ printk (KERN_ERR "EXT3COW-fs: " -+ "can't read group descriptor %d\n", i); -+ db_count = i; -+ goto failed_mount2; -+ } -+ } -+ if (!ext3cow_check_descriptors (sb)) { -+ printk(KERN_ERR "EXT3COW-fs: group descriptors corrupted!\n"); -+ goto failed_mount2; -+ } -+ sbi->s_gdb_count = db_count; -+ get_random_bytes(&sbi->s_next_generation, sizeof(u32)); -+ spin_lock_init(&sbi->s_next_gen_lock); -+ -+ percpu_counter_init(&sbi->s_freeblocks_counter, -+ ext3cow_count_free_blocks(sb)); -+ percpu_counter_init(&sbi->s_freeinodes_counter, -+ ext3cow_count_free_inodes(sb)); -+ percpu_counter_init(&sbi->s_dirs_counter, -+ ext3cow_count_dirs(sb)); -+ -+ /* per fileystem reservation list head & lock */ -+ spin_lock_init(&sbi->s_rsv_window_lock); -+ sbi->s_rsv_window_root = RB_ROOT; -+ /* Add a single, static dummy reservation to the start of the -+ * reservation window list --- it gives us a placeholder for -+ * append-at-start-of-list which makes the allocation logic -+ * _much_ simpler. 
*/ -+ sbi->s_rsv_window_head.rsv_start = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ sbi->s_rsv_window_head.rsv_end = EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED; -+ sbi->s_rsv_window_head.rsv_alloc_hit = 0; -+ sbi->s_rsv_window_head.rsv_goal_size = 0; -+ ext3cow_rsv_window_add(sb, &sbi->s_rsv_window_head); -+ -+ /* -+ * set up enough so that it can read an inode -+ */ -+ sb->s_op = &ext3cow_sops; -+ sb->s_export_op = &ext3cow_export_ops; -+ sb->s_xattr = ext3cow_xattr_handlers; -+#ifdef CONFIG_QUOTA -+ sb->s_qcop = &ext3cow_qctl_operations; -+ sb->dq_op = &ext3cow_quota_operations; -+#endif -+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ -+ -+ sb->s_root = NULL; -+ -+ needs_recovery = (es->s_last_orphan != 0 || -+ EXT3COW_HAS_INCOMPAT_FEATURE(sb, -+ EXT3COW_FEATURE_INCOMPAT_RECOVER)); -+ -+ /* -+ * The first inode we look at is the journal inode. Don't try -+ * root first: it may be modified in the journal! -+ */ -+ if (!test_opt(sb, NOLOAD) && -+ EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_HAS_JOURNAL)) { -+ if (ext3cow_load_journal(sb, es, journal_devnum)) -+ goto failed_mount3; -+ } else if (journal_inum) { -+ if (ext3cow_create_journal(sb, es, journal_inum)) -+ goto failed_mount3; -+ } else { -+ if (!silent) -+ printk (KERN_ERR -+ "ext3cow: No journal on filesystem on %s\n", -+ sb->s_id); -+ goto failed_mount3; -+ } -+ -+ /* We have now updated the journal if required, so we can -+ * validate the data journaling mode. */ -+ switch (test_opt(sb, DATA_FLAGS)) { -+ case 0: -+ /* No mode set, assume a default based on the journal -+ capabilities: ORDERED_DATA if the journal can -+ cope, else JOURNAL_DATA */ -+ if (journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) -+ set_opt(sbi->s_mount_opt, ORDERED_DATA); -+ else -+ set_opt(sbi->s_mount_opt, JOURNAL_DATA); -+ break; -+ -+ case EXT3COW_MOUNT_ORDERED_DATA: -+ case EXT3COW_MOUNT_WRITEBACK_DATA: -+ if (!journal_check_available_features -+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { -+ printk(KERN_ERR "EXT3COW-fs: Journal does not support " -+ "requested data journaling mode\n"); -+ goto failed_mount4; -+ } -+ default: -+ break; -+ } -+ -+ if (test_opt(sb, NOBH)) { -+ if (!(test_opt(sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA)) { -+ printk(KERN_WARNING "EXT3COW-fs: Ignoring nobh option - " -+ "its supported only with writeback mode\n"); -+ clear_opt(sbi->s_mount_opt, NOBH); -+ } -+ } -+ /* -+ * The journal_load will have done any necessary log recovery, -+ * so we can safely mount the rest of the filesystem now. -+ */ -+ -+ root = iget(sb, EXT3COW_ROOT_INO); -+ sb->s_root = d_alloc_root(root); -+ if (!sb->s_root) { -+ printk(KERN_ERR "EXT3COW-fs: get root inode failed\n"); -+ iput(root); -+ goto failed_mount4; -+ } -+ if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { -+ dput(sb->s_root); -+ sb->s_root = NULL; -+ printk(KERN_ERR "EXT3COW-fs: corrupt root inode, run e2fsck\n"); -+ goto failed_mount4; -+ } -+ -+ ext3cow_setup_super (sb, es, sb->s_flags & MS_RDONLY); -+ /* -+ * akpm: core read_super() calls in here with the superblock locked. -+ * That deadlocks, because orphan cleanup needs to lock the superblock -+ * in numerous places. Here we just pop the lock - it's relatively -+ * harmless, because we are now ready to accept write_super() requests, -+ * and aviro says that's the only reason for hanging onto the -+ * superblock lock. 
-+ */ -+ EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ORPHAN_FS; -+ ext3cow_orphan_cleanup(sb, es); -+ EXT3COW_SB(sb)->s_mount_state &= ~EXT3COW_ORPHAN_FS; -+ if (needs_recovery) -+ printk (KERN_INFO "EXT3COW-fs: recovery complete.\n"); -+ ext3cow_mark_recovery_complete(sb, es); -+ printk (KERN_INFO "EXT3COW-fs: mounted filesystem with %s data mode.\n", -+ test_opt(sb,DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA ? "journal": -+ test_opt(sb,DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA ? "ordered": -+ "writeback"); -+ -+ lock_kernel(); -+ return 0; -+ -+cantfind_ext3cow: -+ if (!silent) -+ printk(KERN_ERR "VFS: Can't find ext3cow filesystem on dev %s.\n", -+ sb->s_id); -+ goto failed_mount; -+ -+failed_mount4: -+ journal_destroy(sbi->s_journal); -+failed_mount3: -+ percpu_counter_destroy(&sbi->s_freeblocks_counter); -+ percpu_counter_destroy(&sbi->s_freeinodes_counter); -+ percpu_counter_destroy(&sbi->s_dirs_counter); -+failed_mount2: -+ for (i = 0; i < db_count; i++) -+ brelse(sbi->s_group_desc[i]); -+ kfree(sbi->s_group_desc); -+failed_mount: -+#ifdef CONFIG_QUOTA -+ for (i = 0; i < MAXQUOTAS; i++) -+ kfree(sbi->s_qf_names[i]); -+#endif -+ ext3cow_blkdev_remove(sbi); -+ brelse(bh); -+out_fail: -+ sb->s_fs_info = NULL; -+ kfree(sbi); -+ lock_kernel(); -+ return -EINVAL; -+} -+ -+/* -+ * Setup any per-fs journal parameters now. We'll do this both on -+ * initial mount, once the journal has been initialised but before we've -+ * done any recovery; and again on any subsequent remount. -+ */ -+static void ext3cow_init_journal_params(struct super_block *sb, journal_t *journal) -+{ -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ -+ if (sbi->s_commit_interval) -+ journal->j_commit_interval = sbi->s_commit_interval; -+ /* We could also set up an ext3cow-specific default for the commit -+ * interval here, but for now we'll just fall back to the jbd -+ * default. */ -+ -+ spin_lock(&journal->j_state_lock); -+ if (test_opt(sb, BARRIER)) -+ journal->j_flags |= JFS_BARRIER; -+ else -+ journal->j_flags &= ~JFS_BARRIER; -+ spin_unlock(&journal->j_state_lock); -+} -+ -+static journal_t *ext3cow_get_journal(struct super_block *sb, -+ unsigned int journal_inum) -+{ -+ struct inode *journal_inode; -+ journal_t *journal; -+ -+ /* First, test for the existence of a valid inode on disk. Bad -+ * things happen if we iget() an unused inode, as the subsequent -+ * iput() will try to delete it. 
*/ -+ -+ journal_inode = iget(sb, journal_inum); -+ if (!journal_inode) { -+ printk(KERN_ERR "EXT3COW-fs: no journal found.\n"); -+ return NULL; -+ } -+ if (!journal_inode->i_nlink) { -+ make_bad_inode(journal_inode); -+ iput(journal_inode); -+ printk(KERN_ERR "EXT3COW-fs: journal inode is deleted.\n"); -+ return NULL; -+ } -+ -+ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", -+ journal_inode, journal_inode->i_size); -+ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { -+ printk(KERN_ERR "EXT3COW-fs: invalid journal inode.\n"); -+ iput(journal_inode); -+ return NULL; -+ } -+ -+ journal = journal_init_inode(journal_inode); -+ if (!journal) { -+ printk(KERN_ERR "EXT3COW-fs: Could not load journal inode\n"); -+ iput(journal_inode); -+ return NULL; -+ } -+ /* Make sure the journal never gets versioned -znjp */ -+ EXT3COW_I(journal_inode)->i_flags |= EXT3COW_UNVERSIONABLE_FL; -+ journal->j_private = sb; -+ ext3cow_init_journal_params(sb, journal); -+ return journal; -+} -+ -+static journal_t *ext3cow_get_dev_journal(struct super_block *sb, -+ dev_t j_dev) -+{ -+ struct buffer_head * bh; -+ journal_t *journal; -+ ext3cow_fsblk_t start; -+ ext3cow_fsblk_t len; -+ int hblock, blocksize; -+ ext3cow_fsblk_t sb_block; -+ unsigned long offset; -+ struct ext3cow_super_block * es; -+ struct block_device *bdev; -+ -+ bdev = ext3cow_blkdev_get(j_dev); -+ if (bdev == NULL) -+ return NULL; -+ -+ if (bd_claim(bdev, sb)) { -+ printk(KERN_ERR -+ "EXT3COW: failed to claim external journal device.\n"); -+ blkdev_put(bdev); -+ return NULL; -+ } -+ -+ blocksize = sb->s_blocksize; -+ hblock = bdev_hardsect_size(bdev); -+ if (blocksize < hblock) { -+ printk(KERN_ERR -+ "EXT3COW-fs: blocksize too small for journal device.\n"); -+ goto out_bdev; -+ } -+ -+ sb_block = EXT3COW_MIN_BLOCK_SIZE / blocksize; -+ offset = EXT3COW_MIN_BLOCK_SIZE % blocksize; -+ set_blocksize(bdev, blocksize); -+ if (!(bh = __bread(bdev, sb_block, blocksize))) { -+ printk(KERN_ERR "EXT3COW-fs: couldn't read superblock of " -+ "external journal\n"); -+ goto out_bdev; -+ } -+ -+ es = (struct ext3cow_super_block *) (((char *)bh->b_data) + offset); -+ if ((le16_to_cpu(es->s_magic) != EXT3COW_SUPER_MAGIC) || -+ !(le32_to_cpu(es->s_feature_incompat) & -+ EXT3COW_FEATURE_INCOMPAT_JOURNAL_DEV)) { -+ printk(KERN_ERR "EXT3COW-fs: external journal has " -+ "bad superblock\n"); -+ brelse(bh); -+ goto out_bdev; -+ } -+ -+ if (memcmp(EXT3COW_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { -+ printk(KERN_ERR "EXT3COW-fs: journal UUID does not match\n"); -+ brelse(bh); -+ goto out_bdev; -+ } -+ -+ len = le32_to_cpu(es->s_blocks_count); -+ start = sb_block + 1; -+ brelse(bh); /* we're done with the superblock */ -+ -+ journal = journal_init_dev(bdev, sb->s_bdev, -+ start, len, blocksize); -+ if (!journal) { -+ printk(KERN_ERR "EXT3COW-fs: failed to create device journal\n"); -+ goto out_bdev; -+ } -+ journal->j_private = sb; -+ ll_rw_block(READ, 1, &journal->j_sb_buffer); -+ wait_on_buffer(journal->j_sb_buffer); -+ if (!buffer_uptodate(journal->j_sb_buffer)) { -+ printk(KERN_ERR "EXT3COW-fs: I/O error on journal device\n"); -+ goto out_journal; -+ } -+ if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { -+ printk(KERN_ERR "EXT3COW-fs: External journal has more than one " -+ "user (unsupported) - %d\n", -+ be32_to_cpu(journal->j_superblock->s_nr_users)); -+ goto out_journal; -+ } -+ EXT3COW_SB(sb)->journal_bdev = bdev; -+ ext3cow_init_journal_params(sb, journal); -+ return journal; -+out_journal: -+ journal_destroy(journal); 
-+out_bdev: -+ ext3cow_blkdev_put(bdev); -+ return NULL; -+} -+ -+static int ext3cow_load_journal(struct super_block *sb, -+ struct ext3cow_super_block *es, -+ unsigned long journal_devnum) -+{ -+ journal_t *journal; -+ unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); -+ dev_t journal_dev; -+ int err = 0; -+ int really_read_only; -+ -+ if (journal_devnum && -+ journal_devnum != le32_to_cpu(es->s_journal_dev)) { -+ printk(KERN_INFO "EXT3COW-fs: external journal device major/minor " -+ "numbers have changed\n"); -+ journal_dev = new_decode_dev(journal_devnum); -+ } else -+ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); -+ -+ really_read_only = bdev_read_only(sb->s_bdev); -+ -+ /* -+ * Are we loading a blank journal or performing recovery after a -+ * crash? For recovery, we need to check in advance whether we -+ * can get read-write access to the device. -+ */ -+ -+ if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER)) { -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_INFO "EXT3COW-fs: INFO: recovery " -+ "required on readonly filesystem.\n"); -+ if (really_read_only) { -+ printk(KERN_ERR "EXT3COW-fs: write access " -+ "unavailable, cannot proceed.\n"); -+ return -EROFS; -+ } -+ printk (KERN_INFO "EXT3COW-fs: write access will " -+ "be enabled during recovery.\n"); -+ } -+ } -+ -+ if (journal_inum && journal_dev) { -+ printk(KERN_ERR "EXT3COW-fs: filesystem has both journal " -+ "and inode journals!\n"); -+ return -EINVAL; -+ } -+ -+ if (journal_inum) { -+ if (!(journal = ext3cow_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ } else { -+ if (!(journal = ext3cow_get_dev_journal(sb, journal_dev))) -+ return -EINVAL; -+ } -+ -+ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { -+ err = journal_update_format(journal); -+ if (err) { -+ printk(KERN_ERR "EXT3COW-fs: error updating journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ } -+ -+ if (!EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER)) -+ err = journal_wipe(journal, !really_read_only); -+ if (!err) -+ err = journal_load(journal); -+ -+ if (err) { -+ printk(KERN_ERR "EXT3COW-fs: error loading journal.\n"); -+ journal_destroy(journal); -+ return err; -+ } -+ -+ EXT3COW_SB(sb)->s_journal = journal; -+ ext3cow_clear_journal_err(sb, es); -+ -+ if (journal_devnum && -+ journal_devnum != le32_to_cpu(es->s_journal_dev)) { -+ es->s_journal_dev = cpu_to_le32(journal_devnum); -+ sb->s_dirt = 1; -+ -+ /* Make sure we flush the recovery flag to disk. 
*/ -+ ext3cow_commit_super(sb, es, 1); -+ } -+ -+ return 0; -+} -+ -+static int ext3cow_create_journal(struct super_block * sb, -+ struct ext3cow_super_block * es, -+ unsigned int journal_inum) -+{ -+ journal_t *journal; -+ -+ if (sb->s_flags & MS_RDONLY) { -+ printk(KERN_ERR "EXT3COW-fs: readonly filesystem when trying to " -+ "create journal.\n"); -+ return -EROFS; -+ } -+ -+ if (!(journal = ext3cow_get_journal(sb, journal_inum))) -+ return -EINVAL; -+ -+ printk(KERN_INFO "EXT3COW-fs: creating new journal on inode %u\n", -+ journal_inum); -+ -+ if (journal_create(journal)) { -+ printk(KERN_ERR "EXT3COW-fs: error creating journal.\n"); -+ journal_destroy(journal); -+ return -EIO; -+ } -+ -+ EXT3COW_SB(sb)->s_journal = journal; -+ -+ ext3cow_update_dynamic_rev(sb); -+ EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ EXT3COW_SET_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_HAS_JOURNAL); -+ -+ es->s_journal_inum = cpu_to_le32(journal_inum); -+ sb->s_dirt = 1; -+ -+ /* Make sure we flush the recovery flag to disk. */ -+ ext3cow_commit_super(sb, es, 1); -+ -+ return 0; -+} -+ -+static void ext3cow_commit_super (struct super_block * sb, -+ struct ext3cow_super_block * es, -+ int sync) -+{ -+ struct buffer_head *sbh = EXT3COW_SB(sb)->s_sbh; -+ -+ if (!sbh) -+ return; -+ es->s_wtime = cpu_to_le32(get_seconds()); -+ es->s_free_blocks_count = cpu_to_le32(ext3cow_count_free_blocks(sb)); -+ es->s_free_inodes_count = cpu_to_le32(ext3cow_count_free_inodes(sb)); -+ BUFFER_TRACE(sbh, "marking dirty"); -+ mark_buffer_dirty(sbh); -+ if (sync) -+ sync_dirty_buffer(sbh); -+} -+ -+ -+/* -+ * Have we just finished recovery? If so, and if we are mounting (or -+ * remounting) the filesystem readonly, then we will end up with a -+ * consistent fs on disk. Record that fact. -+ */ -+static void ext3cow_mark_recovery_complete(struct super_block * sb, -+ struct ext3cow_super_block * es) -+{ -+ journal_t *journal = EXT3COW_SB(sb)->s_journal; -+ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ if (EXT3COW_HAS_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER) && -+ sb->s_flags & MS_RDONLY) { -+ EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ sb->s_dirt = 0; -+ ext3cow_commit_super(sb, es, 1); -+ } -+ journal_unlock_updates(journal); -+} -+ -+/* -+ * If we are mounting (or read-write remounting) a filesystem whose journal -+ * has recorded an error from a previous lifetime, move that error to the -+ * main filesystem now. -+ */ -+static void ext3cow_clear_journal_err(struct super_block * sb, -+ struct ext3cow_super_block * es) -+{ -+ journal_t *journal; -+ int j_errno; -+ const char *errstr; -+ -+ journal = EXT3COW_SB(sb)->s_journal; -+ -+ /* -+ * Now check for any error status which may have been recorded in the -+ * journal by a prior ext3cow_error() or ext3cow_abort() -+ */ -+ -+ j_errno = journal_errno(journal); -+ if (j_errno) { -+ char nbuf[16]; -+ -+ errstr = ext3cow_decode_error(sb, j_errno, nbuf); -+ ext3cow_warning(sb, __FUNCTION__, "Filesystem error recorded " -+ "from previous mount: %s", errstr); -+ ext3cow_warning(sb, __FUNCTION__, "Marking fs in need of " -+ "filesystem check."); -+ -+ EXT3COW_SB(sb)->s_mount_state |= EXT3COW_ERROR_FS; -+ es->s_state |= cpu_to_le16(EXT3COW_ERROR_FS); -+ ext3cow_commit_super (sb, es, 1); -+ -+ journal_clear_err(journal); -+ } -+} -+ -+/* -+ * Force the running and committing transactions to commit, -+ * and wait on the commit. 
-+ */ -+int ext3cow_force_commit(struct super_block *sb) -+{ -+ journal_t *journal; -+ int ret; -+ -+ if (sb->s_flags & MS_RDONLY) -+ return 0; -+ -+ journal = EXT3COW_SB(sb)->s_journal; -+ sb->s_dirt = 0; -+ ret = ext3cow_journal_force_commit(journal); -+ return ret; -+} -+ -+/* -+ * Ext3 always journals updates to the superblock itself, so we don't -+ * have to propagate any other updates to the superblock on disk at this -+ * point. Just start an async writeback to get the buffers on their way -+ * to the disk. -+ * -+ * This implicitly triggers the writebehind on sync(). -+ */ -+ -+static void ext3cow_write_super (struct super_block * sb) -+{ -+ if (mutex_trylock(&sb->s_lock) != 0) -+ BUG(); -+ sb->s_dirt = 0; -+} -+ -+static int ext3cow_sync_fs(struct super_block *sb, int wait) -+{ -+ tid_t target; -+ -+ sb->s_dirt = 0; -+ if (journal_start_commit(EXT3COW_SB(sb)->s_journal, &target)) { -+ if (wait) -+ log_wait_commit(EXT3COW_SB(sb)->s_journal, target); -+ } -+ return 0; -+} -+ -+/* -+ * LVM calls this function before a (read-only) snapshot is created. This -+ * gives us a chance to flush the journal completely and mark the fs clean. -+ */ -+static void ext3cow_write_super_lockfs(struct super_block *sb) -+{ -+ sb->s_dirt = 0; -+ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ journal_t *journal = EXT3COW_SB(sb)->s_journal; -+ -+ /* Now we set up the journal barrier. */ -+ journal_lock_updates(journal); -+ journal_flush(journal); -+ -+ /* Journal blocked and flushed, clear needs_recovery flag. */ -+ EXT3COW_CLEAR_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ ext3cow_commit_super(sb, EXT3COW_SB(sb)->s_es, 1); -+ } -+} -+ -+/* -+ * Called by LVM after the snapshot is done. We need to reset the RECOVER -+ * flag here, even though the filesystem is not technically dirty yet. -+ */ -+static void ext3cow_unlockfs(struct super_block *sb) -+{ -+ if (!(sb->s_flags & MS_RDONLY)) { -+ lock_super(sb); -+ /* Reser the needs_recovery flag before the fs is unlocked. */ -+ EXT3COW_SET_INCOMPAT_FEATURE(sb, EXT3COW_FEATURE_INCOMPAT_RECOVER); -+ ext3cow_commit_super(sb, EXT3COW_SB(sb)->s_es, 1); -+ unlock_super(sb); -+ journal_unlock_updates(EXT3COW_SB(sb)->s_journal); -+ } -+} -+ -+static int ext3cow_remount (struct super_block * sb, int * flags, char * data) -+{ -+ struct ext3cow_super_block * es; -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ ext3cow_fsblk_t n_blocks_count = 0; -+ unsigned long old_sb_flags; -+ struct ext3cow_mount_options old_opts; -+ int err; -+#ifdef CONFIG_QUOTA -+ int i; -+#endif -+ -+ /* Store the original options */ -+ old_sb_flags = sb->s_flags; -+ old_opts.s_mount_opt = sbi->s_mount_opt; -+ old_opts.s_resuid = sbi->s_resuid; -+ old_opts.s_resgid = sbi->s_resgid; -+ old_opts.s_commit_interval = sbi->s_commit_interval; -+#ifdef CONFIG_QUOTA -+ old_opts.s_jquota_fmt = sbi->s_jquota_fmt; -+ for (i = 0; i < MAXQUOTAS; i++) -+ old_opts.s_qf_names[i] = sbi->s_qf_names[i]; -+#endif -+ -+ /* -+ * Allow the "check" option to be passed as a remount option. -+ */ -+ if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { -+ err = -EINVAL; -+ goto restore_opts; -+ } -+ -+ if (sbi->s_mount_opt & EXT3COW_MOUNT_ABORT) -+ ext3cow_abort(sb, __FUNCTION__, "Abort forced by user"); -+ -+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | -+ ((sbi->s_mount_opt & EXT3COW_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); -+ -+ es = sbi->s_es; -+ -+ ext3cow_init_journal_params(sb, sbi->s_journal); -+ -+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || -+ n_blocks_count > le32_to_cpu(es->s_blocks_count)) { -+ if (sbi->s_mount_opt & EXT3COW_MOUNT_ABORT) { -+ err = -EROFS; -+ goto restore_opts; -+ } -+ -+ if (*flags & MS_RDONLY) { -+ /* -+ * First of all, the unconditional stuff we have to do -+ * to disable replay of the journal when we next remount -+ */ -+ sb->s_flags |= MS_RDONLY; -+ -+ /* -+ * OK, test if we are remounting a valid rw partition -+ * readonly, and if so set the rdonly flag and then -+ * mark the partition as valid again. -+ */ -+ if (!(es->s_state & cpu_to_le16(EXT3COW_VALID_FS)) && -+ (sbi->s_mount_state & EXT3COW_VALID_FS)) -+ es->s_state = cpu_to_le16(sbi->s_mount_state); -+ -+ ext3cow_mark_recovery_complete(sb, es); -+ } else { -+ __le32 ret; -+ if ((ret = EXT3COW_HAS_RO_COMPAT_FEATURE(sb, -+ ~EXT3COW_FEATURE_RO_COMPAT_SUPP))) { -+ printk(KERN_WARNING "EXT3COW-fs: %s: couldn't " -+ "remount RDWR because of unsupported " -+ "optional features (%x).\n", -+ sb->s_id, le32_to_cpu(ret)); -+ err = -EROFS; -+ goto restore_opts; -+ } -+ /* -+ * Mounting a RDONLY partition read-write, so reread -+ * and store the current valid flag. (It may have -+ * been changed by e2fsck since we originally mounted -+ * the partition.) -+ */ -+ ext3cow_clear_journal_err(sb, es); -+ sbi->s_mount_state = le16_to_cpu(es->s_state); -+ if ((err = ext3cow_group_extend(sb, es, n_blocks_count))) -+ goto restore_opts; -+ if (!ext3cow_setup_super (sb, es, 0)) -+ sb->s_flags &= ~MS_RDONLY; -+ } -+ } -+#ifdef CONFIG_QUOTA -+ /* Release old quota file names */ -+ for (i = 0; i < MAXQUOTAS; i++) -+ if (old_opts.s_qf_names[i] && -+ old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -+ kfree(old_opts.s_qf_names[i]); -+#endif -+ return 0; -+restore_opts: -+ sb->s_flags = old_sb_flags; -+ sbi->s_mount_opt = old_opts.s_mount_opt; -+ sbi->s_resuid = old_opts.s_resuid; -+ sbi->s_resgid = old_opts.s_resgid; -+ sbi->s_commit_interval = old_opts.s_commit_interval; -+#ifdef CONFIG_QUOTA -+ sbi->s_jquota_fmt = old_opts.s_jquota_fmt; -+ for (i = 0; i < MAXQUOTAS; i++) { -+ if (sbi->s_qf_names[i] && -+ old_opts.s_qf_names[i] != sbi->s_qf_names[i]) -+ kfree(sbi->s_qf_names[i]); -+ sbi->s_qf_names[i] = old_opts.s_qf_names[i]; -+ } -+#endif -+ return err; -+} -+ -+static int ext3cow_statfs (struct dentry * dentry, struct kstatfs * buf) -+{ -+ struct super_block *sb = dentry->d_sb; -+ struct ext3cow_sb_info *sbi = EXT3COW_SB(sb); -+ struct ext3cow_super_block *es = sbi->s_es; -+ ext3cow_fsblk_t overhead; -+ int i; -+ u64 fsid; -+ -+ if (test_opt (sb, MINIX_DF)) -+ overhead = 0; -+ else { -+ unsigned long ngroups; -+ ngroups = EXT3COW_SB(sb)->s_groups_count; -+ smp_rmb(); -+ -+ /* -+ * Compute the overhead (FS structures) -+ */ -+ -+ /* -+ * All of the blocks before first_data_block are -+ * overhead -+ */ -+ overhead = le32_to_cpu(es->s_first_data_block); -+ -+ /* -+ * Add the overhead attributed to the superblock and -+ * block group descriptors. If the sparse superblocks -+ * feature is turned on, then not all groups have this. -+ */ -+ for (i = 0; i < ngroups; i++) { -+ overhead += ext3cow_bg_has_super(sb, i) + -+ ext3cow_bg_num_gdb(sb, i); -+ cond_resched(); -+ } -+ -+ /* -+ * Every block group has an inode bitmap, a block -+ * bitmap, and an inode table. 
-+ */ -+ overhead += (ngroups * (2 + EXT3COW_SB(sb)->s_itb_per_group)); -+ } -+ -+ buf->f_type = EXT3COW_SUPER_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; -+ buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter); -+ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); -+ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) -+ buf->f_bavail = 0; -+ buf->f_files = le32_to_cpu(es->s_inodes_count); -+ buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter); -+ buf->f_namelen = EXT3COW_NAME_LEN; -+ fsid = le64_to_cpup((void *)es->s_uuid) ^ -+ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); -+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; -+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; -+ return 0; -+} -+ -+/* Helper function for writing quotas on sync - we need to start transaction before quota file -+ * is locked for write. Otherwise the are possible deadlocks: -+ * Process 1 Process 2 -+ * ext3cow_create() quota_sync() -+ * journal_start() write_dquot() -+ * DQUOT_INIT() down(dqio_mutex) -+ * down(dqio_mutex) journal_start() -+ * -+ */ -+ -+#ifdef CONFIG_QUOTA -+ -+static inline struct inode *dquot_to_inode(struct dquot *dquot) -+{ -+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; -+} -+ -+static int ext3cow_dquot_initialize(struct inode *inode, int type) -+{ -+ handle_t *handle; -+ int ret, err; -+ -+ /* We may create quota structure so we need to reserve enough blocks */ -+ handle = ext3cow_journal_start(inode, 2*EXT3COW_QUOTA_INIT_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_initialize(inode, type); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext3cow_dquot_drop(struct inode *inode) -+{ -+ handle_t *handle; -+ int ret, err; -+ -+ /* We may delete quota structure so we need to reserve enough blocks */ -+ handle = ext3cow_journal_start(inode, 2*EXT3COW_QUOTA_DEL_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_drop(inode); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext3cow_write_dquot(struct dquot *dquot) -+{ -+ int ret, err; -+ handle_t *handle; -+ struct inode *inode; -+ -+ inode = dquot_to_inode(dquot); -+ handle = ext3cow_journal_start(inode, -+ EXT3COW_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_commit(dquot); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext3cow_acquire_dquot(struct dquot *dquot) -+{ -+ int ret, err; -+ handle_t *handle; -+ -+ handle = ext3cow_journal_start(dquot_to_inode(dquot), -+ EXT3COW_QUOTA_INIT_BLOCKS(dquot->dq_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_acquire(dquot); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext3cow_release_dquot(struct dquot *dquot) -+{ -+ int ret, err; -+ handle_t *handle; -+ -+ handle = ext3cow_journal_start(dquot_to_inode(dquot), -+ EXT3COW_QUOTA_DEL_BLOCKS(dquot->dq_sb)); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_release(dquot); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+static int ext3cow_mark_dquot_dirty(struct dquot *dquot) -+{ -+ /* Are we journalling quotas? 
*/ -+ if (EXT3COW_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || -+ EXT3COW_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { -+ dquot_mark_dquot_dirty(dquot); -+ return ext3cow_write_dquot(dquot); -+ } else { -+ return dquot_mark_dquot_dirty(dquot); -+ } -+} -+ -+static int ext3cow_write_info(struct super_block *sb, int type) -+{ -+ int ret, err; -+ handle_t *handle; -+ -+ /* Data block + inode block */ -+ handle = ext3cow_journal_start(sb->s_root->d_inode, 2); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ ret = dquot_commit_info(sb, type); -+ err = ext3cow_journal_stop(handle); -+ if (!ret) -+ ret = err; -+ return ret; -+} -+ -+/* -+ * Turn on quotas during mount time - we need to find -+ * the quota file and such... -+ */ -+static int ext3cow_quota_on_mount(struct super_block *sb, int type) -+{ -+ return vfs_quota_on_mount(sb, EXT3COW_SB(sb)->s_qf_names[type], -+ EXT3COW_SB(sb)->s_jquota_fmt, type); -+} -+ -+/* -+ * Standard function to be called on quota_on -+ */ -+static int ext3cow_quota_on(struct super_block *sb, int type, int format_id, -+ char *path) -+{ -+ int err; -+ struct nameidata nd; -+ -+ if (!test_opt(sb, QUOTA)) -+ return -EINVAL; -+ /* Not journalling quota? */ -+ if (!EXT3COW_SB(sb)->s_qf_names[USRQUOTA] && -+ !EXT3COW_SB(sb)->s_qf_names[GRPQUOTA]) -+ return vfs_quota_on(sb, type, format_id, path); -+ err = path_lookup(path, LOOKUP_FOLLOW, &nd); -+ if (err) -+ return err; -+ /* Quotafile not on the same filesystem? */ -+ if (nd.mnt->mnt_sb != sb) { -+ path_release(&nd); -+ return -EXDEV; -+ } -+ /* Quotafile not of fs root? */ -+ if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) -+ printk(KERN_WARNING -+ "EXT3COW-fs: Quota file not on filesystem root. " -+ "Journalled quota will not work.\n"); -+ path_release(&nd); -+ return vfs_quota_on(sb, type, format_id, path); -+} -+ -+/* Read data from quotafile - avoid pagecache and such because we cannot afford -+ * acquiring the locks... As quota files are never truncated and quota code -+ * itself serializes the operations (and noone else should touch the files) -+ * we don't have to be afraid of races */ -+static ssize_t ext3cow_quota_read(struct super_block *sb, int type, char *data, -+ size_t len, loff_t off) -+{ -+ struct inode *inode = sb_dqopt(sb)->files[type]; -+ sector_t blk = off >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ int err = 0; -+ int offset = off & (sb->s_blocksize - 1); -+ int tocopy; -+ size_t toread; -+ struct buffer_head *bh; -+ loff_t i_size = i_size_read(inode); -+ -+ if (off > i_size) -+ return 0; -+ if (off+len > i_size) -+ len = i_size-off; -+ toread = len; -+ while (toread > 0) { -+ tocopy = sb->s_blocksize - offset < toread ? -+ sb->s_blocksize - offset : toread; -+ bh = ext3cow_bread(NULL, inode, blk, 0, &err); -+ if (err) -+ return err; -+ if (!bh) /* A hole? 
*/ -+ memset(data, 0, tocopy); -+ else -+ memcpy(data, bh->b_data+offset, tocopy); -+ brelse(bh); -+ offset = 0; -+ toread -= tocopy; -+ data += tocopy; -+ blk++; -+ } -+ return len; -+} -+ -+/* Write to quotafile (we know the transaction is already started and has -+ * enough credits) */ -+static ssize_t ext3cow_quota_write(struct super_block *sb, int type, -+ const char *data, size_t len, loff_t off) -+{ -+ struct inode *inode = sb_dqopt(sb)->files[type]; -+ sector_t blk = off >> EXT3COW_BLOCK_SIZE_BITS(sb); -+ int err = 0; -+ int offset = off & (sb->s_blocksize - 1); -+ int tocopy; -+ int journal_quota = EXT3COW_SB(sb)->s_qf_names[type] != NULL; -+ size_t towrite = len; -+ struct buffer_head *bh; -+ handle_t *handle = journal_current_handle(); -+ -+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); -+ while (towrite > 0) { -+ tocopy = sb->s_blocksize - offset < towrite ? -+ sb->s_blocksize - offset : towrite; -+ bh = ext3cow_bread(handle, inode, blk, 1, &err); -+ if (!bh) -+ goto out; -+ if (journal_quota) { -+ err = ext3cow_journal_get_write_access(handle, bh); -+ if (err) { -+ brelse(bh); -+ goto out; -+ } -+ } -+ lock_buffer(bh); -+ memcpy(bh->b_data+offset, data, tocopy); -+ flush_dcache_page(bh->b_page); -+ unlock_buffer(bh); -+ if (journal_quota) -+ err = ext3cow_journal_dirty_metadata(handle, bh); -+ else { -+ /* Always do at least ordered writes for quotas */ -+ err = ext3cow_journal_dirty_data(handle, bh); -+ mark_buffer_dirty(bh); -+ } -+ brelse(bh); -+ if (err) -+ goto out; -+ offset = 0; -+ towrite -= tocopy; -+ data += tocopy; -+ blk++; -+ } -+out: -+ if (len == towrite) -+ return err; -+ if (inode->i_size < off+len-towrite) { -+ i_size_write(inode, off+len-towrite); -+ EXT3COW_I(inode)->i_disksize = inode->i_size; -+ } -+ inode->i_version++; -+ inode->i_mtime = inode->i_ctime = CURRENT_TIME; -+ ext3cow_mark_inode_dirty(handle, inode); -+ mutex_unlock(&inode->i_mutex); -+ return len - towrite; -+} -+ -+#endif -+ -+static int ext3cow_get_sb(struct file_system_type *fs_type, -+ int flags, const char *dev_name, void *data, struct vfsmount *mnt) -+{ -+ return get_sb_bdev(fs_type, flags, dev_name, data, ext3cow_fill_super, mnt); -+} -+ -+/* Code to update the epoch counter in the super block -znjp */ -+unsigned int ext3cow_take_snapshot(struct super_block *sb){ -+ -+ struct ext3cow_sb_info *sbi = NULL; -+ struct ext3cow_super_block *es = NULL; -+ tid_t target; -+ -+ if(NULL == sb){ -+ printk("EXT3COW-fs: superblock is NULL when taking snapshot.\n"); -+ return -1; -+ } -+ -+ sbi = EXT3COW_SB(sb); -+ es = sbi->s_es; -+ -+ /* Sync the dirty blocks */ -+ if (journal_start_commit(EXT3COW_SB(sb)->s_journal, &target)) { -+ log_wait_commit(EXT3COW_SB(sb)->s_journal, target); -+ } -+ -+ -+ sbi->s_epoch_number = cpu_to_le32(get_seconds()); -+ es->s_epoch_number = sbi->s_epoch_number; -+ sb->s_dirt = 1; -+ -+ BUFFER_TRACE(EXT3COW_SB(sb)->s_sbh, "marking dirty"); -+ mark_buffer_dirty(sbi->s_sbh); -+ ext3cow_commit_super (sb, es, 1); -+ -+ return (unsigned int)sbi->s_epoch_number; -+} -+ -+static struct file_system_type ext3cow_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "ext3cow", -+ .get_sb = ext3cow_get_sb, -+ .kill_sb = kill_block_super, -+ .fs_flags = FS_REQUIRES_DEV, -+}; -+ -+static int __init init_ext3cow_fs(void) -+{ -+ int err = init_ext3cow_xattr(); -+ if (err) -+ return err; -+ err = init_inodecache(); -+ if (err) -+ goto out1; -+ err = register_filesystem(&ext3cow_fs_type); -+ if (err) -+ goto out; -+ return 0; -+out: -+ destroy_inodecache(); -+out1: -+ 
exit_ext3cow_xattr(); -+ return err; -+} -+ -+static void __exit exit_ext3cow_fs(void) -+{ -+ unregister_filesystem(&ext3cow_fs_type); -+ destroy_inodecache(); -+ exit_ext3cow_xattr(); -+} -+ -+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -+MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); -+MODULE_LICENSE("GPL"); -+module_init(init_ext3cow_fs) -+module_exit(exit_ext3cow_fs) -diff -ruN linux-2.6.20.3/fs/ext3cow/symlink.c linux-2.6.20.3-ext3cow/fs/ext3cow/symlink.c ---- linux-2.6.20.3/fs/ext3cow/symlink.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/symlink.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,54 @@ -+/* -+ * linux/fs/ext3cow/symlink.c -+ * -+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999 -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/fs/minix/symlink.c -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ * -+ * ext3cow symlink handling code -+ */ -+ -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+static void * ext3cow_follow_link(struct dentry *dentry, struct nameidata *nd) -+{ -+ struct ext3cow_inode_info *ei = EXT3COW_I(dentry->d_inode); -+ nd_set_link(nd, (char*)ei->i_data); -+ return NULL; -+} -+ -+struct inode_operations ext3cow_symlink_inode_operations = { -+ .readlink = generic_readlink, -+ .follow_link = page_follow_link_light, -+ .put_link = page_put_link, -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ .setxattr = generic_setxattr, -+ .getxattr = generic_getxattr, -+ .listxattr = ext3cow_listxattr, -+ .removexattr = generic_removexattr, -+#endif -+}; -+ -+struct inode_operations ext3cow_fast_symlink_inode_operations = { -+ .readlink = generic_readlink, -+ .follow_link = ext3cow_follow_link, -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ .setxattr = generic_setxattr, -+ .getxattr = generic_getxattr, -+ .listxattr = ext3cow_listxattr, -+ .removexattr = generic_removexattr, -+#endif -+}; -diff -ruN linux-2.6.20.3/fs/ext3cow/xattr.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.c ---- linux-2.6.20.3/fs/ext3cow/xattr.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.c 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,1314 @@ -+/* -+ * linux/fs/ext3cow/xattr.c -+ * -+ * Copyright (C) 2001-2003 Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Ext3 code with a lot of help from Eric Jarman . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ * xattr consolidation Copyright (c) 2004 James Morris , -+ * Red Hat Inc. -+ * ea-in-inode support by Alex Tomas aka bzzz -+ * and Andreas Gruenbacher . -+ */ -+ -+/* -+ * Extended attributes are stored directly in inodes (on file systems with -+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl -+ * field contains the block number if an inode uses an additional block. All -+ * attributes must fit in the inode and one additional block. Blocks that -+ * contain the identical set of attributes may be shared among several inodes. -+ * Identical blocks are detected by keeping a cache of blocks that have -+ * recently been accessed. 
-+ * -+ * The attributes in inodes and on blocks have a different header; the entries -+ * are stored in the same format: -+ * -+ * +------------------+ -+ * | header | -+ * | entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . | -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The header is followed by multiple entry descriptors. In disk blocks, the -+ * entry descriptors are kept sorted. In inodes, they are unsorted. The -+ * attribute values are aligned to the end of the block in no specific order. -+ * -+ * Locking strategy -+ * ---------------- -+ * EXT3COW_I(inode)->i_file_acl is protected by EXT3COW_I(inode)->xattr_sem. -+ * EA blocks are only changed if they are exclusive to an inode, so -+ * holding xattr_sem also means that nothing but the EA block's reference -+ * count can change. Multiple writers to the same block are synchronized -+ * by the buffer lock. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+#include "acl.h" -+ -+#define BHDR(bh) ((struct ext3cow_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext3cow_xattr_entry *)(ptr)) -+#define BFIRST(bh) ENTRY(BHDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#define IHDR(inode, raw_inode) \ -+ ((struct ext3cow_xattr_ibody_header *) \ -+ ((void *)raw_inode + \ -+ EXT3COW_GOOD_OLD_INODE_SIZE + \ -+ EXT3COW_I(inode)->i_extra_isize)) -+#define IFIRST(hdr) ((struct ext3cow_xattr_entry *)((hdr)+1)) -+ -+#ifdef EXT3COW_XATTR_DEBUG -+# define ea_idebug(inode, f...) do { \ -+ printk(KERN_DEBUG "inode %s:%lu: ", \ -+ inode->i_sb->s_id, inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) do { \ -+ char b[BDEVNAME_SIZE]; \ -+ printk(KERN_DEBUG "block %s:%lu: ", \ -+ bdevname(bh->b_bdev, b), \ -+ (unsigned long) bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) 
-+#endif -+ -+static void ext3cow_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext3cow_xattr_cache_find(struct inode *, -+ struct ext3cow_xattr_header *, -+ struct mb_cache_entry **); -+static void ext3cow_xattr_rehash(struct ext3cow_xattr_header *, -+ struct ext3cow_xattr_entry *); -+ -+static struct mb_cache *ext3cow_xattr_cache; -+ -+static struct xattr_handler *ext3cow_xattr_handler_map[] = { -+ [EXT3COW_XATTR_INDEX_USER] = &ext3cow_xattr_user_handler, -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ [EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3cow_xattr_acl_access_handler, -+ [EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3cow_xattr_acl_default_handler, -+#endif -+ [EXT3COW_XATTR_INDEX_TRUSTED] = &ext3cow_xattr_trusted_handler, -+#ifdef CONFIG_EXT3COW_FS_SECURITY -+ [EXT3COW_XATTR_INDEX_SECURITY] = &ext3cow_xattr_security_handler, -+#endif -+}; -+ -+struct xattr_handler *ext3cow_xattr_handlers[] = { -+ &ext3cow_xattr_user_handler, -+ &ext3cow_xattr_trusted_handler, -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ &ext3cow_xattr_acl_access_handler, -+ &ext3cow_xattr_acl_default_handler, -+#endif -+#ifdef CONFIG_EXT3COW_FS_SECURITY -+ &ext3cow_xattr_security_handler, -+#endif -+ NULL -+}; -+ -+static inline struct xattr_handler * -+ext3cow_xattr_handler(int name_index) -+{ -+ struct xattr_handler *handler = NULL; -+ -+ if (name_index > 0 && name_index < ARRAY_SIZE(ext3cow_xattr_handler_map)) -+ handler = ext3cow_xattr_handler_map[name_index]; -+ return handler; -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_mutex: don't care -+ */ -+ssize_t -+ext3cow_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext3cow_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+static int -+ext3cow_xattr_check_names(struct ext3cow_xattr_entry *entry, void *end) -+{ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3cow_xattr_entry *next = EXT3COW_XATTR_NEXT(entry); -+ if ((void *)next >= end) -+ return -EIO; -+ entry = next; -+ } -+ return 0; -+} -+ -+static inline int -+ext3cow_xattr_check_block(struct buffer_head *bh) -+{ -+ int error; -+ -+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT3COW_XATTR_MAGIC) || -+ BHDR(bh)->h_blocks != cpu_to_le32(1)) -+ return -EIO; -+ error = ext3cow_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); -+ return error; -+} -+ -+static inline int -+ext3cow_xattr_check_entry(struct ext3cow_xattr_entry *entry, size_t size) -+{ -+ size_t value_size = le32_to_cpu(entry->e_value_size); -+ -+ if (entry->e_value_block != 0 || value_size > size || -+ le16_to_cpu(entry->e_value_offs) + value_size > size) -+ return -EIO; -+ return 0; -+} -+ -+static int -+ext3cow_xattr_find_entry(struct ext3cow_xattr_entry **pentry, int name_index, -+ const char *name, size_t size, int sorted) -+{ -+ struct ext3cow_xattr_entry *entry; -+ size_t name_len; -+ int cmp = 1; -+ -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ entry = *pentry; -+ for (; !IS_LAST_ENTRY(entry); entry = EXT3COW_XATTR_NEXT(entry)) { -+ cmp = name_index - entry->e_name_index; -+ if (!cmp) -+ cmp = name_len - entry->e_name_len; -+ if (!cmp) -+ cmp = memcmp(name, entry->e_name, name_len); -+ if (cmp <= 0 && (sorted || cmp == 0)) -+ break; -+ } -+ *pentry = entry; -+ if (!cmp && ext3cow_xattr_check_entry(entry, size)) -+ return -EIO; -+ return cmp ? 
-ENODATA : 0; -+} -+ -+static int -+ext3cow_xattr_block_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3cow_xattr_entry *entry; -+ size_t size; -+ int error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ error = -ENODATA; -+ if (!EXT3COW_I(inode)->i_file_acl) -+ goto cleanup; -+ ea_idebug(inode, "reading block %u", EXT3COW_I(inode)->i_file_acl); -+ bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); -+ if (ext3cow_xattr_check_block(bh)) { -+bad_block: ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: bad block "E3FSBLK, inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ error = -EIO; -+ goto cleanup; -+ } -+ ext3cow_xattr_cache_insert(bh); -+ entry = BFIRST(bh); -+ error = ext3cow_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); -+ if (error == -EIO) -+ goto bad_block; -+ if (error) -+ goto cleanup; -+ size = le32_to_cpu(entry->e_value_size); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ return error; -+} -+ -+static int -+ext3cow_xattr_ibody_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct ext3cow_xattr_ibody_header *header; -+ struct ext3cow_xattr_entry *entry; -+ struct ext3cow_inode *raw_inode; -+ struct ext3cow_iloc iloc; -+ size_t size; -+ void *end; -+ int error; -+ -+ if (!(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)) -+ return -ENODATA; -+ error = ext3cow_get_inode_loc(inode, &iloc); -+ if (error) -+ return error; -+ raw_inode = ext3cow_raw_inode(&iloc); -+ header = IHDR(inode, raw_inode); -+ entry = IFIRST(header); -+ end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; -+ error = ext3cow_xattr_check_names(entry, end); -+ if (error) -+ goto cleanup; -+ error = ext3cow_xattr_find_entry(&entry, name_index, name, -+ end - (void *)entry, 0); -+ if (error) -+ goto cleanup; -+ size = le32_to_cpu(entry->e_value_size); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ memcpy(buffer, (void *)IFIRST(header) + -+ le16_to_cpu(entry->e_value_offs), size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(iloc.bh); -+ return error; -+} -+ -+/* -+ * ext3cow_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3cow_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int error; -+ -+ down_read(&EXT3COW_I(inode)->xattr_sem); -+ error = ext3cow_xattr_ibody_get(inode, name_index, name, buffer, -+ buffer_size); -+ if (error == -ENODATA) -+ error = ext3cow_xattr_block_get(inode, name_index, name, buffer, -+ buffer_size); -+ up_read(&EXT3COW_I(inode)->xattr_sem); -+ return error; -+} -+ -+static int -+ext3cow_xattr_list_entries(struct inode *inode, struct ext3cow_xattr_entry *entry, -+ char *buffer, size_t buffer_size) -+{ -+ size_t rest = buffer_size; -+ -+ for (; !IS_LAST_ENTRY(entry); entry = EXT3COW_XATTR_NEXT(entry)) { -+ struct xattr_handler *handler = -+ ext3cow_xattr_handler(entry->e_name_index); -+ -+ if (handler) { -+ size_t size = handler->list(inode, buffer, rest, -+ entry->e_name, -+ entry->e_name_len); -+ if (buffer) { -+ if (size > rest) -+ return -ERANGE; -+ buffer += size; -+ } -+ rest -= size; -+ } -+ } -+ return buffer_size - rest; -+} -+ -+static int -+ext3cow_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ error = 0; -+ if (!EXT3COW_I(inode)->i_file_acl) -+ goto cleanup; -+ ea_idebug(inode, "reading block %u", EXT3COW_I(inode)->i_file_acl); -+ bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); -+ if (ext3cow_xattr_check_block(bh)) { -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: bad block "E3FSBLK, inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ error = -EIO; -+ goto cleanup; -+ } -+ ext3cow_xattr_cache_insert(bh); -+ error = ext3cow_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+static int -+ext3cow_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct ext3cow_xattr_ibody_header *header; -+ struct ext3cow_inode *raw_inode; -+ struct ext3cow_iloc iloc; -+ void *end; -+ int error; -+ -+ if (!(EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR)) -+ return 0; -+ error = ext3cow_get_inode_loc(inode, &iloc); -+ if (error) -+ return error; -+ raw_inode = ext3cow_raw_inode(&iloc); -+ header = IHDR(inode, raw_inode); -+ end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; -+ error = ext3cow_xattr_check_names(IFIRST(header), end); -+ if (error) -+ goto cleanup; -+ error = ext3cow_xattr_list_entries(inode, IFIRST(header), -+ buffer, buffer_size); -+ -+cleanup: -+ brelse(iloc.bh); -+ return error; -+} -+ -+/* -+ * ext3cow_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3cow_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ int i_error, b_error; -+ -+ down_read(&EXT3COW_I(inode)->xattr_sem); -+ i_error = ext3cow_xattr_ibody_list(inode, buffer, buffer_size); -+ if (i_error < 0) { -+ b_error = 0; -+ } else { -+ if (buffer) { -+ buffer += i_error; -+ buffer_size -= i_error; -+ } -+ b_error = ext3cow_xattr_block_list(inode, buffer, buffer_size); -+ if (b_error < 0) -+ i_error = 0; -+ } -+ up_read(&EXT3COW_I(inode)->xattr_sem); -+ return i_error + b_error; -+} -+ -+/* -+ * If the EXT3COW_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext3cow_xattr_update_super_block(handle_t *handle, -+ struct super_block *sb) -+{ -+ if (EXT3COW_HAS_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ if (ext3cow_journal_get_write_access(handle, EXT3COW_SB(sb)->s_sbh) == 0) { -+ EXT3COW_SET_COMPAT_FEATURE(sb, EXT3COW_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ ext3cow_journal_dirty_metadata(handle, EXT3COW_SB(sb)->s_sbh); -+ } -+} -+ -+/* -+ * Release the xattr block BH: If the reference count is > 1, decrement -+ * it; otherwise free the block. -+ */ -+static void -+ext3cow_xattr_release_block(handle_t *handle, struct inode *inode, -+ struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce = NULL; -+ -+ ce = mb_cache_entry_get(ext3cow_xattr_cache, bh->b_bdev, bh->b_blocknr); -+ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "refcount now=0; freeing"); -+ if (ce) -+ mb_cache_entry_free(ce); -+ ext3cow_free_blocks(handle, inode, bh->b_blocknr, 1); -+ get_bh(bh); -+ ext3cow_forget(handle, 1, inode, bh, bh->b_blocknr); -+ } else { -+ if (ext3cow_journal_get_write_access(handle, bh) == 0) { -+ lock_buffer(bh); -+ BHDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(BHDR(bh)->h_refcount) - 1); -+ ext3cow_journal_dirty_metadata(handle, bh); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ DQUOT_FREE_BLOCK(inode, 1); -+ unlock_buffer(bh); -+ ea_bdebug(bh, "refcount now=%d; releasing", -+ le32_to_cpu(BHDR(bh)->h_refcount)); -+ } -+ if (ce) -+ mb_cache_entry_release(ce); -+ } -+} -+ -+struct ext3cow_xattr_info { -+ int name_index; -+ const char *name; -+ const void *value; -+ size_t value_len; -+}; -+ -+struct ext3cow_xattr_search { -+ struct ext3cow_xattr_entry *first; -+ void *base; -+ void *end; -+ struct ext3cow_xattr_entry *here; -+ int not_found; -+}; -+ -+static int -+ext3cow_xattr_set_entry(struct ext3cow_xattr_info *i, struct ext3cow_xattr_search *s) -+{ -+ struct ext3cow_xattr_entry *last; -+ size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); -+ -+ /* Compute min_offs and last. */ -+ last = s->first; -+ for (; !IS_LAST_ENTRY(last); last = EXT3COW_XATTR_NEXT(last)) { -+ if (!last->e_value_block && last->e_value_size) { -+ size_t offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ } -+ free = min_offs - ((void *)last - s->base) - sizeof(__u32); -+ if (!s->not_found) { -+ if (!s->here->e_value_block && s->here->e_value_size) { -+ size_t size = le32_to_cpu(s->here->e_value_size); -+ free += EXT3COW_XATTR_SIZE(size); -+ } -+ free += EXT3COW_XATTR_LEN(name_len); -+ } -+ if (i->value) { -+ if (free < EXT3COW_XATTR_SIZE(i->value_len) || -+ free < EXT3COW_XATTR_LEN(name_len) + -+ EXT3COW_XATTR_SIZE(i->value_len)) -+ return -ENOSPC; -+ } -+ -+ if (i->value && s->not_found) { -+ /* Insert the new name. 
*/ -+ size_t size = EXT3COW_XATTR_LEN(name_len); -+ size_t rest = (void *)last - (void *)s->here + sizeof(__u32); -+ memmove((void *)s->here + size, s->here, rest); -+ memset(s->here, 0, size); -+ s->here->e_name_index = i->name_index; -+ s->here->e_name_len = name_len; -+ memcpy(s->here->e_name, i->name, name_len); -+ } else { -+ if (!s->here->e_value_block && s->here->e_value_size) { -+ void *first_val = s->base + min_offs; -+ size_t offs = le16_to_cpu(s->here->e_value_offs); -+ void *val = s->base + offs; -+ size_t size = EXT3COW_XATTR_SIZE( -+ le32_to_cpu(s->here->e_value_size)); -+ -+ if (i->value && size == EXT3COW_XATTR_SIZE(i->value_len)) { -+ /* The old and the new value have the same -+ size. Just replace. */ -+ s->here->e_value_size = -+ cpu_to_le32(i->value_len); -+ memset(val + size - EXT3COW_XATTR_PAD, 0, -+ EXT3COW_XATTR_PAD); /* Clear pad bytes. */ -+ memcpy(val, i->value, i->value_len); -+ return 0; -+ } -+ -+ /* Remove the old value. */ -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ s->here->e_value_size = 0; -+ s->here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. */ -+ last = s->first; -+ while (!IS_LAST_ENTRY(last)) { -+ size_t o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && -+ last->e_value_size && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT3COW_XATTR_NEXT(last); -+ } -+ } -+ if (!i->value) { -+ /* Remove the old name. */ -+ size_t size = EXT3COW_XATTR_LEN(name_len); -+ last = ENTRY((void *)last - size); -+ memmove(s->here, (void *)s->here + size, -+ (void *)last - (void *)s->here + sizeof(__u32)); -+ memset(last, 0, size); -+ } -+ } -+ -+ if (i->value) { -+ /* Insert the new value. */ -+ s->here->e_value_size = cpu_to_le32(i->value_len); -+ if (i->value_len) { -+ size_t size = EXT3COW_XATTR_SIZE(i->value_len); -+ void *val = s->base + min_offs - size; -+ s->here->e_value_offs = cpu_to_le16(min_offs - size); -+ memset(val + size - EXT3COW_XATTR_PAD, 0, -+ EXT3COW_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, i->value, i->value_len); -+ } -+ } -+ return 0; -+} -+ -+struct ext3cow_xattr_block_find { -+ struct ext3cow_xattr_search s; -+ struct buffer_head *bh; -+}; -+ -+static int -+ext3cow_xattr_block_find(struct inode *inode, struct ext3cow_xattr_info *i, -+ struct ext3cow_xattr_block_find *bs) -+{ -+ struct super_block *sb = inode->i_sb; -+ int error; -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ i->name_index, i->name, i->value, (long)i->value_len); -+ -+ if (EXT3COW_I(inode)->i_file_acl) { -+ /* The inode already has an extended attribute block. */ -+ bs->bh = sb_bread(sb, EXT3COW_I(inode)->i_file_acl); -+ error = -EIO; -+ if (!bs->bh) -+ goto cleanup; -+ ea_bdebug(bs->bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bs->bh->b_count)), -+ le32_to_cpu(BHDR(bs->bh)->h_refcount)); -+ if (ext3cow_xattr_check_block(bs->bh)) { -+ ext3cow_error(sb, __FUNCTION__, -+ "inode %lu: bad block "E3FSBLK, inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. 
*/ -+ bs->s.base = BHDR(bs->bh); -+ bs->s.first = BFIRST(bs->bh); -+ bs->s.end = bs->bh->b_data + bs->bh->b_size; -+ bs->s.here = bs->s.first; -+ error = ext3cow_xattr_find_entry(&bs->s.here, i->name_index, -+ i->name, bs->bh->b_size, 1); -+ if (error && error != -ENODATA) -+ goto cleanup; -+ bs->s.not_found = error; -+ } -+ error = 0; -+ -+cleanup: -+ return error; -+} -+ -+static int -+ext3cow_xattr_block_set(handle_t *handle, struct inode *inode, -+ struct ext3cow_xattr_info *i, -+ struct ext3cow_xattr_block_find *bs) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ struct ext3cow_xattr_search *s = &bs->s; -+ struct mb_cache_entry *ce = NULL; -+ int error; -+ -+#define header(x) ((struct ext3cow_xattr_header *)(x)) -+ -+ if (i->value && i->value_len > sb->s_blocksize) -+ return -ENOSPC; -+ if (s->base) { -+ ce = mb_cache_entry_get(ext3cow_xattr_cache, bs->bh->b_bdev, -+ bs->bh->b_blocknr); -+ if (header(s->base)->h_refcount == cpu_to_le32(1)) { -+ if (ce) { -+ mb_cache_entry_free(ce); -+ ce = NULL; -+ } -+ ea_bdebug(bs->bh, "modifying in-place"); -+ error = ext3cow_journal_get_write_access(handle, bs->bh); -+ if (error) -+ goto cleanup; -+ lock_buffer(bs->bh); -+ error = ext3cow_xattr_set_entry(i, s); -+ if (!error) { -+ if (!IS_LAST_ENTRY(s->first)) -+ ext3cow_xattr_rehash(header(s->base), -+ s->here); -+ ext3cow_xattr_cache_insert(bs->bh); -+ } -+ unlock_buffer(bs->bh); -+ if (error == -EIO) -+ goto bad_block; -+ if (!error) -+ error = ext3cow_journal_dirty_metadata(handle, -+ bs->bh); -+ if (error) -+ goto cleanup; -+ goto inserted; -+ } else { -+ int offset = (char *)s->here - bs->bh->b_data; -+ -+ if (ce) { -+ mb_cache_entry_release(ce); -+ ce = NULL; -+ } -+ ea_bdebug(bs->bh, "cloning"); -+ s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (s->base == NULL) -+ goto cleanup; -+ memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); -+ s->first = ENTRY(header(s->base)+1); -+ header(s->base)->h_refcount = cpu_to_le32(1); -+ s->here = ENTRY(s->base + offset); -+ s->end = s->base + bs->bh->b_size; -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ s->base = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ /* assert(header == s->base) */ -+ error = -ENOMEM; -+ if (s->base == NULL) -+ goto cleanup; -+ memset(s->base, 0, sb->s_blocksize); -+ header(s->base)->h_magic = cpu_to_le32(EXT3COW_XATTR_MAGIC); -+ header(s->base)->h_blocks = cpu_to_le32(1); -+ header(s->base)->h_refcount = cpu_to_le32(1); -+ s->first = ENTRY(header(s->base)+1); -+ s->here = ENTRY(header(s->base)+1); -+ s->end = s->base + sb->s_blocksize; -+ } -+ -+ error = ext3cow_xattr_set_entry(i, s); -+ if (error == -EIO) -+ goto bad_block; -+ if (error) -+ goto cleanup; -+ if (!IS_LAST_ENTRY(s->first)) -+ ext3cow_xattr_rehash(header(s->base), s->here); -+ -+inserted: -+ if (!IS_LAST_ENTRY(s->first)) { -+ new_bh = ext3cow_xattr_cache_find(inode, header(s->base), &ce); -+ if (new_bh) { -+ /* We found an identical block in the cache. */ -+ if (new_bh == bs->bh) -+ ea_bdebug(new_bh, "keeping"); -+ else { -+ /* The old block is released after updating -+ the inode. 
*/ -+ error = -EDQUOT; -+ if (DQUOT_ALLOC_BLOCK(inode, 1)) -+ goto cleanup; -+ error = ext3cow_journal_get_write_access(handle, -+ new_bh); -+ if (error) -+ goto cleanup_dquot; -+ lock_buffer(new_bh); -+ BHDR(new_bh)->h_refcount = cpu_to_le32(1 + -+ le32_to_cpu(BHDR(new_bh)->h_refcount)); -+ ea_bdebug(new_bh, "reusing; refcount now=%d", -+ le32_to_cpu(BHDR(new_bh)->h_refcount)); -+ unlock_buffer(new_bh); -+ error = ext3cow_journal_dirty_metadata(handle, -+ new_bh); -+ if (error) -+ goto cleanup_dquot; -+ } -+ mb_cache_entry_release(ce); -+ ce = NULL; -+ } else if (bs->bh && s->base == bs->bh->b_data) { -+ /* We were modifying this block in-place. */ -+ ea_bdebug(bs->bh, "keeping this block"); -+ new_bh = bs->bh; -+ get_bh(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ ext3cow_fsblk_t goal = le32_to_cpu( -+ EXT3COW_SB(sb)->s_es->s_first_data_block) + -+ (ext3cow_fsblk_t)EXT3COW_I(inode)->i_block_group * -+ EXT3COW_BLOCKS_PER_GROUP(sb); -+ ext3cow_fsblk_t block = ext3cow_new_block(handle, inode, -+ goal, &error); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+getblk_failed: -+ ext3cow_free_blocks(handle, inode, block, 1); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ error = ext3cow_journal_get_create_access(handle, new_bh); -+ if (error) { -+ unlock_buffer(new_bh); -+ goto getblk_failed; -+ } -+ memcpy(new_bh->b_data, s->base, new_bh->b_size); -+ set_buffer_uptodate(new_bh); -+ unlock_buffer(new_bh); -+ ext3cow_xattr_cache_insert(new_bh); -+ error = ext3cow_journal_dirty_metadata(handle, new_bh); -+ if (error) -+ goto cleanup; -+ } -+ } -+ -+ /* Update the inode. */ -+ EXT3COW_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ -+ /* Drop the previous xattr block. */ -+ if (bs->bh && bs->bh != new_bh) -+ ext3cow_xattr_release_block(handle, inode, bs->bh); -+ error = 0; -+ -+cleanup: -+ if (ce) -+ mb_cache_entry_release(ce); -+ brelse(new_bh); -+ if (!(bs->bh && s->base == bs->bh->b_data)) -+ kfree(s->base); -+ -+ return error; -+ -+cleanup_dquot: -+ DQUOT_FREE_BLOCK(inode, 1); -+ goto cleanup; -+ -+bad_block: -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: bad block "E3FSBLK, inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ goto cleanup; -+ -+#undef header -+} -+ -+struct ext3cow_xattr_ibody_find { -+ struct ext3cow_xattr_search s; -+ struct ext3cow_iloc iloc; -+}; -+ -+static int -+ext3cow_xattr_ibody_find(struct inode *inode, struct ext3cow_xattr_info *i, -+ struct ext3cow_xattr_ibody_find *is) -+{ -+ struct ext3cow_xattr_ibody_header *header; -+ struct ext3cow_inode *raw_inode; -+ int error; -+ -+ if (EXT3COW_I(inode)->i_extra_isize == 0) -+ return 0; -+ raw_inode = ext3cow_raw_inode(&is->iloc); -+ header = IHDR(inode, raw_inode); -+ is->s.base = is->s.first = IFIRST(header); -+ is->s.here = is->s.first; -+ is->s.end = (void *)raw_inode + EXT3COW_SB(inode->i_sb)->s_inode_size; -+ if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_XATTR) { -+ error = ext3cow_xattr_check_names(IFIRST(header), is->s.end); -+ if (error) -+ return error; -+ /* Find the named attribute. 
*/ -+ error = ext3cow_xattr_find_entry(&is->s.here, i->name_index, -+ i->name, is->s.end - -+ (void *)is->s.base, 0); -+ if (error && error != -ENODATA) -+ return error; -+ is->s.not_found = error; -+ } -+ return 0; -+} -+ -+static int -+ext3cow_xattr_ibody_set(handle_t *handle, struct inode *inode, -+ struct ext3cow_xattr_info *i, -+ struct ext3cow_xattr_ibody_find *is) -+{ -+ struct ext3cow_xattr_ibody_header *header; -+ struct ext3cow_xattr_search *s = &is->s; -+ int error; -+ -+ if (EXT3COW_I(inode)->i_extra_isize == 0) -+ return -ENOSPC; -+ error = ext3cow_xattr_set_entry(i, s); -+ if (error) -+ return error; -+ header = IHDR(inode, ext3cow_raw_inode(&is->iloc)); -+ if (!IS_LAST_ENTRY(s->first)) { -+ header->h_magic = cpu_to_le32(EXT3COW_XATTR_MAGIC); -+ EXT3COW_I(inode)->i_state |= EXT3COW_STATE_XATTR; -+ } else { -+ header->h_magic = cpu_to_le32(0); -+ EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_XATTR; -+ } -+ return 0; -+} -+ -+/* -+ * ext3cow_xattr_set_handle() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext3cow_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3cow_xattr_info i = { -+ .name_index = name_index, -+ .name = name, -+ .value = value, -+ .value_len = value_len, -+ -+ }; -+ struct ext3cow_xattr_ibody_find is = { -+ .s = { .not_found = -ENODATA, }, -+ }; -+ struct ext3cow_xattr_block_find bs = { -+ .s = { .not_found = -ENODATA, }, -+ }; -+ int error; -+ -+ if (!name) -+ return -EINVAL; -+ if (strlen(name) > 255) -+ return -ERANGE; -+ down_write(&EXT3COW_I(inode)->xattr_sem); -+ error = ext3cow_get_inode_loc(inode, &is.iloc); -+ if (error) -+ goto cleanup; -+ -+ if (EXT3COW_I(inode)->i_state & EXT3COW_STATE_NEW) { -+ struct ext3cow_inode *raw_inode = ext3cow_raw_inode(&is.iloc); -+ memset(raw_inode, 0, EXT3COW_SB(inode->i_sb)->s_inode_size); -+ EXT3COW_I(inode)->i_state &= ~EXT3COW_STATE_NEW; -+ } -+ -+ error = ext3cow_xattr_ibody_find(inode, &i, &is); -+ if (error) -+ goto cleanup; -+ if (is.s.not_found) -+ error = ext3cow_xattr_block_find(inode, &i, &bs); -+ if (error) -+ goto cleanup; -+ if (is.s.not_found && bs.s.not_found) { -+ error = -ENODATA; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (!value) -+ goto cleanup; -+ } else { -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ } -+ error = ext3cow_journal_get_write_access(handle, is.iloc.bh); -+ if (error) -+ goto cleanup; -+ if (!value) { -+ if (!is.s.not_found) -+ error = ext3cow_xattr_ibody_set(handle, inode, &i, &is); -+ else if (!bs.s.not_found) -+ error = ext3cow_xattr_block_set(handle, inode, &i, &bs); -+ } else { -+ error = ext3cow_xattr_ibody_set(handle, inode, &i, &is); -+ if (!error && !bs.s.not_found) { -+ i.value = NULL; -+ error = ext3cow_xattr_block_set(handle, inode, &i, &bs); -+ } else if (error == -ENOSPC) { -+ error = ext3cow_xattr_block_set(handle, inode, &i, &bs); -+ if (error) -+ goto cleanup; -+ if (!is.s.not_found) { -+ i.value = NULL; -+ error = ext3cow_xattr_ibody_set(handle, inode, &i, -+ &is); -+ } -+ } -+ } -+ if (!error) 
{ -+ ext3cow_xattr_update_super_block(handle, inode->i_sb); -+ inode->i_ctime = CURRENT_TIME_SEC; -+ error = ext3cow_mark_iloc_dirty(handle, inode, &is.iloc); -+ /* -+ * The bh is consumed by ext3cow_mark_iloc_dirty, even with -+ * error != 0. -+ */ -+ is.iloc.bh = NULL; -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ } -+ -+cleanup: -+ brelse(is.iloc.bh); -+ brelse(bs.bh); -+ up_write(&EXT3COW_I(inode)->xattr_sem); -+ return error; -+} -+ -+/* -+ * ext3cow_xattr_set() -+ * -+ * Like ext3cow_xattr_set_handle, but start from an inode. This extended -+ * attribute modification is a filesystem transaction by itself. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext3cow_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t value_len, int flags) -+{ -+ handle_t *handle; -+ int error, retries = 0; -+ -+retry: -+ handle = ext3cow_journal_start(inode, EXT3COW_DATA_TRANS_BLOCKS(inode->i_sb)); -+ if (IS_ERR(handle)) { -+ error = PTR_ERR(handle); -+ } else { -+ int error2; -+ -+ error = ext3cow_xattr_set_handle(handle, inode, name_index, name, -+ value, value_len, flags); -+ error2 = ext3cow_journal_stop(handle); -+ if (error == -ENOSPC && -+ ext3cow_should_retry_alloc(inode->i_sb, &retries)) -+ goto retry; -+ if (error == 0) -+ error = error2; -+ } -+ -+ return error; -+} -+ -+/* -+ * ext3cow_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. We have exclusive -+ * access to the inode. -+ */ -+void -+ext3cow_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+ struct buffer_head *bh = NULL; -+ -+ if (!EXT3COW_I(inode)->i_file_acl) -+ goto cleanup; -+ bh = sb_bread(inode->i_sb, EXT3COW_I(inode)->i_file_acl); -+ if (!bh) { -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: block "E3FSBLK" read error", inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ goto cleanup; -+ } -+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT3COW_XATTR_MAGIC) || -+ BHDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: bad block "E3FSBLK, inode->i_ino, -+ EXT3COW_I(inode)->i_file_acl); -+ goto cleanup; -+ } -+ ext3cow_xattr_release_block(handle, inode, bh); -+ EXT3COW_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+} -+ -+/* -+ * ext3cow_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. -+ */ -+void -+ext3cow_xattr_put_super(struct super_block *sb) -+{ -+ mb_cache_shrink(sb->s_bdev); -+} -+ -+/* -+ * ext3cow_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+static void -+ext3cow_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext3cow_xattr_cache); -+ if (!ce) { -+ ea_bdebug(bh, "out of memory"); -+ return; -+ } -+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache"); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x]", (int)hash); -+ mb_cache_entry_release(ce); -+ } -+} -+ -+/* -+ * ext3cow_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. 
-+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. -+ */ -+static int -+ext3cow_xattr_cmp(struct ext3cow_xattr_header *header1, -+ struct ext3cow_xattr_header *header2) -+{ -+ struct ext3cow_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_index != entry2->e_name_index || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT3COW_XATTR_NEXT(entry1); -+ entry2 = EXT3COW_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext3cow_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. -+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. -+ */ -+static struct buffer_head * -+ext3cow_xattr_cache_find(struct inode *inode, struct ext3cow_xattr_header *header, -+ struct mb_cache_entry **pce) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+again: -+ ce = mb_cache_entry_find_first(ext3cow_xattr_cache, 0, -+ inode->i_sb->s_bdev, hash); -+ while (ce) { -+ struct buffer_head *bh; -+ -+ if (IS_ERR(ce)) { -+ if (PTR_ERR(ce) == -EAGAIN) -+ goto again; -+ break; -+ } -+ bh = sb_bread(inode->i_sb, ce->e_block); -+ if (!bh) { -+ ext3cow_error(inode->i_sb, __FUNCTION__, -+ "inode %lu: block %lu read error", -+ inode->i_ino, (unsigned long) ce->e_block); -+ } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= -+ EXT3COW_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %lu refcount %d>=%d", -+ (unsigned long) ce->e_block, -+ le32_to_cpu(BHDR(bh)->h_refcount), -+ EXT3COW_XATTR_REFCOUNT_MAX); -+ } else if (ext3cow_xattr_cmp(header, BHDR(bh)) == 0) { -+ *pce = ce; -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); -+ } -+ return NULL; -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext3cow_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext3cow_xattr_hash_entry(struct ext3cow_xattr_header *header, -+ struct ext3cow_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __le32 *value = (__le32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT3COW_XATTR_ROUND) >> EXT3COW_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext3cow_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext3cow_xattr_rehash(struct ext3cow_xattr_header *header, -+ struct ext3cow_xattr_entry *entry) -+{ -+ struct ext3cow_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext3cow_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT3COW_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext3cow_xattr(void) -+{ -+ ext3cow_xattr_cache = mb_cache_create("ext3cow_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); -+ if (!ext3cow_xattr_cache) -+ return -ENOMEM; -+ return 0; -+} -+ -+void -+exit_ext3cow_xattr(void) -+{ -+ if (ext3cow_xattr_cache) -+ mb_cache_destroy(ext3cow_xattr_cache); -+ ext3cow_xattr_cache = NULL; -+} -diff -ruN linux-2.6.20.3/fs/ext3cow/xattr.h linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.h ---- linux-2.6.20.3/fs/ext3cow/xattr.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr.h 2008-03-09 11:14:49.000000000 -0400 -@@ -0,0 +1,145 @@ -+/* -+ File: fs/ext3cow/xattr.h -+ -+ On-disk format of extended attributes for the ext3cow filesystem. 
-+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT3COW_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT3COW_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT3COW_XATTR_INDEX_USER 1 -+#define EXT3COW_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT3COW_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+#define EXT3COW_XATTR_INDEX_TRUSTED 4 -+#define EXT3COW_XATTR_INDEX_LUSTRE 5 -+#define EXT3COW_XATTR_INDEX_SECURITY 6 -+ -+struct ext3cow_xattr_header { -+ __le32 h_magic; /* magic number for identification */ -+ __le32 h_refcount; /* reference count */ -+ __le32 h_blocks; /* number of disk blocks used */ -+ __le32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext3cow_xattr_ibody_header { -+ __le32 h_magic; /* magic number for identification */ -+}; -+ -+struct ext3cow_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __le16 e_value_offs; /* offset in disk block of value */ -+ __le32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __le32 e_value_size; /* size of attribute value */ -+ __le32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+#define EXT3COW_XATTR_PAD_BITS 2 -+#define EXT3COW_XATTR_PAD (1<e_name_len)) ) -+#define EXT3COW_XATTR_SIZE(size) \ -+ (((size) + EXT3COW_XATTR_ROUND) & ~EXT3COW_XATTR_ROUND) -+ -+# ifdef CONFIG_EXT3COW_FS_XATTR -+ -+extern struct xattr_handler ext3cow_xattr_user_handler; -+extern struct xattr_handler ext3cow_xattr_trusted_handler; -+extern struct xattr_handler ext3cow_xattr_acl_access_handler; -+extern struct xattr_handler ext3cow_xattr_acl_default_handler; -+extern struct xattr_handler ext3cow_xattr_security_handler; -+ -+extern ssize_t ext3cow_listxattr(struct dentry *, char *, size_t); -+ -+extern int ext3cow_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext3cow_xattr_list(struct inode *, char *, size_t); -+extern int ext3cow_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -+extern int ext3cow_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext3cow_xattr_delete_inode(handle_t *, struct inode *); -+extern void ext3cow_xattr_put_super(struct super_block *); -+ -+extern int init_ext3cow_xattr(void); -+extern void exit_ext3cow_xattr(void); -+ -+extern struct xattr_handler *ext3cow_xattr_handlers[]; -+ -+# else /* CONFIG_EXT3COW_FS_XATTR */ -+ -+static inline int -+ext3cow_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline int -+ext3cow_xattr_list(struct inode *inode, void *buffer, size_t size) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline int -+ext3cow_xattr_set(struct inode *inode, int name_index, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline int -+ext3cow_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline void -+ext3cow_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+} -+ -+static inline void -+ext3cow_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext3cow_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void 
-+exit_ext3cow_xattr(void) -+{ -+} -+ -+#define ext3cow_xattr_handlers NULL -+ -+# endif /* CONFIG_EXT3COW_FS_XATTR */ -+ -+#ifdef CONFIG_EXT3COW_FS_SECURITY -+extern int ext3cow_init_security(handle_t *handle, struct inode *inode, -+ struct inode *dir); -+#else -+static inline int ext3cow_init_security(handle_t *handle, struct inode *inode, -+ struct inode *dir) -+{ -+ return 0; -+} -+#endif -diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_security.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_security.c ---- linux-2.6.20.3/fs/ext3cow/xattr_security.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_security.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,77 @@ -+/* -+ * linux/fs/ext3cow/xattr_security.c -+ * Handler for storing security labels as extended attributes. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+static size_t -+ext3cow_xattr_security_list(struct inode *inode, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ -+ if (list && total_len <= list_size) { -+ memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return total_len; -+} -+ -+static int -+ext3cow_xattr_security_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_SECURITY, name, -+ buffer, size); -+} -+ -+static int -+ext3cow_xattr_security_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_SECURITY, name, -+ value, size, flags); -+} -+ -+int -+ext3cow_init_security(handle_t *handle, struct inode *inode, struct inode *dir) -+{ -+ int err; -+ size_t len; -+ void *value; -+ char *name; -+ -+ err = security_inode_init_security(inode, dir, &name, &value, &len); -+ if (err) { -+ if (err == -EOPNOTSUPP) -+ return 0; -+ return err; -+ } -+ err = ext3cow_xattr_set_handle(handle, inode, EXT3COW_XATTR_INDEX_SECURITY, -+ name, value, len, 0); -+ kfree(name); -+ kfree(value); -+ return err; -+} -+ -+struct xattr_handler ext3cow_xattr_security_handler = { -+ .prefix = XATTR_SECURITY_PREFIX, -+ .list = ext3cow_xattr_security_list, -+ .get = ext3cow_xattr_security_get, -+ .set = ext3cow_xattr_security_set, -+}; -diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_trusted.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_trusted.c ---- linux-2.6.20.3/fs/ext3cow/xattr_trusted.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_trusted.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,62 @@ -+/* -+ * linux/fs/ext3cow/xattr_trusted.c -+ * Handler for trusted extended attributes. -+ * -+ * Copyright (C) 2003 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+#define XATTR_TRUSTED_PREFIX "trusted." 
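
As an aside before the trusted-namespace handler below: the xattr.h structures above define the on-disk entry format that ext3cow_xattr_check_names(), ext3cow_xattr_find_entry() and ext3cow_xattr_list_entries() walk earlier in this patch. The following user-space sketch is illustrative only and not part of the patch; struct xentry, XATTR_LEN, IS_LAST, NEXT and walk() are simplified stand-ins for ext3cow_xattr_entry, EXT3COW_XATTR_LEN, IS_LAST_ENTRY and EXT3COW_XATTR_NEXT, and little-endian (le16/le32) conversion is deliberately omitted.

    /*
     * Minimal sketch of the entry-table walk: entries grow downward from
     * the header and a four-byte zero word terminates the table, while
     * values are packed at the end of the block.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct xentry {                    /* simplified ext3cow_xattr_entry */
            uint8_t  e_name_len;       /* length of name */
            uint8_t  e_name_index;     /* attribute namespace index */
            uint16_t e_value_offs;     /* offset of value from the header */
            uint32_t e_value_block;    /* unused, always 0 on disk */
            uint32_t e_value_size;     /* size of the value in bytes */
            uint32_t e_hash;           /* hash of name and value */
            char     e_name[];         /* name, not NUL-terminated */
    };

    #define XATTR_PAD 4                /* mirrors EXT3COW_XATTR_PAD */
    #define XATTR_LEN(name_len) \
            (((name_len) + sizeof(struct xentry) + XATTR_PAD - 1) & \
             ~(size_t)(XATTR_PAD - 1))
    #define IS_LAST(e) (*(uint32_t *)(e) == 0)   /* four null bytes end the table */
    #define NEXT(e)    ((struct xentry *)((char *)(e) + XATTR_LEN((e)->e_name_len)))

    static void walk(void *first)
    {
            for (struct xentry *e = first; !IS_LAST(e); e = NEXT(e))
                    printf("index=%u name=%.*s value_size=%u at offs=%u\n",
                           (unsigned)e->e_name_index,
                           (int)e->e_name_len, e->e_name,
                           (unsigned)e->e_value_size,
                           (unsigned)e->e_value_offs);
    }

    int main(void)
    {
            uint32_t block[16] = { 0 };           /* zeroed, so a terminator exists */
            struct xentry *e = (struct xentry *)block;

            e->e_name_len = 3;
            e->e_name_index = 1;                  /* EXT3COW_XATTR_INDEX_USER */
            e->e_value_offs = 60;                 /* illustrative value offset */
            e->e_value_size = 4;
            memcpy(e->e_name, "foo", 3);
            walk(block);                          /* prints the single entry */
            return 0;
    }

The same traversal pattern, with the real macro names, is what the block and in-inode lookup paths above rely on when checking, finding and listing entries.
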
-+ -+static size_t -+ext3cow_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return 0; -+ -+ if (list && total_len <= list_size) { -+ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return total_len; -+} -+ -+static int -+ext3cow_xattr_trusted_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_TRUSTED, name, -+ buffer, size); -+} -+ -+static int -+ext3cow_xattr_trusted_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_TRUSTED, name, -+ value, size, flags); -+} -+ -+struct xattr_handler ext3cow_xattr_trusted_handler = { -+ .prefix = XATTR_TRUSTED_PREFIX, -+ .list = ext3cow_xattr_trusted_list, -+ .get = ext3cow_xattr_trusted_get, -+ .set = ext3cow_xattr_trusted_set, -+}; -diff -ruN linux-2.6.20.3/fs/ext3cow/xattr_user.c linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_user.c ---- linux-2.6.20.3/fs/ext3cow/xattr_user.c 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/fs/ext3cow/xattr_user.c 2008-03-09 11:14:48.000000000 -0400 -@@ -0,0 +1,64 @@ -+/* -+ * linux/fs/ext3cow/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext3cow_xattr_user_list(struct inode *inode, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list && total_len <= list_size) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ list[prefix_len + name_len] = '\0'; -+ } -+ return total_len; -+} -+ -+static int -+ext3cow_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -EOPNOTSUPP; -+ return ext3cow_xattr_get(inode, EXT3COW_XATTR_INDEX_USER, name, buffer, size); -+} -+ -+static int -+ext3cow_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -EOPNOTSUPP; -+ return ext3cow_xattr_set(inode, EXT3COW_XATTR_INDEX_USER, name, -+ value, size, flags); -+} -+ -+struct xattr_handler ext3cow_xattr_user_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .list = ext3cow_xattr_user_list, -+ .get = ext3cow_xattr_user_get, -+ .set = ext3cow_xattr_user_set, -+}; -diff -ruN linux-2.6.20.3/fs/Kconfig linux-2.6.20.3-ext3cow/fs/Kconfig ---- linux-2.6.20.3/fs/Kconfig 2007-03-13 14:27:08.000000000 -0400 -+++ linux-2.6.20.3-ext3cow/fs/Kconfig 2008-03-09 11:14:25.000000000 -0400 -@@ -136,6 +136,77 @@ - If you are not using a security module that requires using - extended attributes for file security labels, say N. 
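
Before the Kconfig changes below, one more illustrative aside on the three handlers above (user., trusted., security.): each one passes the name it receives, together with a fixed EXT3COW_XATTR_INDEX_* value, to ext3cow_xattr_get()/ext3cow_xattr_set(). A minimal sketch of that prefix-to-index mapping follows; struct prefix_map and resolve_xattr_name() are hypothetical names used only for illustration, and the table merely mirrors the index constants from xattr.h. It is not how the kernel itself dispatches (there the generic xattr layer matches the handler's .prefix field and hands the handler the remaining suffix).

    /*
     * Illustrative only: map a full attribute name such as "user.comment"
     * to a (name_index, suffix) pair under the prefix scheme shown above.
     */
    #include <stdio.h>
    #include <string.h>

    struct prefix_map {
            const char *prefix;
            int         index;
    };

    static const struct prefix_map map[] = {
            { "user.",     1 },        /* EXT3COW_XATTR_INDEX_USER */
            { "trusted.",  4 },        /* EXT3COW_XATTR_INDEX_TRUSTED */
            { "security.", 6 },        /* EXT3COW_XATTR_INDEX_SECURITY */
    };

    /* Return the name index and point *suffix past the prefix,
     * or return 0 when no known prefix matches or the suffix is empty. */
    static int resolve_xattr_name(const char *full, const char **suffix)
    {
            for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                    size_t len = strlen(map[i].prefix);

                    if (strncmp(full, map[i].prefix, len) == 0 &&
                        full[len] != '\0') {
                            *suffix = full + len;
                            return map[i].index;
                    }
            }
            return 0;
    }

    int main(void)
    {
            const char *suffix;
            int idx = resolve_xattr_name("user.comment", &suffix);

            printf("index=%d suffix=%s\n", idx, suffix); /* index=1 suffix=comment */
            return 0;
    }

In the patch itself this split happens once in the generic layer; by the time ext3cow_xattr_user_get() runs, "name" is already the suffix, and the handler only supplies EXT3COW_XATTR_INDEX_USER. The empty-suffix case corresponds to the handlers' strcmp(name, "") checks returning -EINVAL.
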
- -+ -+ -+config EXT3COW_FS -+ tristate "Ext3cow journalling and versioning file system support" -+ select JBD -+ help -+ This is the journalling version of the Second extended file system -+ (often called ext3), the de facto standard Linux file system -+ (method to organize files on a storage device) for hard disks. -+ -+ The journalling code included in this driver means you do not have -+ to run e2fsck (file system checker) on your file systems after a -+ crash. The journal keeps track of any changes that were being made -+ at the time the system crashed, and can ensure that your file system -+ is consistent without the need for a lengthy check. -+ -+ Other than adding the journal to the file system, the on-disk format -+ of ext3 is identical to ext2. It is possible to freely switch -+ between using the ext3 driver and the ext2 driver, as long as the -+ file system has been cleanly unmounted, or e2fsck is run on the file -+ system. -+ -+ To add a journal on an existing ext2 file system or change the -+ behavior of ext3 file systems, you can use the tune2fs utility ("man -+ tune2fs"). To modify attributes of files and directories on ext3 -+ file systems, use chattr ("man chattr"). You need to be using -+ e2fsprogs version 1.20 or later in order to create ext3 journals -+ (available at ). -+ -+ To compile this file system support as a module, choose M here: the -+ module will be called ext3. -+ -+config EXT3COW_FS_XATTR -+ bool "Ext3cow extended attributes" -+ depends on EXT3COW_FS -+ default y -+ help -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ -+ You need this for POSIX ACL support on ext3cow. -+ -+config EXT3COW_FS_POSIX_ACL -+ bool "Ext3cow POSIX Access Control Lists" -+ depends on EXT3COW_FS_XATTR -+ select FS_POSIX_ACL -+ help -+ Posix Access Control Lists (ACLs) support permissions for users and -+ groups beyond the owner/group/world scheme. -+ -+ To learn more about Access Control Lists, visit the Posix ACLs for -+ Linux website . -+ -+ If you don't know what Access Control Lists are, say N -+ -+config EXT3COW_FS_SECURITY -+ bool "Ext3cow Security Labels" -+ depends on EXT3COW_FS_XATTR -+ help -+ Security labels support alternative access control models -+ implemented by security modules like SELinux. This option -+ enables an extended attribute handler for file security -+ labels in the ext3cow filesystem. -+ -+ If you are not using a security module that requires using -+ extended attributes for file security labels, say N. -+ -+ - config EXT4DEV_FS - tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" - depends on EXPERIMENTAL -@@ -205,23 +276,23 @@ - tristate - help - This is a generic journalling layer for block devices. It is -- currently used by the ext3 and OCFS2 file systems, but it could -+ currently used by the ext3, ext3cow and OCFS2 file systems, but it could - also be used to add journal support to other file systems or block - devices such as RAID or LVM. - -- If you are using the ext3 or OCFS2 file systems, you need to -+ If you are using the ext3, ext3cow or OCFS2 file systems, you need to - say Y here. If you are not using ext3 OCFS2 then you will probably - want to say N. - - To compile this device as a module, choose M here: the module will be -- called jbd. If you are compiling ext3 or OCFS2 into the kernel, -+ called jbd. 
If you are compiling ext3, ext3cow or OCFS2 into the kernel, - you cannot compile this code as a module. - - config JBD_DEBUG - bool "JBD (ext3) debugging support" - depends on JBD - help -- If you are using the ext3 journaled file system (or potentially any -+ If you are using the ext3 or ext3cow journaled file system (or potentially any - other file system/device using JBD), this option allows you to - enable debugging output while the system is running, in order to - help track down any problems you are having. By default the -@@ -266,11 +337,12 @@ - "echo 0 > /proc/sys/fs/jbd2-debug". - - config FS_MBCACHE --# Meta block cache for Extended Attributes (ext2/ext3/ext4) -+# Meta block cache for Extended Attributes (ext2/ext3(cow)/ext4) - tristate -- depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR -- default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y -- default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m -+ depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT3COW_FS_XATTR || EXT4DEV_FS_XATTR -+ default y if EXT2_FS=y || EXT3_FS=y || EXT3COW_FS=y || EXT4DEV_FS=y -+ default m if EXT2_FS=m || EXT3_FS=m || EXT3COW_FS=m || EXT4DEV_FS=m -+ - - config REISERFS_FS - tristate "Reiserfs support" -diff -ruN linux-2.6.20.3/fs/Makefile linux-2.6.20.3-ext3cow/fs/Makefile ---- linux-2.6.20.3/fs/Makefile 2007-03-13 14:27:08.000000000 -0400 -+++ linux-2.6.20.3-ext3cow/fs/Makefile 2008-03-09 11:14:54.000000000 -0400 -@@ -63,6 +63,7 @@ - # Do not add any filesystems before this line - obj-$(CONFIG_REISERFS_FS) += reiserfs/ - obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -+obj-$(CONFIG_EXT3COW_FS) += ext3cow/ # Before ext2 so root fs can be ext3 - obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev - obj-$(CONFIG_JBD) += jbd/ - obj-$(CONFIG_JBD2) += jbd2/ -diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs.h ---- linux-2.6.20.3/include/linux/ext3cow_fs.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs.h 2008-03-09 11:10:57.000000000 -0400 -@@ -0,0 +1,948 @@ -+/* -+ * linux/include/linux/ext3cow_fs.h -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/include/linux/minix_fs.h -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ */ -+ -+#ifndef _LINUX_EXT3COW_FS_H -+#define _LINUX_EXT3COW_FS_H -+ -+#include -+#include -+ -+/* -+ * The second extended filesystem constants/structures -+ */ -+ -+/* -+ * Define EXT3COWFS_DEBUG to produce debug messages -+ */ -+#undef EXT3COWFS_DEBUG -+ -+ -+/* -+ * Define EXT3COW_RESERVATION to reserve data blocks for expanding files -+ */ -+#define EXT3COW_DEFAULT_RESERVE_BLOCKS 8 -+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -+#define EXT3COW_MAX_RESERVE_BLOCKS 1027 -+#define EXT3COW_RESERVE_WINDOW_NOT_ALLOCATED 0 -+/* -+ * Always enable hashed directories -+ */ -+//#define CONFIG_EXT3COW_INDEX -+ -+/* -+ * Debug code -+ */ -+#ifdef EXT3COWFS_DEBUG -+#define ext3cow_debug(f, a...) \ -+ do { \ -+ printk (KERN_DEBUG "EXT3COW-fs DEBUG (%s, %d): %s:", \ -+ __FILE__, __LINE__, __FUNCTION__); \ -+ printk (KERN_DEBUG f, ## a); \ -+ } while (0) -+#else -+#define ext3cow_debug(f, a...) 
do {} while (0) -+#endif -+ -+/* -+ * Special inodes numbers -+ */ -+#define EXT3COW_BAD_INO 1 /* Bad blocks inode */ -+#define EXT3COW_ROOT_INO 2 /* Root inode */ -+#define EXT3COW_BOOT_LOADER_INO 5 /* Boot loader inode */ -+#define EXT3COW_UNDEL_DIR_INO 6 /* Undelete directory inode */ -+#define EXT3COW_RESIZE_INO 7 /* Reserved group descriptors inode */ -+#define EXT3COW_JOURNAL_INO 8 /* Journal inode */ -+ -+/* First non-reserved inode for old ext3cow filesystems */ -+#define EXT3COW_GOOD_OLD_FIRST_INO 11 -+ -+/* -+ * Maximal count of links to a file -+ */ -+#define EXT3COW_LINK_MAX 32000 -+ -+/* For versioning -znjp */ -+#define EXT3COW_FLUX_TOKEN '@' -+/* Macros for scoping - in seconds -znjp */ -+#define ONEHOUR 3600 -+#define YESTERDAY 86400 -+#define ONEWEEK 604800 -+#define ONEMONTH 2419200 -+#define ONEYEAR 31449600 -+ -+/* -+ * Macro-instructions used to manage several block sizes -+ */ -+#define EXT3COW_MIN_BLOCK_SIZE 1024 -+#define EXT3COW_MAX_BLOCK_SIZE 4096 -+#define EXT3COW_MIN_BLOCK_LOG_SIZE 10 -+#ifdef __KERNEL__ -+# define EXT3COW_BLOCK_SIZE(s) ((s)->s_blocksize) -+#else -+# define EXT3COW_BLOCK_SIZE(s) (EXT3COW_MIN_BLOCK_SIZE << (s)->s_log_block_size) -+#endif -+//#define EXT3COW_ADDR_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / sizeof (__u32)) -+#ifdef __KERNEL__ -+# define EXT3COW_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -+#else -+# define EXT3COW_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) -+#endif -+#ifdef __KERNEL__ -+#define EXT3COW_ADDR_PER_BLOCK_BITS(s) (EXT3COW_SB(s)->s_addr_per_block_bits) -+#define EXT3COW_INODE_SIZE(s) (EXT3COW_SB(s)->s_inode_size) -+#define EXT3COW_FIRST_INO(s) (EXT3COW_SB(s)->s_first_ino) -+#else -+#define EXT3COW_INODE_SIZE(s) (((s)->s_rev_level == EXT3COW_GOOD_OLD_REV) ? \ -+ EXT3COW_GOOD_OLD_INODE_SIZE : \ -+ (s)->s_inode_size) -+#define EXT3COW_FIRST_INO(s) (((s)->s_rev_level == EXT3COW_GOOD_OLD_REV) ? 
\ -+ EXT3COW_GOOD_OLD_FIRST_INO : \ -+ (s)->s_first_ino) -+#endif -+/* -+ * Macro-instructions for versioning support - znjp -+ */ -+#define EXT3COW_COWBITMAP_SIZE (sizeof(__u32) * 8) /* one word */ -+#define EXT3COW_COWBITMAPS_PER_IBLOCK(s) \ -+ (( (EXT3COW_BLOCK_SIZE(s) / sizeof(__u32)) / (EXT3COW_COWBITMAP_SIZE))) -+/* Accounts for COW bitmaps */ -+#define EXT3COW_ADDR_PER_BLOCK(s) ((EXT3COW_BLOCK_SIZE(s) / sizeof(__u32)) - EXT3COW_COWBITMAPS_PER_IBLOCK(s)) -+ -+/* -+ * Macro-instructions used to manage fragments -+ */ -+#define EXT3COW_MIN_FRAG_SIZE 1024 -+#define EXT3COW_MAX_FRAG_SIZE 4096 -+#define EXT3COW_MIN_FRAG_LOG_SIZE 10 -+#ifdef __KERNEL__ -+# define EXT3COW_FRAG_SIZE(s) (EXT3COW_SB(s)->s_frag_size) -+# define EXT3COW_FRAGS_PER_BLOCK(s) (EXT3COW_SB(s)->s_frags_per_block) -+#else -+# define EXT3COW_FRAG_SIZE(s) (EXT3COW_MIN_FRAG_SIZE << (s)->s_log_frag_size) -+# define EXT3COW_FRAGS_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / EXT3COW_FRAG_SIZE(s)) -+#endif -+ -+/* -+ * Structure of a blocks group descriptor -+ */ -+struct ext3cow_group_desc -+{ -+ __le32 bg_block_bitmap; /* Blocks bitmap block */ -+ __le32 bg_inode_bitmap; /* Inodes bitmap block */ -+ __le32 bg_inode_table; /* Inodes table block */ -+ __le16 bg_free_blocks_count; /* Free blocks count */ -+ __le16 bg_free_inodes_count; /* Free inodes count */ -+ __le16 bg_used_dirs_count; /* Directories count */ -+ __u16 bg_pad; -+ __le32 bg_reserved[3]; -+}; -+ -+/* -+ * Macro-instructions used to manage group descriptors -+ */ -+#ifdef __KERNEL__ -+# define EXT3COW_BLOCKS_PER_GROUP(s) (EXT3COW_SB(s)->s_blocks_per_group) -+# define EXT3COW_DESC_PER_BLOCK(s) (EXT3COW_SB(s)->s_desc_per_block) -+# define EXT3COW_INODES_PER_GROUP(s) (EXT3COW_SB(s)->s_inodes_per_group) -+# define EXT3COW_DESC_PER_BLOCK_BITS(s) (EXT3COW_SB(s)->s_desc_per_block_bits) -+#else -+# define EXT3COW_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) -+# define EXT3COW_DESC_PER_BLOCK(s) (EXT3COW_BLOCK_SIZE(s) / sizeof (struct ext3cow_group_desc)) -+# define EXT3COW_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) -+#endif -+ -+/* -+ * Constants relative to the data blocks -+ */ -+#define EXT3COW_NDIR_BLOCKS 12 -+#define EXT3COW_IND_BLOCK EXT3COW_NDIR_BLOCKS -+#define EXT3COW_DIND_BLOCK (EXT3COW_IND_BLOCK + 1) -+#define EXT3COW_TIND_BLOCK (EXT3COW_DIND_BLOCK + 1) -+#define EXT3COW_N_BLOCKS (EXT3COW_TIND_BLOCK + 1) -+ -+/* -+ * Inode flags -+ */ -+#define EXT3COW_SECRM_FL 0x00000001 /* Secure deletion */ -+#define EXT3COW_UNRM_FL 0x00000002 /* Undelete */ -+#define EXT3COW_COMPR_FL 0x00000004 /* Compress file */ -+#define EXT3COW_SYNC_FL 0x00000008 /* Synchronous updates */ -+#define EXT3COW_IMMUTABLE_FL 0x00000010 /* Immutable file */ -+#define EXT3COW_APPEND_FL 0x00000020 /* writes to file may only append */ -+#define EXT3COW_NODUMP_FL 0x00000040 /* do not dump file */ -+#define EXT3COW_NOATIME_FL 0x00000080 /* do not update atime */ -+/* Reserved for compression usage... 
*/ -+#define EXT3COW_DIRTY_FL 0x00000100 -+#define EXT3COW_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ -+#define EXT3COW_NOCOMPR_FL 0x00000400 /* Don't compress */ -+#define EXT3COW_ECOMPR_FL 0x00000800 /* Compression error */ -+/* End compression flags --- maybe not all used */ -+#define EXT3COW_INDEX_FL 0x00001000 /* hash-indexed directory */ -+#define EXT3COW_IMAGIC_FL 0x00002000 /* AFS directory */ -+#define EXT3COW_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ -+#define EXT3COW_NOTAIL_FL 0x00008000 /* file tail should not be merged */ -+#define EXT3COW_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ -+#define EXT3COW_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+/* Used for Versioning - znjp */ -+#define EXT3COW_UNCHANGEABLE_FL 0x00040000 -+#define EXT3COW_UNVERSIONABLE_FL 0x00080000 -+#define EXT3COW_FAKEINODE_FL 0x00100000 -+#define EXT3COW_RESERVED_FL 0x80000000 /* reserved for ext3cow lib */ -+ -+#define EXT3COW_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -+#define EXT3COW_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ -+ -+/* -+ * Inode dynamic state flags -+ */ -+#define EXT3COW_STATE_JDATA 0x00000001 /* journaled data exists */ -+#define EXT3COW_STATE_NEW 0x00000002 /* inode is newly created */ -+#define EXT3COW_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -+ -+/* Used to pass group descriptor data when online resize is done */ -+struct ext3cow_new_group_input { -+ __u32 group; /* Group number for this data */ -+ __u32 block_bitmap; /* Absolute block number of block bitmap */ -+ __u32 inode_bitmap; /* Absolute block number of inode bitmap */ -+ __u32 inode_table; /* Absolute block number of inode table start */ -+ __u32 blocks_count; /* Total number of blocks in this group */ -+ __u16 reserved_blocks; /* Number of reserved blocks in this group */ -+ __u16 unused; -+}; -+ -+/* The struct ext3cow_new_group_input in kernel space, with free_blocks_count */ -+struct ext3cow_new_group_data { -+ __u32 group; -+ __u32 block_bitmap; -+ __u32 inode_bitmap; -+ __u32 inode_table; -+ __u32 blocks_count; -+ __u16 reserved_blocks; -+ __u16 unused; -+ __u32 free_blocks_count; -+}; -+ -+ -+/* -+ * ioctl commands -+ */ -+#define EXT3COW_IOC_GETFLAGS FS_IOC_GETFLAGS -+#define EXT3COW_IOC_SETFLAGS FS_IOC_SETFLAGS -+#define EXT3COW_IOC_GETVERSION _IOR('f', 3, long) -+#define EXT3COW_IOC_SETVERSION _IOW('f', 4, long) -+#define EXT3COW_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -+#define EXT3COW_IOC_GROUP_ADD _IOW('f', 8,struct ext3cow_new_group_input) -+#define EXT3COW_IOC_GETVERSION_OLD FS_IOC_GETVERSION -+#define EXT3COW_IOC_SETVERSION_OLD FS_IOC_SETVERSION -+#ifdef CONFIG_JBD_DEBUG -+#define EXT3COW_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) -+#endif -+#define EXT3COW_IOC_GETRSVSZ _IOR('f', 5, long) -+#define EXT3COW_IOC_SETRSVSZ _IOW('f', 6, long) -+/* ioctls for versioning - znjp */ -+#define EXT3COW_IOC_TAKESNAPSHOT _IOR('f', 7, long) -+#define EXT3COW_IOC_GETEPOCH _IOR('f', 8, long) -+ -+/* -+ * ioctl commands in 32 bit emulation -+ */ -+#define EXT3COW_IOC32_GETFLAGS FS_IOC32_GETFLAGS -+#define EXT3COW_IOC32_SETFLAGS FS_IOC32_SETFLAGS -+#define EXT3COW_IOC32_GETVERSION _IOR('f', 3, int) -+#define EXT3COW_IOC32_SETVERSION _IOW('f', 4, int) -+#define EXT3COW_IOC32_GETRSVSZ _IOR('f', 5, int) -+#define EXT3COW_IOC32_SETRSVSZ _IOW('f', 6, int) -+#define EXT3COW_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) -+#ifdef CONFIG_JBD_DEBUG -+#define EXT3COW_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) -+#endif -+#define 
EXT3COW_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION -+#define EXT3COW_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION -+ -+ -+/* -+ * Mount options -+ */ -+struct ext3cow_mount_options { -+ unsigned long s_mount_opt; -+ uid_t s_resuid; -+ gid_t s_resgid; -+ unsigned long s_commit_interval; -+#ifdef CONFIG_QUOTA -+ int s_jquota_fmt; -+ char *s_qf_names[MAXQUOTAS]; -+#endif -+}; -+ -+/* -+ * Structure of an inode on the disk -+ */ -+struct ext3cow_inode { -+ __le16 i_mode; /* File mode */ -+ __le16 i_uid; /* Low 16 bits of Owner Uid */ -+ __le32 i_size; /* Size in bytes */ -+ __le32 i_atime; /* Access time */ -+ __le32 i_ctime; /* Creation time */ -+ __le32 i_mtime; /* Modification time */ -+ __le32 i_dtime; /* Deletion Time */ -+ __le16 i_gid; /* Low 16 bits of Group Id */ -+ __le16 i_links_count; /* Links count */ -+ __le32 i_blocks; /* Blocks count */ -+ __le32 i_flags; /* File flags */ -+ union { -+ struct { -+ //__u32 l_i_reserved1; -+ /* Direct block COW bitmap -znjp */ -+ __u16 l_i_direct_cow_bitmap; -+ __u16 l_i_pad1; -+ } linux1; -+ struct { -+ __u32 h_i_translator; -+ } hurd1; -+ struct { -+ __u32 m_i_reserved1; -+ } masix1; -+ } osd1; /* OS dependent 1 */ -+ __le32 i_block[EXT3COW_N_BLOCKS];/* Pointers to blocks */ -+ __le32 i_generation; /* File version (for NFS) */ -+ __le32 i_file_acl; /* File ACL */ -+ __le32 i_dir_acl; /* Directory ACL */ -+ __le32 i_faddr; /* Fragment address */ -+ union { -+ struct { -+ //__u8 l_i_frag; /* Fragment number */ -+ //__u8 l_i_fsize; /* Fragment size */ -+ //__u16 i_pad1; -+ __le16 l_i_uid_high; /* these 2 fields */ -+ __le16 l_i_gid_high; /* were reserved2[0] */ -+ //__u32 l_i_reserved2; -+ /* Epoch number for versioning -znjp */ -+ __le32 l_i_epoch_number; -+ __u32 l_i_next_inode; -+ } linux2; -+ struct { -+ __u8 h_i_frag; /* Fragment number */ -+ __u8 h_i_fsize; /* Fragment size */ -+ __u16 h_i_mode_high; -+ __u16 h_i_uid_high; -+ __u16 h_i_gid_high; -+ __u32 h_i_author; -+ } hurd2; -+ struct { -+ __u8 m_i_frag; /* Fragment number */ -+ __u8 m_i_fsize; /* Fragment size */ -+ __u16 m_pad1; -+ __u32 m_i_reserved2[2]; -+ } masix2; -+ } osd2; /* OS dependent 2 */ -+ __le16 i_extra_isize; -+ __le16 i_pad1; -+}; -+ -+#define i_size_high i_dir_acl -+ -+#if defined(__KERNEL__) || defined(__linux__) -+/* For versioning -znjp */ -+//#define i_reserved1 osd1.linux1.l_i_reserved1 -+#define i_cowbitmap osd1.linux1.l_i_direct_cow_bitmap -+//#define i_frag osd2.linux2.l_i_frag -+//#define i_fsize osd2.linux2.l_i_fsize -+#define i_uid_low i_uid -+#define i_gid_low i_gid -+/* For versioning -znjp */ -+#define i_uid_high osd2.linux2.l_i_uid_high -+#define i_gid_high osd2.linux2.l_i_gid_high -+//#define i_reserved2 osd2.linux2.l_i_reserved2 -+#define i_epch_number osd2.linux2.l_i_epoch_number -+#define i_nxt_inode osd2.linux2.l_i_next_inode -+ -+#elif defined(__GNU__) -+ -+#define i_translator osd1.hurd1.h_i_translator -+#define i_frag osd2.hurd2.h_i_frag; -+#define i_fsize osd2.hurd2.h_i_fsize; -+#define i_uid_high osd2.hurd2.h_i_uid_high -+#define i_gid_high osd2.hurd2.h_i_gid_high -+#define i_author osd2.hurd2.h_i_author -+ -+#elif defined(__masix__) -+ -+#define i_reserved1 osd1.masix1.m_i_reserved1 -+#define i_frag osd2.masix2.m_i_frag -+#define i_fsize osd2.masix2.m_i_fsize -+#define i_reserved2 osd2.masix2.m_i_reserved2 -+ -+#endif /* defined(__KERNEL__) || defined(__linux__) */ -+ -+/* -+ * File system states -+ */ -+#define EXT3COW_VALID_FS 0x0001 /* Unmounted cleanly */ -+#define EXT3COW_ERROR_FS 0x0002 /* Errors detected */ -+#define EXT3COW_ORPHAN_FS 0x0004 
/* Orphans being recovered */ -+ -+/* -+ * Mount flags -+ */ -+#define EXT3COW_MOUNT_CHECK 0x00001 /* Do mount-time checks */ -+#define EXT3COW_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ -+#define EXT3COW_MOUNT_GRPID 0x00004 /* Create files with directory's group */ -+#define EXT3COW_MOUNT_DEBUG 0x00008 /* Some debugging messages */ -+#define EXT3COW_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -+#define EXT3COW_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ -+#define EXT3COW_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ -+#define EXT3COW_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ -+#define EXT3COW_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -+#define EXT3COW_MOUNT_ABORT 0x00200 /* Fatal error detected */ -+#define EXT3COW_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ -+#define EXT3COW_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ -+#define EXT3COW_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ -+#define EXT3COW_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ -+#define EXT3COW_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ -+#define EXT3COW_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ -+#define EXT3COW_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ -+#define EXT3COW_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -+#define EXT3COW_MOUNT_RESERVATION 0x10000 /* Preallocation */ -+#define EXT3COW_MOUNT_BARRIER 0x20000 /* Use block barriers */ -+#define EXT3COW_MOUNT_NOBH 0x40000 /* No bufferheads */ -+#define EXT3COW_MOUNT_QUOTA 0x80000 /* Some quota option set */ -+#define EXT3COW_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -+#define EXT3COW_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -+ -+/* Compatibility, for having both ext2_fs.h and ext3cow_fs.h included at once */ -+#ifndef _LINUX_EXT2_FS_H -+#define clear_opt(o, opt) o &= ~EXT3COW_MOUNT_##opt -+#define set_opt(o, opt) o |= EXT3COW_MOUNT_##opt -+#define test_opt(sb, opt) (EXT3COW_SB(sb)->s_mount_opt & \ -+ EXT3COW_MOUNT_##opt) -+#else -+#define EXT2_MOUNT_NOLOAD EXT3COW_MOUNT_NOLOAD -+#define EXT2_MOUNT_ABORT EXT3COW_MOUNT_ABORT -+#define EXT2_MOUNT_DATA_FLAGS EXT3COW_MOUNT_DATA_FLAGS -+#endif -+ -+#define ext3cow_set_bit ext2_set_bit -+#define ext3cow_set_bit_atomic ext2_set_bit_atomic -+#define ext3cow_clear_bit ext2_clear_bit -+#define ext3cow_clear_bit_atomic ext2_clear_bit_atomic -+#define ext3cow_test_bit ext2_test_bit -+#define ext3cow_find_first_zero_bit ext2_find_first_zero_bit -+#define ext3cow_find_next_zero_bit ext2_find_next_zero_bit -+ -+/* -+ * Maximal mount counts between two filesystem checks -+ */ -+#define EXT3COW_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ -+#define EXT3COW_DFL_CHECKINTERVAL 0 /* Don't use interval check */ -+ -+/* -+ * Behaviour when detecting errors -+ */ -+#define EXT3COW_ERRORS_CONTINUE 1 /* Continue execution */ -+#define EXT3COW_ERRORS_RO 2 /* Remount fs read-only */ -+#define EXT3COW_ERRORS_PANIC 3 /* Panic */ -+#define EXT3COW_ERRORS_DEFAULT EXT3COW_ERRORS_CONTINUE -+ -+/* -+ * Structure of the super block -+ */ -+struct ext3cow_super_block { -+/*00*/ __le32 s_inodes_count; /* Inodes count */ -+ __le32 s_blocks_count; /* Blocks count */ -+ __le32 s_r_blocks_count; /* Reserved blocks count */ -+ __le32 s_free_blocks_count; /* Free blocks count */ -+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ -+ __le32 s_first_data_block; /* First Data Block */ -+ __le32 s_log_block_size; /* Block size */ -+ __le32 s_log_frag_size; /* Fragment size */ -+/*20*/ __le32 
s_blocks_per_group; /* # Blocks per group */ -+ __le32 s_frags_per_group; /* # Fragments per group */ -+ __le32 s_inodes_per_group; /* # Inodes per group */ -+ __le32 s_mtime; /* Mount time */ -+/*30*/ __le32 s_wtime; /* Write time */ -+ __le16 s_mnt_count; /* Mount count */ -+ __le16 s_max_mnt_count; /* Maximal mount count */ -+ __le16 s_magic; /* Magic signature */ -+ __le16 s_state; /* File system state */ -+ __le16 s_errors; /* Behaviour when detecting errors */ -+ __le16 s_minor_rev_level; /* minor revision level */ -+/*40*/ __le32 s_lastcheck; /* time of last check */ -+ __le32 s_checkinterval; /* max. time between checks */ -+ __le32 s_creator_os; /* OS */ -+ __le32 s_rev_level; /* Revision level */ -+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ -+ __le16 s_def_resgid; /* Default gid for reserved blocks */ -+ /* -+ * These fields are for EXT3COW_DYNAMIC_REV superblocks only. -+ * -+ * Note: the difference between the compatible feature set and -+ * the incompatible feature set is that if there is a bit set -+ * in the incompatible feature set that the kernel doesn't -+ * know about, it should refuse to mount the filesystem. -+ * -+ * e2fsck's requirements are more strict; if it doesn't know -+ * about a feature in either the compatible or incompatible -+ * feature set, it must abort and not try to meddle with -+ * things it doesn't understand... -+ */ -+ __le32 s_first_ino; /* First non-reserved inode */ -+ __le16 s_inode_size; /* size of inode structure */ -+ __le16 s_block_group_nr; /* block group # of this superblock */ -+ __le32 s_feature_compat; /* compatible feature set */ -+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ -+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ -+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -+/*78*/ char s_volume_name[16]; /* volume name */ -+/*88*/ char s_last_mounted[64]; /* directory where last mounted */ -+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ -+ /* -+ * Performance hints. Directory preallocation should only -+ * happen if the EXT3COW_FEATURE_COMPAT_DIR_PREALLOC flag is on. -+ */ -+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ -+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ -+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ -+ /* -+ * Journaling support valid if EXT3COW_FEATURE_COMPAT_HAS_JOURNAL set. 
-+ */ -+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ -+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ -+ __le32 s_journal_dev; /* device number of journal file */ -+ __le32 s_last_orphan; /* start of list of inodes to delete */ -+ __le32 s_hash_seed[4]; /* HTREE hash seed */ -+ __u8 s_def_hash_version; /* Default hash version to use */ -+ __u8 s_reserved_char_pad; -+ __u16 s_reserved_word_pad; -+ __le32 s_default_mount_opts; -+ __le32 s_first_meta_bg; /* First metablock block group */ -+ /* Added for version - znjp */ -+ __le32 s_epoch_number; -+ __u32 s_reserved[189]; /* Padding to the end of the block */ -+}; -+ -+#ifdef __KERNEL__ -+#include -+#include -+static inline struct ext3cow_sb_info * EXT3COW_SB(struct super_block *sb) -+{ -+ return sb->s_fs_info; -+} -+static inline struct ext3cow_inode_info *EXT3COW_I(struct inode *inode) -+{ -+ return container_of(inode, struct ext3cow_inode_info, vfs_inode); -+} -+ -+static inline int ext3cow_valid_inum(struct super_block *sb, unsigned long ino) -+{ -+ return ino == EXT3COW_ROOT_INO || -+ ino == EXT3COW_JOURNAL_INO || -+ ino == EXT3COW_RESIZE_INO || -+ (ino >= EXT3COW_FIRST_INO(sb) && -+ ino <= le32_to_cpu(EXT3COW_SB(sb)->s_es->s_inodes_count)); -+} -+#else -+/* Assume that user mode programs are passing in an ext3cowfs superblock, not -+ * a kernel struct super_block. This will allow us to call the feature-test -+ * macros from user land. */ -+#define EXT3COW_SB(sb) (sb) -+#endif -+ -+#define NEXT_ORPHAN(inode) EXT3COW_I(inode)->i_dtime -+ -+/* -+ * Codes for operating systems -+ */ -+#define EXT3COW_OS_LINUX 0 -+#define EXT3COW_OS_HURD 1 -+#define EXT3COW_OS_MASIX 2 -+#define EXT3COW_OS_FREEBSD 3 -+#define EXT3COW_OS_LITES 4 -+ -+/* -+ * Revision levels -+ */ -+#define EXT3COW_GOOD_OLD_REV 0 /* The good old (original) format */ -+#define EXT3COW_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ -+ -+#define EXT3COW_CURRENT_REV EXT3COW_GOOD_OLD_REV -+#define EXT3COW_MAX_SUPP_REV EXT3COW_DYNAMIC_REV -+ -+#define EXT3COW_GOOD_OLD_INODE_SIZE 128 -+ -+/* -+ * Feature set definitions -+ */ -+ -+#define EXT3COW_HAS_COMPAT_FEATURE(sb,mask) \ -+ ( EXT3COW_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) -+#define EXT3COW_HAS_RO_COMPAT_FEATURE(sb,mask) \ -+ ( EXT3COW_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) -+#define EXT3COW_HAS_INCOMPAT_FEATURE(sb,mask) \ -+ ( EXT3COW_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) -+#define EXT3COW_SET_COMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -+#define EXT3COW_SET_RO_COMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -+#define EXT3COW_SET_INCOMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -+#define EXT3COW_CLEAR_COMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -+#define EXT3COW_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -+#define EXT3COW_CLEAR_INCOMPAT_FEATURE(sb,mask) \ -+ EXT3COW_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) -+ -+#define EXT3COW_FEATURE_COMPAT_DIR_PREALLOC 0x0001 -+#define EXT3COW_FEATURE_COMPAT_IMAGIC_INODES 0x0002 -+#define EXT3COW_FEATURE_COMPAT_HAS_JOURNAL 0x0004 -+#define EXT3COW_FEATURE_COMPAT_EXT_ATTR 0x0008 -+#define EXT3COW_FEATURE_COMPAT_RESIZE_INODE 0x0010 -+#define EXT3COW_FEATURE_COMPAT_DIR_INDEX 0x0020 -+ -+#define EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 -+#define 
EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 -+#define EXT3COW_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 -+ -+#define EXT3COW_FEATURE_INCOMPAT_COMPRESSION 0x0001 -+#define EXT3COW_FEATURE_INCOMPAT_FILETYPE 0x0002 -+#define EXT3COW_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ -+#define EXT3COW_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ -+#define EXT3COW_FEATURE_INCOMPAT_META_BG 0x0010 -+ -+#define EXT3COW_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR -+#define EXT3COW_FEATURE_INCOMPAT_SUPP (EXT3COW_FEATURE_INCOMPAT_FILETYPE| \ -+ EXT3COW_FEATURE_INCOMPAT_RECOVER| \ -+ EXT3COW_FEATURE_INCOMPAT_META_BG) -+#define EXT3COW_FEATURE_RO_COMPAT_SUPP (EXT3COW_FEATURE_RO_COMPAT_SPARSE_SUPER| \ -+ EXT3COW_FEATURE_RO_COMPAT_LARGE_FILE| \ -+ EXT3COW_FEATURE_RO_COMPAT_BTREE_DIR) -+ -+/* -+ * Default values for user and/or group using reserved blocks -+ */ -+#define EXT3COW_DEF_RESUID 0 -+#define EXT3COW_DEF_RESGID 0 -+ -+/* -+ * Default mount options -+ */ -+#define EXT3COW_DEFM_DEBUG 0x0001 -+#define EXT3COW_DEFM_BSDGROUPS 0x0002 -+#define EXT3COW_DEFM_XATTR_USER 0x0004 -+#define EXT3COW_DEFM_ACL 0x0008 -+#define EXT3COW_DEFM_UID16 0x0010 -+#define EXT3COW_DEFM_JMODE 0x0060 -+#define EXT3COW_DEFM_JMODE_DATA 0x0020 -+#define EXT3COW_DEFM_JMODE_ORDERED 0x0040 -+#define EXT3COW_DEFM_JMODE_WBACK 0x0060 -+ -+/* -+ * Structure of a directory entry -+ */ -+#define EXT3COW_NAME_LEN 255 -+ -+struct ext3cow_dir_entry { -+ __le32 inode; /* Inode number */ -+ __le16 rec_len; /* Directory entry length */ -+ __le16 name_len; /* Name length */ -+ char name[EXT3COW_NAME_LEN]; /* File name */ -+}; -+ -+/* -+ * The new version of the directory entry. Since EXT3COW structures are -+ * stored in intel byte order, and the name_len field could never be -+ * bigger than 255 chars, it's safe to reclaim the extra byte for the -+ * file_type field. -+ */ -+struct ext3cow_dir_entry_2 { -+ __le32 inode; /* Inode number */ -+ __le16 rec_len; /* Directory entry length */ -+ __u8 name_len; /* Name length */ -+ __u8 file_type; -+ /* Added for versioning - znjp */ -+ __u32 birth_epoch; -+ __u32 death_epoch; -+ char name[EXT3COW_NAME_LEN]; /* File name */ -+}; -+ -+/* -+ * Ext3 directory file types. Only the low 3 bits are used. The -+ * other bits are reserved for now. 
-+ */ -+#define EXT3COW_FT_UNKNOWN 0 -+#define EXT3COW_FT_REG_FILE 1 -+#define EXT3COW_FT_DIR 2 -+#define EXT3COW_FT_CHRDEV 3 -+#define EXT3COW_FT_BLKDEV 4 -+#define EXT3COW_FT_FIFO 5 -+#define EXT3COW_FT_SOCK 6 -+#define EXT3COW_FT_SYMLINK 7 -+ -+#define EXT3COW_FT_MAX 8 -+ -+/* Versioning macros - znjp */ -+#define EXT3COW_DIRENT_ALIVE 0 -+#define EXT3COW_IS_DIRENT_ALIVE(de) ((le32_to_cpu(de->death_epoch) == EXT3COW_DIRENT_ALIVE)) -+#define EXT3COW_IS_DIRENT_SCOPED(de, epoch) \ -+((le32_to_cpu(de->birth_epoch) <= epoch) && \ -+(EXT3COW_IS_DIRENT_ALIVE(de) || (!EXT3COW_IS_DIRENT_ALIVE(de) && \ -+le32_to_cpu(de->death_epoch) > epoch))) -+#define EXT3COW_I_EPOCHNUMBER(inode) (((unsigned int)EXT3COW_I(inode)->i_epoch_number)) -+#define EXT3COW_S_EPOCHNUMBER(sb) (((unsigned int)EXT3COW_SB(sb)->s_epoch_number)) -+#define EXT3COW_I_NEXT_INODE(inode) (((unsigned int)EXT3COW_I(inode)->i_next_inode)) -+#define EXT3COW_IS_UNVERSIONABLE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_UNVERSIONABLE_FL)) -+#define EXT3COW_IS_UNCHANGEABLE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_UNCHANGEABLE_FL)) -+#define EXT3COW_IS_FAKEINODE(inode) (((unsigned int)EXT3COW_I(inode)->i_flags & EXT3COW_FAKEINODE_FL)) -+ -+ -+/* -+ * EXT3COW_DIR_PAD defines the directory entries boundaries -+ * -+ * NOTE: It must be a multiple of 4 -+ */ -+#define EXT3COW_DIR_PAD 4 -+#define EXT3COW_DIR_ROUND (EXT3COW_DIR_PAD - 1) -+/* Added 8 to account for birth and death epochs -znjp */ -+#define EXT3COW_DIR_REC_LEN(name_len) (((name_len) + 16 + EXT3COW_DIR_ROUND) & \ -+ ~EXT3COW_DIR_ROUND) -+/* -+ * Hash Tree Directory indexing -+ * (c) Daniel Phillips, 2001 -+ */ -+ -+#ifdef CONFIG_EXT3COW_INDEX -+ #define is_dx(dir) (EXT3COW_HAS_COMPAT_FEATURE(dir->i_sb, \ -+ EXT3COW_FEATURE_COMPAT_DIR_INDEX) && \ -+ (EXT3COW_I(dir)->i_flags & EXT3COW_INDEX_FL)) -+#define EXT3COW_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3COW_LINK_MAX) -+#define EXT3COW_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) -+#else -+ #define is_dx(dir) 0 -+#define EXT3COW_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3COW_LINK_MAX) -+#define EXT3COW_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) -+#endif -+ -+/* Legal values for the dx_root hash_version field: */ -+ -+#define DX_HASH_LEGACY 0 -+#define DX_HASH_HALF_MD4 1 -+#define DX_HASH_TEA 2 -+ -+#ifdef __KERNEL__ -+ -+/* hash info structure used by the directory hash */ -+struct dx_hash_info -+{ -+ u32 hash; -+ u32 minor_hash; -+ int hash_version; -+ u32 *seed; -+}; -+ -+#define EXT3COW_HTREE_EOF 0x7fffffff -+ -+/* -+ * Control parameters used by ext3cow_htree_next_block -+ */ -+#define HASH_NB_ALWAYS 1 -+ -+ -+/* -+ * Describe an inode's exact location on disk and in memory -+ */ -+struct ext3cow_iloc -+{ -+ struct buffer_head *bh; -+ unsigned long offset; -+ unsigned long block_group; -+}; -+ -+static inline struct ext3cow_inode *ext3cow_raw_inode(struct ext3cow_iloc *iloc) -+{ -+ return (struct ext3cow_inode *) (iloc->bh->b_data + iloc->offset); -+} -+ -+/* -+ * This structure is stuffed into the struct file's private_data field -+ * for directories. It is where we put information so that we can do -+ * readdir operations in hash tree order. 
-+ */ -+struct dir_private_info { -+ struct rb_root root; -+ struct rb_node *curr_node; -+ struct fname *extra_fname; -+ loff_t last_pos; -+ __u32 curr_hash; -+ __u32 curr_minor_hash; -+ __u32 next_hash; -+}; -+ -+/* calculate the first block number of the group */ -+static inline ext3cow_fsblk_t -+ext3cow_group_first_block_no(struct super_block *sb, unsigned long group_no) -+{ -+ return group_no * (ext3cow_fsblk_t)EXT3COW_BLOCKS_PER_GROUP(sb) + -+ le32_to_cpu(EXT3COW_SB(sb)->s_es->s_first_data_block); -+} -+ -+/* -+ * Special error return code only used by dx_probe() and its callers. -+ */ -+#define ERR_BAD_DX_DIR -75000 -+ -+/* -+ * Function prototypes -+ */ -+ -+/* -+ * Ok, these declarations are also in but none of the -+ * ext3cow source programs needs to include it so they are duplicated here. -+ */ -+# define NORET_TYPE /**/ -+# define ATTRIB_NORET __attribute__((noreturn)) -+# define NORET_AND noreturn, -+ -+/* balloc.c */ -+extern int ext3cow_bg_has_super(struct super_block *sb, int group); -+extern unsigned long ext3cow_bg_num_gdb(struct super_block *sb, int group); -+extern ext3cow_fsblk_t ext3cow_new_block (handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t goal, int *errp); -+extern ext3cow_fsblk_t ext3cow_new_blocks (handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t goal, unsigned long *count, int *errp); -+extern void ext3cow_free_blocks (handle_t *handle, struct inode *inode, -+ ext3cow_fsblk_t block, unsigned long count); -+extern void ext3cow_free_blocks_sb (handle_t *handle, struct super_block *sb, -+ ext3cow_fsblk_t block, unsigned long count, -+ unsigned long *pdquot_freed_blocks); -+extern ext3cow_fsblk_t ext3cow_count_free_blocks (struct super_block *); -+extern void ext3cow_check_blocks_bitmap (struct super_block *); -+extern struct ext3cow_group_desc * ext3cow_get_group_desc(struct super_block * sb, -+ unsigned int block_group, -+ struct buffer_head ** bh); -+extern int ext3cow_should_retry_alloc(struct super_block *sb, int *retries); -+extern void ext3cow_init_block_alloc_info(struct inode *); -+extern void ext3cow_rsv_window_add(struct super_block *sb, struct ext3cow_reserve_window_node *rsv); -+ -+ -+/* dir.c */ -+extern int ext3cow_check_dir_entry(const char *, struct inode *, -+ struct ext3cow_dir_entry_2 *, -+ struct buffer_head *, unsigned long); -+extern int ext3cow_htree_store_dirent(struct file *dir_file, __u32 hash, -+ __u32 minor_hash, -+ struct ext3cow_dir_entry_2 *dirent); -+extern void ext3cow_htree_free_dir_info(struct dir_private_info *p); -+ -+/* fsync.c */ -+extern int ext3cow_sync_file (struct file *, struct dentry *, int); -+ -+/* hash.c */ -+extern int ext3cowfs_dirhash(const char *name, int len, struct -+ dx_hash_info *hinfo); -+ -+/* ialloc.c */ -+extern struct inode * ext3cow_new_inode (handle_t *, struct inode *, int); -+extern void ext3cow_free_inode (handle_t *, struct inode *); -+extern struct inode * ext3cow_orphan_get (struct super_block *, unsigned long); -+extern unsigned long ext3cow_count_free_inodes (struct super_block *); -+extern unsigned long ext3cow_count_dirs (struct super_block *); -+extern void ext3cow_check_inodes_bitmap (struct super_block *); -+extern unsigned long ext3cow_count_free (struct buffer_head *, unsigned); -+ -+ -+/* inode.c */ -+int ext3cow_forget(handle_t *handle, int is_metadata, struct inode *inode, -+ struct buffer_head *bh, ext3cow_fsblk_t blocknr); -+struct buffer_head * ext3cow_getblk (handle_t *, struct inode *, long, int, int *); -+struct buffer_head * ext3cow_bread (handle_t *, 
struct inode *, int, int, int *); -+int ext3cow_get_blocks_handle(handle_t *handle, struct inode *inode, -+ sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, -+ int create, int extend_disksize); -+ -+extern void ext3cow_read_inode (struct inode *); -+extern int ext3cow_write_inode (struct inode *, int); -+extern int ext3cow_setattr (struct dentry *, struct iattr *); -+extern void ext3cow_delete_inode (struct inode *); -+extern int ext3cow_sync_inode (handle_t *, struct inode *); -+extern void ext3cow_discard_reservation (struct inode *); -+extern void ext3cow_dirty_inode(struct inode *); -+extern int ext3cow_change_inode_journal_flag(struct inode *, int); -+extern int ext3cow_get_inode_loc(struct inode *, struct ext3cow_iloc *); -+extern void ext3cow_truncate (struct inode *); -+extern void ext3cow_set_inode_flags(struct inode *); -+extern void ext3cow_set_aops(struct inode *inode); -+ -+/* ioctl.c */ -+extern int ext3cow_ioctl (struct inode *, struct file *, unsigned int, -+ unsigned long); -+extern long ext3cow_compat_ioctl (struct file *, unsigned int, unsigned long); -+ -+/* namei.c */ -+extern int is_unchangeable(struct inode *, struct dentry *); -+extern int ext3cow_orphan_add(handle_t *, struct inode *); -+extern int ext3cow_orphan_del(handle_t *, struct inode *); -+extern int ext3cow_htree_fill_tree(struct file *dir_file, __u32 start_hash, -+ __u32 start_minor_hash, __u32 *next_hash); -+extern struct inode *ext3cow_fake_inode(struct inode *, unsigned int); -+extern int ext3cow_dup_inode(struct inode *, struct inode *); -+extern int ext3cow_reclaim_dup_inode(struct inode *, struct inode *); -+ -+/* resize.c */ -+extern int ext3cow_group_add(struct super_block *sb, -+ struct ext3cow_new_group_data *input); -+extern int ext3cow_group_extend(struct super_block *sb, -+ struct ext3cow_super_block *es, -+ ext3cow_fsblk_t n_blocks_count); -+ -+/* super.c */ -+extern void ext3cow_error (struct super_block *, const char *, const char *, ...) -+ __attribute__ ((format (printf, 3, 4))); -+extern void __ext3cow_std_error (struct super_block *, const char *, int); -+extern void ext3cow_abort (struct super_block *, const char *, const char *, ...) -+ __attribute__ ((format (printf, 3, 4))); -+extern void ext3cow_warning (struct super_block *, const char *, const char *, ...) 
-+ __attribute__ ((format (printf, 3, 4))); -+extern void ext3cow_update_dynamic_rev (struct super_block *sb); -+extern unsigned int ext3cow_take_snapshot(struct super_block *sb); -+ -+#define ext3cow_std_error(sb, errno) \ -+do { \ -+ if ((errno)) \ -+ __ext3cow_std_error((sb), __FUNCTION__, (errno)); \ -+} while (0) -+ -+/* -+ * Inodes and files operations -+ */ -+ -+/* dir.c */ -+extern const struct file_operations ext3cow_dir_operations; -+ -+/* file.c */ -+extern struct inode_operations ext3cow_file_inode_operations; -+extern const struct file_operations ext3cow_file_operations; -+ -+/* namei.c */ -+extern struct inode_operations ext3cow_dir_inode_operations; -+extern struct inode_operations ext3cow_special_inode_operations; -+ -+/* symlink.c */ -+extern struct inode_operations ext3cow_symlink_inode_operations; -+extern struct inode_operations ext3cow_fast_symlink_inode_operations; -+ -+ -+#endif /* __KERNEL__ */ -+ -+#endif /* _LINUX_EXT3COW_FS_H */ -diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs_i.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_i.h ---- linux-2.6.20.3/include/linux/ext3cow_fs_i.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_i.h 2008-03-09 11:10:55.000000000 -0400 -@@ -0,0 +1,152 @@ -+/* -+ * linux/include/linux/ext3cow_fs_i.h -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/include/linux/minix_fs_i.h -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ */ -+ -+#ifndef _LINUX_EXT3COW_FS_I -+#define _LINUX_EXT3COW_FS_I -+ -+#include -+#include -+#include -+#include -+ -+/* data type for block offset of block group */ -+typedef int ext3cow_grpblk_t; -+ -+/* data type for filesystem-wide blocks number */ -+typedef unsigned long ext3cow_fsblk_t; -+ -+#define E3FSBLK "%lu" -+ -+struct ext3cow_reserve_window { -+ ext3cow_fsblk_t _rsv_start; /* First byte reserved */ -+ ext3cow_fsblk_t _rsv_end; /* Last byte reserved or 0 */ -+}; -+ -+struct ext3cow_reserve_window_node { -+ struct rb_node rsv_node; -+ __u32 rsv_goal_size; -+ __u32 rsv_alloc_hit; -+ struct ext3cow_reserve_window rsv_window; -+}; -+ -+struct ext3cow_block_alloc_info { -+ /* information about reservation window */ -+ struct ext3cow_reserve_window_node rsv_window_node; -+ /* -+ * was i_next_alloc_block in ext3cow_inode_info -+ * is the logical (file-relative) number of the -+ * most-recently-allocated block in this file. -+ * We use this for detecting linearly ascending allocation requests. -+ */ -+ __u32 last_alloc_logical_block; -+ /* -+ * Was i_next_alloc_goal in ext3cow_inode_info -+ * is the *physical* companion to i_next_alloc_block. -+ * it the the physical block number of the block which was most-recentl -+ * allocated to this file. This give us the goal (target) for the next -+ * allocation when we detect linearly ascending requests. -+ */ -+ ext3cow_fsblk_t last_alloc_physical_block; -+}; -+ -+#define rsv_start rsv_window._rsv_start -+#define rsv_end rsv_window._rsv_end -+ -+/* -+ * third extended file system inode data in memory -+ */ -+struct ext3cow_inode_info { -+ __le32 i_data[15]; /* unconverted */ -+ __u32 i_flags; -+#ifdef EXT3COW_FRAGMENTS -+ __u32 i_faddr; -+ __u8 i_frag_no; -+ __u8 i_frag_size; -+#endif -+ ext3cow_fsblk_t i_file_acl; -+ __u32 i_dir_acl; -+ __u32 i_dtime; -+ -+ /* -+ * i_block_group is the number of the block group which contains -+ * this file's inode. 
Constant across the lifetime of the inode, -+ * it is ued for making block allocation decisions - we try to -+ * place a file's data blocks near its inode block, and new inodes -+ * near to their parent directory's inode. -+ */ -+ __u32 i_block_group; -+ __u32 i_state; /* Dynamic state flags for ext3cow */ -+ -+ /* block reservation info */ -+ struct ext3cow_block_alloc_info *i_block_alloc_info; -+ -+ __u32 i_dir_start_lookup; -+ -+ /* For versioning -znjp */ -+ __u16 i_cow_bitmap; -+ __u32 i_epoch_number; -+ __u32 i_next_inode; -+#ifdef CONFIG_EXT3COW_FS_XATTR -+ /* -+ * Extended attributes can be read independently of the main file -+ * data. Taking i_mutex even when reading would cause contention -+ * between readers of EAs and writers of regular file data, so -+ * instead we synchronize on xattr_sem when reading or changing -+ * EAs. -+ */ -+ struct rw_semaphore xattr_sem; -+#endif -+#ifdef CONFIG_EXT3COW_FS_POSIX_ACL -+ struct posix_acl *i_acl; -+ struct posix_acl *i_default_acl; -+#endif -+ -+ struct list_head i_orphan; /* unlinked but open inodes */ -+ -+ /* -+ * i_disksize keeps track of what the inode size is ON DISK, not -+ * in memory. During truncate, i_size is set to the new size by -+ * the VFS prior to calling ext3cow_truncate(), but the filesystem won't -+ * set i_disksize to 0 until the truncate is actually under way. -+ * -+ * The intent is that i_disksize always represents the blocks which -+ * are used by this file. This allows recovery to restart truncate -+ * on orphans if we crash during truncate. We actually write i_disksize -+ * into the on-disk inode when writing inodes out, instead of i_size. -+ * -+ * The only time when i_disksize and i_size may be different is when -+ * a truncate is in progress. The only things which change i_disksize -+ * are ext3cow_get_block (growth) and ext3cow_truncate (shrinkth). -+ */ -+ loff_t i_disksize; -+ -+ /* on-disk additional length */ -+ __u16 i_extra_isize; -+ -+ /* -+ * truncate_mutex is for serialising ext3cow_truncate() against -+ * ext3cow_getblock(). In the 2.4 ext2 design, great chunks of inode's -+ * data tree are chopped off during truncate. We can't do that in -+ * ext3cow because whenever we perform intermediate commits during -+ * truncate, the inode and all the metadata blocks *must* be in a -+ * consistent state which allows truncation of the orphans to restart -+ * during recovery. Hence we must fix the get_block-vs-truncate race -+ * by other means, so we have truncate_mutex. 
-+ */ -+ struct mutex truncate_mutex; -+ struct inode vfs_inode; -+}; -+ -+#endif /* _LINUX_EXT3COW_FS_I */ -diff -ruN linux-2.6.20.3/include/linux/ext3cow_fs_sb.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_sb.h ---- linux-2.6.20.3/include/linux/ext3cow_fs_sb.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_fs_sb.h 2008-03-09 11:10:57.000000000 -0400 -@@ -0,0 +1,86 @@ -+/* -+ * linux/include/linux/ext3cow_fs_sb.h -+ * -+ * Copyright (C) 1992, 1993, 1994, 1995 -+ * Remy Card (card@masi.ibp.fr) -+ * Laboratoire MASI - Institut Blaise Pascal -+ * Universite Pierre et Marie Curie (Paris VI) -+ * -+ * from -+ * -+ * linux/include/linux/minix_fs_sb.h -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ */ -+ -+#ifndef _LINUX_EXT3COW_FS_SB -+#define _LINUX_EXT3COW_FS_SB -+ -+#ifdef __KERNEL__ -+#include -+#include -+#include -+#include -+#endif -+#include -+ -+/* -+ * third extended-fs super-block data in memory -+ */ -+struct ext3cow_sb_info { -+ unsigned long s_frag_size; /* Size of a fragment in bytes */ -+ unsigned long s_frags_per_block;/* Number of fragments per block */ -+ unsigned long s_inodes_per_block;/* Number of inodes per block */ -+ unsigned long s_frags_per_group;/* Number of fragments in a group */ -+ unsigned long s_blocks_per_group;/* Number of blocks in a group */ -+ unsigned long s_inodes_per_group;/* Number of inodes in a group */ -+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ -+ unsigned long s_gdb_count; /* Number of group descriptor blocks */ -+ unsigned long s_desc_per_block; /* Number of group descriptors per block */ -+ unsigned long s_groups_count; /* Number of groups in the fs */ -+ struct buffer_head * s_sbh; /* Buffer containing the super block */ -+ struct ext3cow_super_block * s_es; /* Pointer to the super block in the buffer */ -+ struct buffer_head ** s_group_desc; -+ unsigned long s_mount_opt; -+ uid_t s_resuid; -+ gid_t s_resgid; -+ unsigned short s_mount_state; -+ unsigned short s_pad; -+ int s_addr_per_block_bits; -+ int s_desc_per_block_bits; -+ int s_inode_size; -+ int s_first_ino; -+ spinlock_t s_next_gen_lock; -+ u32 s_next_generation; -+ u32 s_hash_seed[4]; -+ int s_def_hash_version; -+ struct percpu_counter s_freeblocks_counter; -+ struct percpu_counter s_freeinodes_counter; -+ struct percpu_counter s_dirs_counter; -+ struct blockgroup_lock s_blockgroup_lock; -+ -+ /* root of the per fs reservation window tree */ -+ spinlock_t s_rsv_window_lock; -+ struct rb_root s_rsv_window_root; -+ struct ext3cow_reserve_window_node s_rsv_window_head; -+ -+ /* For versioning -znjp */ -+ u32 s_epoch_number; -+ -+ /* Journaling */ -+ struct inode * s_journal_inode; -+ struct journal_s * s_journal; -+ struct list_head s_orphan; -+ unsigned long s_commit_interval; -+ struct block_device *journal_bdev; -+#ifdef CONFIG_JBD_DEBUG -+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ -+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -+#endif -+#ifdef CONFIG_QUOTA -+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ -+ int s_jquota_fmt; /* Format of quota to use */ -+#endif -+}; -+ -+#endif /* _LINUX_EXT3COW_FS_SB */ -diff -ruN linux-2.6.20.3/include/linux/ext3cow_jbd.h linux-2.6.20.3-ext3cow/include/linux/ext3cow_jbd.h ---- linux-2.6.20.3/include/linux/ext3cow_jbd.h 1969-12-31 19:00:00.000000000 -0500 -+++ linux-2.6.20.3-ext3cow/include/linux/ext3cow_jbd.h 2008-03-09 11:10:56.000000000 -0400 -@@ -0,0 
+1,226 @@ -+/* -+ * linux/include/linux/ext3cow_jbd.h -+ * -+ * Written by Stephen C. Tweedie , 1999 -+ * -+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved -+ * -+ * This file is part of the Linux kernel and is made available under -+ * the terms of the GNU General Public License, version 2, or at your -+ * option, any later version, incorporated herein by reference. -+ * -+ * Ext3-specific journaling extensions. -+ */ -+ -+#ifndef _LINUX_EXT3COW_JBD_H -+#define _LINUX_EXT3COW_JBD_H -+ -+#include -+#include -+#include -+ -+#define EXT3COW_JOURNAL(inode) (EXT3COW_SB((inode)->i_sb)->s_journal) -+ -+/* Define the number of blocks we need to account to a transaction to -+ * modify one block of data. -+ * -+ * We may have to touch one inode, one bitmap buffer, up to three -+ * indirection blocks, the group and superblock summaries, and the data -+ * block to complete the transaction. */ -+ -+#define EXT3COW_SINGLEDATA_TRANS_BLOCKS 8U -+ -+/* Extended attribute operations touch at most two data buffers, -+ * two bitmap buffers, and two group summaries, in addition to the inode -+ * and the superblock, which are already accounted for. */ -+ -+#define EXT3COW_XATTR_TRANS_BLOCKS 6U -+ -+/* Define the minimum size for a transaction which modifies data. This -+ * needs to take into account the fact that we may end up modifying two -+ * quota files too (one for the group, one for the user quota). The -+ * superblock only gets updated once, of course, so don't bother -+ * counting that again for the quota updates. */ -+ -+#define EXT3COW_DATA_TRANS_BLOCKS(sb) (EXT3COW_SINGLEDATA_TRANS_BLOCKS + \ -+ EXT3COW_XATTR_TRANS_BLOCKS - 2 + \ -+ 2*EXT3COW_QUOTA_TRANS_BLOCKS(sb)) -+ -+/* Delete operations potentially hit one directory's namespace plus an -+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be -+ * generous. We can grow the delete transaction later if necessary. */ -+ -+#define EXT3COW_DELETE_TRANS_BLOCKS(sb) (2 * EXT3COW_DATA_TRANS_BLOCKS(sb) + 64) -+ -+/* Define an arbitrary limit for the amount of data we will anticipate -+ * writing to any given transaction. For unbounded transactions such as -+ * write(2) and truncate(2) we can write more than this, but we always -+ * start off at the maximum transaction size and grow the transaction -+ * optimistically as we go. */ -+ -+#define EXT3COW_MAX_TRANS_DATA 64U -+ -+/* We break up a large truncate or write transaction once the handle's -+ * buffer credits gets this low, we need either to extend the -+ * transaction or to start a new one. Reserve enough space here for -+ * inode, bitmap, superblock, group and indirection updates for at least -+ * one block, plus two quota updates. Quota allocations are not -+ * needed. */ -+ -+#define EXT3COW_RESERVE_TRANS_BLOCKS 12U -+ -+#define EXT3COW_INDEX_EXTRA_TRANS_BLOCKS 8 -+ -+#ifdef CONFIG_QUOTA -+/* Amount of blocks needed for quota update - we know that the structure was -+ * allocated so we need to update only inode+data */ -+#define EXT3COW_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) -+/* Amount of blocks needed for quota insert/delete - we do some block writes -+ * but inode, sb and group updates are done only once */ -+#define EXT3COW_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ -+ (EXT3COW_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) -+#define EXT3COW_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
(DQUOT_DEL_ALLOC*\ -+ (EXT3COW_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) -+#else -+#define EXT3COW_QUOTA_TRANS_BLOCKS(sb) 0 -+#define EXT3COW_QUOTA_INIT_BLOCKS(sb) 0 -+#define EXT3COW_QUOTA_DEL_BLOCKS(sb) 0 -+#endif -+ -+int -+ext3cow_mark_iloc_dirty(handle_t *handle, -+ struct inode *inode, -+ struct ext3cow_iloc *iloc); -+ -+/* -+ * On success, We end up with an outstanding reference count against -+ * iloc->bh. This _must_ be cleaned up later. -+ */ -+ -+int ext3cow_reserve_inode_write(handle_t *handle, struct inode *inode, -+ struct ext3cow_iloc *iloc); -+ -+int ext3cow_mark_inode_dirty(handle_t *handle, struct inode *inode); -+ -+/* -+ * Wrapper functions with which ext3cow calls into JBD. The intent here is -+ * to allow these to be turned into appropriate stubs so ext3cow can control -+ * ext2 filesystems, so ext2+ext3cow systems only nee one fs. This work hasn't -+ * been done yet. -+ */ -+ -+static inline void ext3cow_journal_release_buffer(handle_t *handle, -+ struct buffer_head *bh) -+{ -+ journal_release_buffer(handle, bh); -+} -+ -+void ext3cow_journal_abort_handle(const char *caller, const char *err_fn, -+ struct buffer_head *bh, handle_t *handle, int err); -+ -+int __ext3cow_journal_get_undo_access(const char *where, handle_t *handle, -+ struct buffer_head *bh); -+ -+int __ext3cow_journal_get_write_access(const char *where, handle_t *handle, -+ struct buffer_head *bh); -+ -+int __ext3cow_journal_forget(const char *where, handle_t *handle, -+ struct buffer_head *bh); -+ -+int __ext3cow_journal_revoke(const char *where, handle_t *handle, -+ unsigned long blocknr, struct buffer_head *bh); -+ -+int __ext3cow_journal_get_create_access(const char *where, -+ handle_t *handle, struct buffer_head *bh); -+ -+int __ext3cow_journal_dirty_metadata(const char *where, -+ handle_t *handle, struct buffer_head *bh); -+ -+#define ext3cow_journal_get_undo_access(handle, bh) \ -+ __ext3cow_journal_get_undo_access(__FUNCTION__, (handle), (bh)) -+#define ext3cow_journal_get_write_access(handle, bh) \ -+ __ext3cow_journal_get_write_access(__FUNCTION__, (handle), (bh)) -+#define ext3cow_journal_revoke(handle, blocknr, bh) \ -+ __ext3cow_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) -+#define ext3cow_journal_get_create_access(handle, bh) \ -+ __ext3cow_journal_get_create_access(__FUNCTION__, (handle), (bh)) -+#define ext3cow_journal_dirty_metadata(handle, bh) \ -+ __ext3cow_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) -+#define ext3cow_journal_forget(handle, bh) \ -+ __ext3cow_journal_forget(__FUNCTION__, (handle), (bh)) -+ -+int ext3cow_journal_dirty_data(handle_t *handle, struct buffer_head *bh); -+ -+handle_t *ext3cow_journal_start_sb(struct super_block *sb, int nblocks); -+int __ext3cow_journal_stop(const char *where, handle_t *handle); -+ -+static inline handle_t *ext3cow_journal_start(struct inode *inode, int nblocks) -+{ -+ return ext3cow_journal_start_sb(inode->i_sb, nblocks); -+} -+ -+#define ext3cow_journal_stop(handle) \ -+ __ext3cow_journal_stop(__FUNCTION__, (handle)) -+ -+static inline handle_t *ext3cow_journal_current_handle(void) -+{ -+ return journal_current_handle(); -+} -+ -+static inline int ext3cow_journal_extend(handle_t *handle, int nblocks) -+{ -+ return journal_extend(handle, nblocks); -+} -+ -+static inline int ext3cow_journal_restart(handle_t *handle, int nblocks) -+{ -+ return journal_restart(handle, nblocks); -+} -+ -+static inline int ext3cow_journal_blocks_per_page(struct inode *inode) -+{ -+ return journal_blocks_per_page(inode); -+} -+ 
-+static inline int ext3cow_journal_force_commit(journal_t *journal) -+{ -+ return journal_force_commit(journal); -+} -+ -+/* super.c */ -+int ext3cow_force_commit(struct super_block *sb); -+ -+static inline int ext3cow_should_journal_data(struct inode *inode) -+{ -+ if (!S_ISREG(inode->i_mode)) -+ return 1; -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_JOURNAL_DATA) -+ return 1; -+ if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) -+ return 1; -+ return 0; -+} -+ -+static inline int ext3cow_should_order_data(struct inode *inode) -+{ -+ if (!S_ISREG(inode->i_mode)) -+ return 0; -+ if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) -+ return 0; -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_ORDERED_DATA) -+ return 1; -+ return 0; -+} -+ -+static inline int ext3cow_should_writeback_data(struct inode *inode) -+{ -+ if (!S_ISREG(inode->i_mode)) -+ return 0; -+ if (EXT3COW_I(inode)->i_flags & EXT3COW_JOURNAL_DATA_FL) -+ return 0; -+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3COW_MOUNT_WRITEBACK_DATA) -+ return 1; -+ return 0; -+} -+ -+#endif /* _LINUX_EXT3COW_JBD_H */ -diff -ruN linux-2.6.20.3/include/linux/magic.h linux-2.6.20.3-ext3cow/include/linux/magic.h ---- linux-2.6.20.3/include/linux/magic.h 2007-03-13 14:27:08.000000000 -0400 -+++ linux-2.6.20.3-ext3cow/include/linux/magic.h 2008-03-09 11:10:57.000000000 -0400 -@@ -9,6 +9,7 @@ - #define EFS_SUPER_MAGIC 0x414A53 - #define EXT2_SUPER_MAGIC 0xEF53 - #define EXT3_SUPER_MAGIC 0xEF53 -+#define EXT3COW_SUPER_MAGIC 0xEF53 - #define EXT4_SUPER_MAGIC 0xEF53 - #define HPFS_SUPER_MAGIC 0xf995e849 - #define ISOFS_SUPER_MAGIC 0x9660 - diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.20.nix b/pkgs/os-specific/linux/kernel/linux-2.6.20.nix index 365453e5872e..315937a68ec9 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.20.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.20.nix @@ -1,77 +1,21 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +import ./generic.nix ( -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = import ../../../lib; - - version = "2.6.20.12"; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - builder = ./builder.sh; + rec { + version = "2.6.20.12"; - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "1s7vdpg2897q5pcyxxypqcnibwpbdawbimkf3pngmahj8wr9c03x"; - }; - - patches = map (p: p.patch) kernelPatches; - - extraConfig = lib.concatStrings (map (p: "\n" + (if p ? 
extraConfig then p.extraConfig else "") + "\n") kernelPatches); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "1s7vdpg2897q5pcyxxypqcnibwpbdawbimkf3pngmahj8wr9c03x"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.20-uml else - if stdenv.system == "i686-linux" then ./config-2.6.20-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.20-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + config = + if userModeLinux then ./config-2.6.20-uml else + if stdenv.system == "i686-linux" then ./config-2.6.20-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.20-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash HAVE_AIO_ABI=" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.21.nix b/pkgs/os-specific/linux/kernel/linux-2.6.21.nix index e7fc2943a5cd..a63d39c2df7f 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.21.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.21.nix @@ -1,76 +1,21 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +import ./generic.nix ( -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = import ../../../lib; - - version = "2.6.21.7"; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - builder = ./builder.sh; + rec { + version = "2.6.21.7"; - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "1c8ndsz35qd8vyng3xsxjjkjv5bnzyvc9b5vd85fz5v0bjp8hx50"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = lib.concatStrings (map (p: "\n" + (if p ? 
extraConfig then p.extraConfig else "") + "\n") kernelPatches); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "1c8ndsz35qd8vyng3xsxjjkjv5bnzyvc9b5vd85fz5v0bjp8hx50"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.21-uml else - if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + config = + if userModeLinux then ./config-2.6.21-uml else + if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix deleted file mode 100644 index 55748d3e9f54..000000000000 --- a/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix +++ /dev/null @@ -1,84 +0,0 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools - - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] - -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = import ../../../lib; - - version = "2.6.21"; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - builder = ./builder.sh; - - src = fetchurl { - url = "http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.21.tar.bz2"; - sha256 = "f187b12d70e0a48ce81f0472dfe9504fb5f0f966be339ac9d57dd2b991a74942"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); - - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.21-uml else - if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; - - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.22.nix b/pkgs/os-specific/linux/kernel/linux-2.6.22.nix index 1abc1470616e..41694c17feb7 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.22.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.22.nix @@ -1,84 +1,21 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +import ./generic.nix ( -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = import ../../../lib; - - version = "2.6.22.18"; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - builder = ./builder.sh; + rec { + version = "2.6.22.18"; - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "09acj1xr16j9y91gzwzcjhanhcpyac1ah2lc42mfi7d8c0plagry"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "09acj1xr16j9y91gzwzcjhanhcpyac1ah2lc42mfi7d8c0plagry"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.22-uml else - if stdenv.system == "i686-linux" then ./config-2.6.22-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.22-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + config = + if userModeLinux then ./config-2.6.22-uml else + if stdenv.system == "i686-linux" then ./config-2.6.22-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.22-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.23.nix b/pkgs/os-specific/linux/kernel/linux-2.6.23.nix index b6756c113dd1..c05e959ddb3a 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.23.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.23.nix @@ -1,91 +1,21 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +import ./generic.nix ( -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = import ../../../lib; - - version = "2.6.23.17"; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - - passthru = { - inherit version; - # Combine the `features' attribute sets of all the kernel patches. - features = lib.fold (x: y: (if x ? features then x.features else {}) // y) {} kernelPatches; - }; + rec { + version = "2.6.23.17"; - builder = ./builder.sh; - - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "0lww6ywgl353xlaxcc3hg5d2q1vcydbqhddvkfpphr07zr7mwl32"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "0lww6ywgl353xlaxcc3hg5d2q1vcydbqhddvkfpphr07zr7mwl32"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.23-uml else - if stdenv.system == "i686-linux" then ./config-2.6.23-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.23-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + config = + if userModeLinux then ./config-2.6.23-uml else + if stdenv.system == "i686-linux" then ./config-2.6.23-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.23-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.25.nix b/pkgs/os-specific/linux/kernel/linux-2.6.25.nix index 3556ee5d34d3..a0ee9aea779a 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.25.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.25.nix @@ -1,95 +1,25 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +import ./generic.nix ( -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false - -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = stdenv.lib; - - version = "2.6.25.17"; - - baseFeatures = { - iwlwifi = true; - }; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - - passthru = { - inherit version; - # Combine the `features' attribute sets of all the kernel patches. - features = lib.fold (x: y: (if x ? features then x.features else {}) // y) baseFeatures kernelPatches; - }; + rec { + version = "2.6.25.17"; - builder = ./builder.sh; - - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "15jx163rryvvdy65wgfpws8l5cqrczfygsz6v5280i5glhy1dh77"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "15jx163rryvvdy65wgfpws8l5cqrczfygsz6v5280i5glhy1dh77"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.25-uml else - if stdenv.system == "i686-linux" then ./config-2.6.25-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.25-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + features = { + iwlwifi = true; + }; + + config = + if userModeLinux then ./config-2.6.25-uml else + if stdenv.system == "i686-linux" then ./config-2.6.25-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.25-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.26.nix b/pkgs/os-specific/linux/kernel/linux-2.6.26.nix index b5c45f60edc1..fd2aeb61a168 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.26.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.26.nix @@ -1,95 +1,26 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +assert !userModeLinux; -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false +import ./generic.nix ( -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = stdenv.lib; - - version = "2.6.26.7"; - - baseFeatures = { - iwlwifi = true; - }; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - - passthru = { - inherit version; - # Combine the `features' attribute sets of all the kernel patches. - features = lib.fold (x: y: (if x ? 
features then x.features else {}) // y) baseFeatures kernelPatches; - }; + rec { + version = "2.6.26.7"; - builder = ./builder.sh; - - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "1za4xq9q4gngmdxxwi728hdp30wjkwg4sh07fgyrs4nakjbjsgsj"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "1za4xq9q4gngmdxxwi728hdp30wjkwg4sh07fgyrs4nakjbjsgsj"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.26-uml else - if stdenv.system == "i686-linux" then ./config-2.6.26-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.26-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + features = { + iwlwifi = true; + }; + + config = + if stdenv.system == "i686-linux" then ./config-2.6.26-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.26-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.27.nix b/pkgs/os-specific/linux/kernel/linux-2.6.27.nix index 1e451b6da391..ae747404bc41 100644 --- a/pkgs/os-specific/linux/kernel/linux-2.6.27.nix +++ b/pkgs/os-specific/linux/kernel/linux-2.6.27.nix @@ -1,95 +1,26 @@ -{ stdenv, fetchurl, perl, mktemp, module_init_tools +args @ {stdenv, fetchurl, userModeLinux ? false, ...}: - # A list of patches to apply to the kernel. Each element of this list - # should be an attribute set {name, patch} where `name' is a - # symbolic name and `patch' is the actual patch. The patch may - # optionally be compressed with gzip or bzip2. -, kernelPatches ? [] +assert !userModeLinux; -, # Whether to build a User-Mode Linux kernel. - userModeLinux ? false +import ./generic.nix ( -, # Allows you to set your own kernel version suffix (e.g., - # "-my-kernel"). - localVersion ? "" - -, # Your own kernel configuration file, if you don't want to use the - # default. - kernelConfig ? null - -, # A list of additional statements to be appended to the - # configuration file. - extraConfig ? [] -}: - -assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; - -let - - lib = stdenv.lib; - - version = "2.6.27.10"; - - baseFeatures = { - iwlwifi = true; - }; - -in - -stdenv.mkDerivation { - name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; - - passthru = { - inherit version; - # Combine the `features' attribute sets of all the kernel patches. - features = lib.fold (x: y: (if x ? 
features then x.features else {}) // y) baseFeatures kernelPatches; - }; + rec { + version = "2.6.27.10"; - builder = ./builder.sh; - - src = fetchurl { - url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; - sha256 = "1g6k7m75cqjznibl249g43plkrgmca96sq5c7bdp18rmnalwh9w5"; - }; - - patches = map (p: p.patch) kernelPatches; - extraConfig = - let addNewlines = map (s: "\n" + s + "\n"); - configFromPatches = - map (p: if p ? extraConfig then p.extraConfig else "") kernelPatches; - in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + src = fetchurl { + url = "mirror://kernel/linux/kernel/v2.6/linux-${version}.tar.bz2"; + sha256 = "1g6k7m75cqjznibl249g43plkrgmca96sq5c7bdp18rmnalwh9w5"; + }; - config = - if kernelConfig != null then kernelConfig else - if userModeLinux then ./config-2.6.27-uml else - if stdenv.system == "i686-linux" then ./config-2.6.27-i686-smp else - if stdenv.system == "x86_64-linux" then ./config-2.6.27-x86_64-smp else - abort "No kernel configuration for your platform!"; - - buildInputs = [perl mktemp]; - - arch = - if userModeLinux then "um" else - if stdenv.system == "i686-linux" then "i386" else - if stdenv.system == "x86_64-linux" then "x86_64" else - abort "Platform ${stdenv.system} is not supported."; + features = { + iwlwifi = true; + }; + + config = + if stdenv.system == "i686-linux" then ./config-2.6.27-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.27-x86_64-smp else + abort "No kernel configuration for your platform!"; + } - makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; - - inherit module_init_tools; - - allowLocalVersion = false; # don't allow patches to set a suffix - inherit localVersion; # but do allow the user to set one. - - meta = { - description = - (if userModeLinux then - "User-Mode Linux" - else - "The Linux kernel") + - (if kernelPatches == [] then "" else - " (with patches: " - + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) - + ")"); - }; -} + // args +) diff --git a/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 deleted file mode 100644 index 0bf63f5aca37..000000000000 --- a/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 +++ /dev/null @@ -1,5040 +0,0 @@ -Index: linux-2.6.21-ck1/Makefile -=================================================================== ---- linux-2.6.21-ck1.orig/Makefile 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/Makefile 2007-05-04 12:21:37.000000000 +1000 -@@ -1,7 +1,7 @@ - VERSION = 2 - PATCHLEVEL = 6 - SUBLEVEL = 21 --EXTRAVERSION = -+EXTRAVERSION = -ck1 - NAME = Nocturnal Monster Puppy - - # *DOCUMENTATION* -Index: linux-2.6.21-ck1/kernel/workqueue.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/workqueue.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/workqueue.c 2007-05-04 12:10:54.000000000 +1000 -@@ -355,8 +355,6 @@ static int worker_thread(void *__cwq) - if (!cwq->freezeable) - current->flags |= PF_NOFREEZE; - -- set_user_nice(current, -5); -- - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); -Index: linux-2.6.21-ck1/fs/proc/array.c -=================================================================== ---- linux-2.6.21-ck1.orig/fs/proc/array.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/fs/proc/array.c 2007-05-04 12:10:54.000000000 +1000 -@@ -165,7 +165,6 @@ static inline char * task_state(struct t - rcu_read_lock(); 
- buffer += sprintf(buffer, - "State:\t%s\n" -- "SleepAVG:\t%lu%%\n" - "Tgid:\t%d\n" - "Pid:\t%d\n" - "PPid:\t%d\n" -@@ -173,7 +172,6 @@ static inline char * task_state(struct t - "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), -- (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, - pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, -Index: linux-2.6.21-ck1/include/linux/init_task.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/init_task.h 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/init_task.h 2007-05-04 12:24:19.000000000 +1000 -@@ -102,13 +102,15 @@ extern struct group_info init_groups; - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ - .normal_prio = MAX_PRIO-20, \ -+ .rotation = 0, \ - .policy = SCHED_NORMAL, \ - .cpus_allowed = CPU_MASK_ALL, \ - .mm = NULL, \ - .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .ioprio = 0, \ -- .time_slice = HZ, \ -+ .time_slice = 1000000000, \ -+ .quota = 1000000000, \ - .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ - .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ -@@ -135,6 +137,7 @@ extern struct group_info init_groups; - .signal = {{0}}}, \ - .blocked = {{0}}, \ - .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ -+ .mutexes_held = 0, \ - .journal_info = NULL, \ - .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ - .fs_excl = ATOMIC_INIT(0), \ -Index: linux-2.6.21-ck1/include/linux/sched.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/sched.h 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/sched.h 2007-05-04 12:24:19.000000000 +1000 -@@ -34,9 +34,14 @@ - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO 5 - - #ifdef __KERNEL__ - -+#define SCHED_MAX SCHED_IDLEPRIO -+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) -+ - struct sched_param { - int sched_priority; - }; -@@ -149,8 +154,7 @@ extern unsigned long weighted_cpuload(co - #define EXIT_ZOMBIE 16 - #define EXIT_DEAD 32 - /* in tsk->state again */ --#define TASK_NONINTERACTIVE 64 --#define TASK_DEAD 128 -+#define TASK_DEAD 64 - - #define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -@@ -522,14 +526,19 @@ struct signal_struct { - - #define MAX_USER_RT_PRIO 100 - #define MAX_RT_PRIO MAX_USER_RT_PRIO -+#define PRIO_RANGE (40) -+#define ISO_PRIO (MAX_RT_PRIO - 1) - --#define MAX_PRIO (MAX_RT_PRIO + 40) -+#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) - --#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_prio(prio) unlikely((prio) < ISO_PRIO) - #define rt_task(p) rt_prio((p)->prio) - #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) --#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) - #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+#define iso_task(p) unlikely((p)->policy == SCHED_ISO) -+#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) - - /* - * Some day this will be a full-fledged user tracking system.. 
-@@ -740,6 +749,22 @@ extern unsigned int max_cache_size; - - #endif /* CONFIG_SMP */ - -+/* -+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of -+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a -+ * task of nice 0 or enough lower priority tasks to bring up the -+ * weighted_cpuload -+ */ -+static inline int above_background_load(void) -+{ -+ unsigned long cpu; -+ -+ for_each_online_cpu(cpu) { -+ if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) -+ return 1; -+ } -+ return 0; -+} - - struct io_context; /* See blkdev.h */ - struct cpuset; -@@ -788,13 +813,6 @@ struct mempolicy; - struct pipe_inode_info; - struct uts_namespace; - --enum sleep_type { -- SLEEP_NORMAL, -- SLEEP_NONINTERACTIVE, -- SLEEP_INTERACTIVE, -- SLEEP_INTERRUPTED, --}; -- - struct prio_array; - - struct task_struct { -@@ -814,20 +832,33 @@ struct task_struct { - int load_weight; /* for niceness load balancing purposes */ - int prio, static_prio, normal_prio; - struct list_head run_list; -+ /* -+ * This bitmap shows what priorities this task has received quota -+ * from for this major priority rotation on its current runqueue. -+ */ -+ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); - struct prio_array *array; -+ /* Which major runqueue rotation did this task run */ -+ unsigned long rotation; - - unsigned short ioprio; - #ifdef CONFIG_BLK_DEV_IO_TRACE - unsigned int btrace_seq; - #endif -- unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - unsigned long long sched_time; /* sched_clock time spent running */ -- enum sleep_type sleep_type; - - unsigned long policy; - cpumask_t cpus_allowed; -- unsigned int time_slice, first_time_slice; -+ /* -+ * How much this task is entitled to run at the current priority -+ * before being requeued at a lower priority. -+ */ -+ int time_slice; -+ /* Is this the very first time_slice this task has ever run. */ -+ unsigned int first_time_slice; -+ /* How much this task receives at each priority level */ -+ int quota; - - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - struct sched_info sched_info; -@@ -992,6 +1023,7 @@ struct task_struct { - struct held_lock held_locks[MAX_LOCK_DEPTH]; - unsigned int lockdep_recursion; - #endif -+ unsigned long mutexes_held; - - /* journalling filesystem info */ - void *journal_info; -@@ -1156,8 +1188,10 @@ static inline void put_task_struct(struc - #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ - #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ - #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -+#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ - #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ - #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ -+#define PF_NONSLEEP 0x40000000 /* Waiting on in-kernel activity */ - - /* - * Only the _current_ task can read/write to tsk->flags, but other -Index: linux-2.6.21-ck1/kernel/sched.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/sched.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/sched.c 2007-05-04 12:24:22.000000000 +1000 -@@ -16,6 +16,7 @@ - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. 
- * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas - */ - - #include -@@ -52,6 +53,7 @@ - #include - #include - #include -+#include - #include - - #include -@@ -83,126 +85,85 @@ unsigned long long __attribute__((weak)) - #define USER_PRIO(p) ((p)-MAX_RT_PRIO) - #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) - #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -+#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) - --/* -- * Some helpers for converting nanosecond timing to jiffy resolution -- */ --#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -+/* Some helpers for converting to/from various scales.*/ - #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -- --/* -- * These are the 'tuning knobs' of the scheduler: -- * -- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), -- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. -- * Timeslices get refilled after they expire. -- */ --#define MIN_TIMESLICE max(5 * HZ / 1000, 1) --#define DEF_TIMESLICE (100 * HZ / 1000) --#define ON_RUNQUEUE_WEIGHT 30 --#define CHILD_PENALTY 95 --#define PARENT_PENALTY 100 --#define EXIT_WEIGHT 3 --#define PRIO_BONUS_RATIO 25 --#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) --#define INTERACTIVE_DELTA 2 --#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) --#define STARVATION_LIMIT (MAX_SLEEP_AVG) --#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -- --/* -- * If a task is 'interactive' then we reinsert it in the active -- * array after it has expired its current timeslice. (it will not -- * continue to run immediately, it will still roundrobin with -- * other interactive tasks.) -- * -- * This part scales the interactivity limit depending on niceness. -- * -- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. -- * Here are a few examples of different nice levels: -- * -- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] -- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] -- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] -- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] -- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] -- * -- * (the X axis represents the possible -5 ... 0 ... +5 dynamic -- * priority range a task can explore, a value of '1' means the -- * task is rated interactive.) -- * -- * Ie. nice +19 tasks can never get 'interactive' enough to be -- * reinserted into the active array. And only heavily CPU-hog nice -20 -- * tasks will be expired. Default nice 0 tasks are somewhere between, -- * it takes some effort for them to get interactive, but it's not -- * too hard. -- */ -- --#define CURRENT_BONUS(p) \ -- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ -- MAX_SLEEP_AVG) -- --#define GRANULARITY (10 * HZ / 1000 ? : 1) -- --#ifdef CONFIG_SMP --#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ -- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ -- num_online_cpus()) --#else --#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ -- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) --#endif -- --#define SCALE(v1,v1_max,v2_max) \ -- (v1) * (v2_max) / (v1_max) -- --#define DELTA(p) \ -- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ -- INTERACTIVE_DELTA) -- --#define TASK_INTERACTIVE(p) \ -- ((p)->prio <= (p)->static_prio - DELTA(p)) -- --#define INTERACTIVE_SLEEP(p) \ -- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ -- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) -- --#define TASK_PREEMPTS_CURR(p, rq) \ -- ((p)->prio < (rq)->curr->prio) -- --#define SCALE_PRIO(x, prio) \ -- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) -- --static unsigned int static_prio_timeslice(int static_prio) --{ -- if (static_prio < NICE_TO_PRIO(0)) -- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); -- else -- return SCALE_PRIO(DEF_TIMESLICE, static_prio); --} -- --/* -- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] -- * to time slice values: [800ms ... 100ms ... 5ms] -- * -- * The higher a thread's priority, the bigger timeslices -- * it gets during one round of execution. But even the lowest -- * priority thread gets MIN_TIMESLICE worth of execution time. -+#define MS_TO_NS(TIME) ((TIME) * 1000000) -+#define MS_TO_US(TIME) ((TIME) * 1000) -+#define US_TO_MS(TIME) ((TIME) / 1000) -+ -+#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) -+ -+/* -+ * This is the time all tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. -+ * sched_iso_period - sysctl which determines the number of seconds over -+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are -+ * exceeding their allowable bandwidth. -+*/ -+int sched_iso_cpu __read_mostly = 80; -+int sched_iso_period __read_mostly = 5; -+ -+#define ISO_PERIOD ((sched_iso_period * HZ) + 1) -+ -+/* -+ * This contains a bitmap for each dynamic priority level with empty slots -+ * for the valid priorities each different nice level can have. It allows -+ * us to stagger the slots where differing priorities run in a way that -+ * keeps latency differences between different nice levels at a minimum. -+ * The purpose of a pre-generated matrix is for rapid lookup of next slot in -+ * O(1) time without having to recalculate every time priority gets demoted. -+ * All nice levels use priority slot 39 as this allows less niced tasks to -+ * get all priority slots better than that before expiration is forced. 
-+ * ie, where 0 means a slot for that priority, priority running from left to -+ * right is from prio 0 to prio 39: -+ * nice -20 0000000000000000000000000000000000000000 -+ * nice -10 1000100010001000100010001000100010010000 -+ * nice 0 1010101010101010101010101010101010101010 -+ * nice 5 1011010110110101101101011011010110110110 -+ * nice 10 1110111011101110111011101110111011101110 -+ * nice 15 1111111011111110111111101111111011111110 -+ * nice 19 1111111111111111111111111111111111111110 - */ -+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] -+ __read_mostly; - --static inline unsigned int task_timeslice(struct task_struct *p) --{ -- return static_prio_timeslice(p->static_prio); --} -+struct rq; - - /* - * These are the runqueue data structures: - */ -- - struct prio_array { -- unsigned int nr_active; -- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ -- struct list_head queue[MAX_PRIO]; -+ /* Tasks queued at each priority */ -+ struct list_head queue[MAX_PRIO + 1]; -+ -+ /* -+ * The bitmap of priorities queued for this array. While the expired -+ * array will never have realtime tasks on it, it is simpler to have -+ * equal sized bitmaps for a cheap array swap. Include 1 bit for -+ * delimiter. -+ */ -+ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); -+ -+ /* -+ * The best static priority (of the dynamic priority tasks) queued -+ * this array. -+ */ -+ int best_static_prio; -+ -+#ifdef CONFIG_SMP -+ /* For convenience looks back at rq */ -+ struct rq *rq; -+#endif - }; - - /* -@@ -234,14 +195,28 @@ struct rq { - */ - unsigned long nr_uninterruptible; - -- unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; - struct task_struct *curr, *idle; - unsigned long next_balance; - struct mm_struct *prev_mm; -- struct prio_array *active, *expired, arrays[2]; -- int best_expired_prio; -+ -+ struct prio_array *active, *expired, *idleprio, arrays[2]; -+ unsigned long *dyn_bitmap, *exp_bitmap; -+ -+ /* -+ * The current dynamic priority level this runqueue is at per static -+ * priority level. -+ */ -+ int prio_level[PRIO_RANGE]; -+ -+ /* How many times we have rotated the priority queue */ -+ unsigned long prio_rotation; -+ unsigned long iso_ticks; -+ unsigned short iso_refractory; -+ -+ /* Number of idleprio tasks running */ -+ unsigned long nr_idleprio; - atomic_t nr_iowait; - - #ifdef CONFIG_SMP -@@ -579,12 +554,9 @@ static inline struct rq *this_rq_lock(vo - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - /* - * Called when a process is dequeued from the active array and given -- * the cpu. We should note that with the exception of interactive -- * tasks, the expired queue will become the active queue after the active -- * queue is empty, without explicitly dequeuing and requeuing tasks in the -- * expired queue. (Interactive tasks may be requeued directly to the -- * active queue, thus delaying tasks in the expired queue from running; -- * see scheduler_tick()). -+ * the cpu. We should note that the expired queue will become the active -+ * queue after the active queue is empty, without explicitly dequeuing and -+ * requeuing tasks in the expired queue. - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). 
Even though a task may be queued and dequeued multiple -@@ -682,71 +654,304 @@ sched_info_switch(struct task_struct *pr - #define sched_info_switch(t, next) do { } while (0) - #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ - -+static int idleprio_suitable(struct task_struct *p) -+{ -+ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && -+ !(p->flags & (PF_NONSLEEP | PF_EXITING))); -+} -+ -+static int idleprio(const struct task_struct *p) -+{ -+ return (p->prio == MAX_PRIO); -+} -+ -+static inline int task_queued(struct task_struct *task) -+{ -+ return !list_empty(&task->run_list); -+} -+ -+static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) -+{ -+ __set_bit(p->prio, p->array->prio_bitmap); -+} -+ - /* -- * Adding/removing a task to/from a priority array: -+ * Removing from a runqueue. - */ --static void dequeue_task(struct task_struct *p, struct prio_array *array) -+static void dequeue_task(struct task_struct *p, struct rq *rq) - { -- array->nr_active--; -- list_del(&p->run_list); -- if (list_empty(array->queue + p->prio)) -- __clear_bit(p->prio, array->bitmap); -+ list_del_init(&p->run_list); -+ if (idleprio_task(p) && idleprio(p)) -+ rq->nr_idleprio--; -+ else if (list_empty(p->array->queue + p->prio)) -+ __clear_bit(p->prio, p->array->prio_bitmap); - } - --static void enqueue_task(struct task_struct *p, struct prio_array *array) -+static void reset_first_time_slice(struct task_struct *p) - { -- sched_info_queued(p); -- list_add_tail(&p->run_list, array->queue + p->prio); -- __set_bit(p->prio, array->bitmap); -- array->nr_active++; -+ if (unlikely(p->first_time_slice)) -+ p->first_time_slice = 0; -+} -+ -+/* -+ * The task is being queued on a fresh array so it has its entitlement -+ * bitmap cleared. -+ */ -+static void task_new_array(struct task_struct *p, struct rq *rq, -+ struct prio_array *array) -+{ -+ bitmap_zero(p->bitmap, PRIO_RANGE); -+ p->rotation = rq->prio_rotation; -+ p->time_slice = p->quota; - p->array = array; -+ reset_first_time_slice(p); -+} -+ -+/* Find the first slot from the relevant prio_matrix entry */ -+static int first_prio_slot(struct task_struct *p) -+{ -+ if (unlikely(p->policy == SCHED_BATCH)) -+ return p->static_prio; -+ return SCHED_PRIO(find_first_zero_bit( -+ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); - } - - /* -- * Put task to the end of the run list without the overhead of dequeue -- * followed by enqueue. -+ * In sched_interactive mode priority allocation occurs per process per rq -+ * array swap. In !sched_interactive mode all waking tasks must obey the -+ * current prio level of all other tasks running per array swap. - */ --static void requeue_task(struct task_struct *p, struct prio_array *array) -+static int minprio(struct rq *rq, int uprio) - { -- list_move_tail(&p->run_list, array->queue + p->prio); -+ if (sched_interactive) -+ return MAX_RT_PRIO; -+ return rq->prio_level[uprio]; - } - --static inline void --enqueue_task_head(struct task_struct *p, struct prio_array *array) -+/* -+ * Find the first unused slot by this task that is also in its prio_matrix -+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take -+ * priority slots from their static_prio and above. 
-+ */ -+static int next_entitled_slot(struct task_struct *p, struct rq *rq) - { -- list_add(&p->run_list, array->queue + p->prio); -- __set_bit(p->prio, array->bitmap); -- array->nr_active++; -- p->array = array; -+ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); -+ struct prio_array *array = rq->active; -+ DECLARE_BITMAP(tmp, PRIO_RANGE); -+ -+ /* -+ * Go straight to expiration if there are higher priority tasks -+ * already expired. -+ */ -+ if (p->static_prio > rq->expired->best_static_prio) -+ return MAX_PRIO; -+ if (!rq->prio_level[uprio]) -+ rq->prio_level[uprio] = MAX_RT_PRIO; -+ /* -+ * Only priorities equal to the prio_level and above for their -+ * static_prio are acceptable, and only if it's not better than -+ * a queued better static_prio's prio_level. -+ */ -+ if (p->static_prio < array->best_static_prio) { -+ if (likely(p->policy != SCHED_BATCH)) -+ array->best_static_prio = p->static_prio; -+ } else if (p->static_prio == array->best_static_prio) { -+ search_prio = minprio(rq, uprio); -+ } else { -+ int i; -+ -+ search_prio = minprio(rq, uprio); -+ /* A bound O(n) function, worst case n is 40 */ -+ for (i = array->best_static_prio; i <= p->static_prio ; i++) { -+ if (!rq->prio_level[USER_PRIO(i)]) -+ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; -+ search_prio = max(search_prio, -+ rq->prio_level[USER_PRIO(i)]); -+ } -+ } -+ if (unlikely(p->policy == SCHED_BATCH)) { -+ search_prio = max(search_prio, p->static_prio); -+ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, -+ USER_PRIO(search_prio))); -+ } -+ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); -+ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, -+ USER_PRIO(search_prio))); -+} -+ -+static void queue_expired(struct task_struct *p, struct rq *rq) -+{ -+ task_new_array(p, rq, rq->expired); -+ p->prio = p->normal_prio = first_prio_slot(p); -+ if (p->static_prio < rq->expired->best_static_prio) -+ rq->expired->best_static_prio = p->static_prio; -+ reset_first_time_slice(p); - } - -+#ifdef CONFIG_SMP - /* -- * __normal_prio - return the priority that is based on the static -- * priority but is modified by bonuses/penalties. -- * -- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] -- * into the -5 ... 0 ... +5 bonus/penalty range. -- * -- * We use 25% of the full 0...39 priority range so that: -- * -- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. -- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. -- * -- * Both properties are important to certain workloads. -+ * If we're waking up a task that was previously on a different runqueue, -+ * update its data appropriately. Note we may be reading data from src_rq-> -+ * outside of lock, but the occasional inaccurate result should be harmless. - */ -+ static void update_if_moved(struct task_struct *p, struct rq *rq) -+{ -+ struct rq *src_rq = p->array->rq; - --static inline int __normal_prio(struct task_struct *p) -+ if (src_rq == rq) -+ return; -+ /* -+ * Only need to set p->array when p->rotation == rq->prio_rotation as -+ * they will be set in recalc_task_prio when != rq->prio_rotation. 
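
The priority matrix above is only half of the mechanism; next_entitled_slot() consumes it by OR-ing the task's per-rotation bitmap (the slots it has already drawn quota from) with its static matrix row and taking the first remaining zero. Below is a toy user-space walk of that lookup for a single nice 0 task, using the nice 0 row quoted verbatim from the matrix comment; it is a deliberate simplification that ignores rq->prio_level and competing tasks of other static priorities, and the helper names are made up:

#include <stdio.h>

#define PRIO_RANGE 40

/* Row copied from the "nice 0" line of the prio_matrix comment above;
 * '0' means this nice level owns a slot at that dynamic priority. */
static const char nice0_row[PRIO_RANGE + 1] =
        "1010101010101010101010101010101010101010";

/* Task's per-rotation bitmap: '1' once quota has been taken from a slot. */
static char used[PRIO_RANGE];

/* Simplified next_entitled_slot(): the first slot >= from that is neither
 * masked by the matrix row nor already used this rotation (the real code
 * does this with find_next_zero_bit() over the OR of the two bitmaps). */
static int next_entitled(int from)
{
        int i;

        for (i = from; i < PRIO_RANGE; i++)
                if (nice0_row[i] == '0' && used[i] == '0')
                        return i;
        return PRIO_RANGE;              /* MAX_PRIO: off to the expired array */
}

int main(void)
{
        int slot, level = 0, taken = 0;

        for (slot = 0; slot < PRIO_RANGE; slot++)
                used[slot] = '0';

        while ((slot = next_entitled(level)) < PRIO_RANGE) {
                used[slot] = '1';       /* __set_bit(USER_PRIO(prio), p->bitmap) */
                level = slot;           /* the dynamic prio level only moves down */
                printf("nice 0 task queued at user prio slot %d\n", slot);
                taken++;
        }
        printf("%d of %d slots before expiration\n", taken, PRIO_RANGE);
        return 0;
}

The walk yields slots 1, 3, 5, ... 39 (twenty of the forty levels) before the task is pushed to the expired array, which is exactly the every-second-slot staggering the nice 0 row encodes.
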
-+ */ -+ if (p->rotation == src_rq->prio_rotation) { -+ p->rotation = rq->prio_rotation; -+ if (p->array == src_rq->expired) -+ p->array = rq->expired; -+ else -+ p->array = rq->active; -+ } else -+ p->rotation = 0; -+} -+#else -+static inline void update_if_moved(struct task_struct *p, struct rq *rq) -+{ -+} -+#endif -+ -+static inline int isoprio_suitable(struct task_struct *p) - { -- int bonus, prio; -+ return !(p->flags & PF_ISOREF); -+} - -- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; -+static int task_timeslice(struct task_struct *p); - -- prio = p->static_prio - bonus; -- if (prio < MAX_RT_PRIO) -- prio = MAX_RT_PRIO; -- if (prio > MAX_PRIO-1) -- prio = MAX_PRIO-1; -- return prio; -+/* -+ * recalc_task_prio determines what priority a non rt_task will be -+ * queued at. If the task has already been running during this runqueue's -+ * major rotation (rq->prio_rotation) then it continues at the same -+ * priority if it has tick entitlement left. If it does not have entitlement -+ * left, it finds the next priority slot according to its nice value that it -+ * has not extracted quota from. If it has not run during this major -+ * rotation, it starts at the next_entitled_slot and has its bitmap quota -+ * cleared. If it does not have any slots left it has all its slots reset and -+ * is queued on the expired at its first_prio_slot. -+ */ -+static void recalc_task_prio(struct task_struct *p, struct rq *rq) -+{ -+ struct prio_array *array = rq->active; -+ int queue_prio; -+ -+ if (iso_task(p)) { -+ if (isoprio_suitable(p)) { -+ /* -+ * If SCHED_ISO tasks have not used up their real time -+ * quota they have run just better than highest -+ * SCHED_NORMAL priority. Otherwise they run as -+ * SCHED_NORMAL. -+ */ -+ p->prio = p->normal_prio = ISO_PRIO; -+ p->array = rq->active; -+ if (p->time_slice <= 0) -+ p->time_slice = p->quota; -+ return; -+ } else if (p->prio == ISO_PRIO) { -+ /* Just about to be demoted to SCHED_NORMAL */ -+ p->time_slice = 0; -+ } -+ } else if (idleprio_task(p)) { -+ if (idleprio_suitable(p)) { -+ /* -+ * If suitable idleprio_tasks are queued at MAX_PRIO -+ * only on the idleprio array. Their time_slice is -+ * their full task_timeslice as they cooperatively -+ * multitask. -+ */ -+ p->prio = p->normal_prio = MAX_PRIO; -+ p->array = rq->idleprio; -+ if (p->time_slice <= 0) -+ p->time_slice = task_timeslice(p); -+ return; -+ } -+ /* -+ * If unsuitable idleprio_tasks are queued equivalent to -+ * nice 19 tasks on the expired array. -+ */ -+ p->flags &= ~PF_NONSLEEP; -+ p->prio = p->normal_prio = MAX_PRIO - 1; -+ p->array = rq->expired; -+ if (p->time_slice <= 0 || p->time_slice > p->quota) -+ p->time_slice = p->quota; -+ return; -+ } -+ -+ update_if_moved(p, rq); -+ if (p->rotation == rq->prio_rotation) { -+ if (p->array == array) { -+ if (p->time_slice > 0) -+ return; -+ p->time_slice = p->quota; -+ } else if (p->array == rq->expired) { -+ queue_expired(p, rq); -+ return; -+ } else -+ task_new_array(p, rq, array); -+ } else -+ task_new_array(p, rq, array); -+ -+ queue_prio = next_entitled_slot(p, rq); -+ if (queue_prio >= MAX_PRIO) { -+ queue_expired(p, rq); -+ return; -+ } -+ p->prio = p->normal_prio = queue_prio; -+ __set_bit(USER_PRIO(p->prio), p->bitmap); -+} -+ -+/* -+ * Adding to a runqueue. The dynamic priority queue that it is added to is -+ * determined by recalc_task_prio() above. 
-+ */ -+static inline void __enqueue_task(struct task_struct *p, struct rq *rq) -+{ -+ if (rt_task(p)) -+ p->array = rq->active; -+ else -+ recalc_task_prio(p, rq); -+ -+ if (idleprio_task(p) && idleprio(p)) -+ rq->nr_idleprio++; -+ sched_info_queued(p); -+ set_dynamic_bit(p, rq); -+} -+ -+static void enqueue_task(struct task_struct *p, struct rq *rq) -+{ -+ __enqueue_task(p, rq); -+ list_add_tail(&p->run_list, p->array->queue + p->prio); -+} -+ -+static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) -+{ -+ __enqueue_task(p, rq); -+ list_add(&p->run_list, p->array->queue + p->prio); -+} -+ -+/* -+ * requeue_task is only called when p->static_prio does not change. p->prio -+ * can change with dynamic tasks. -+ */ -+static void requeue_task(struct task_struct *p, struct rq *rq, -+ struct prio_array *old_array, int old_prio) -+{ -+ if (p->array == rq->expired) -+ queue_expired(p, rq); -+ list_move_tail(&p->run_list, p->array->queue + p->prio); -+ if (!rt_task(p)) { -+ if (list_empty(old_array->queue + old_prio)) -+ __clear_bit(old_prio, old_array->prio_bitmap); -+ set_dynamic_bit(p, rq); -+ } - } - - /* -@@ -759,20 +964,29 @@ static inline int __normal_prio(struct t - */ - - /* -- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE -- * If static_prio_timeslice() is ever changed to break this assumption then -- * this code will need modification -- */ --#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE --#define LOAD_WEIGHT(lp) \ -- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) --#define PRIO_TO_LOAD_WEIGHT(prio) \ -- LOAD_WEIGHT(static_prio_timeslice(prio)) --#define RTPRIO_TO_LOAD_WEIGHT(rp) \ -- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -+ * task_timeslice - the total duration a task can run during one major -+ * rotation. Returns value in milliseconds as the smallest value can be 1. -+ */ -+static int task_timeslice(struct task_struct *p) -+{ -+ int slice = p->quota; /* quota is in us */ -+ -+ if (!rt_task(p)) -+ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; -+ return US_TO_MS(slice); -+} -+ -+/* -+ * The load weight is basically the task_timeslice in ms. Realtime tasks are -+ * special cased to be proportionately larger than nice -20 by their -+ * rt_priority. The weight for rt tasks can only be arbitrary at best. -+ */ -+#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) - - static void set_load_weight(struct task_struct *p) - { -+ int load_weight; -+ - if (has_rt_policy(p)) { - #ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) -@@ -781,12 +995,19 @@ static void set_load_weight(struct task_ - * Giving its load any weight will skew balancing - * adversely. - */ -- p->load_weight = 0; -+ load_weight = 0; - else - #endif -- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); -+ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else -- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); -+ load_weight = task_timeslice(p); -+ /* -+ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but -+ * still need to be weighted to allow balancing to occur. -+ */ -+ if (likely(!idleprio_task(p))) -+ load_weight *= PRIO_RANGE; -+ p->load_weight = load_weight; - } - - static inline void -@@ -814,28 +1035,38 @@ static inline void dec_nr_running(struct - } - - /* -- * Calculate the expected normal priority: i.e. priority -- * without taking RT-inheritance into account. Might be -- * boosted by interactivity modifiers. 
Changes upon fork, -- * setprio syscalls, and whenever the interactivity -- * estimator recalculates. -+ * __activate_task - move a task to the runqueue. - */ --static inline int normal_prio(struct task_struct *p) -+static inline void __activate_task(struct task_struct *p, struct rq *rq) - { -- int prio; -+ enqueue_task(p, rq); -+ inc_nr_running(p, rq); -+} - -+/* -+ * __activate_idle_task - move idle task to the _front_ of runqueue. -+ */ -+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task_head(p, rq); -+ inc_nr_running(p, rq); -+} -+ -+static inline int normal_prio(struct task_struct *p) -+{ - if (has_rt_policy(p)) -- prio = MAX_RT_PRIO-1 - p->rt_priority; -+ return MAX_RT_PRIO-1 - p->rt_priority; -+ /* Other tasks all have normal_prio set in recalc_task_prio */ -+ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) -+ return p->prio; - else -- prio = __normal_prio(p); -- return prio; -+ return p->static_prio; - } - - /* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might -- * be boosted by RT tasks, or might be boosted by -- * interactivity modifiers. Will be RT if the task got -+ * be boosted by RT tasks as it will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ - static int effective_prio(struct task_struct *p) -@@ -852,111 +1083,41 @@ static int effective_prio(struct task_st - } - - /* -- * __activate_task - move a task to the runqueue. -+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. -+ * From nice 1 to 19 they are smaller than it only if they are at least one -+ * tick still. Below nice 0 they get progressively larger. -+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval -+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. -+ * Value returned is in microseconds. - */ --static void __activate_task(struct task_struct *p, struct rq *rq) -+static inline unsigned int rr_quota(struct task_struct *p) - { -- struct prio_array *target = rq->active; -+ int nice = TASK_NICE(p), rr = rr_interval; - -- if (batch_task(p)) -- target = rq->expired; -- enqueue_task(p, target); -- inc_nr_running(p, rq); --} -- --/* -- * __activate_idle_task - move idle task to the _front_ of runqueue. -- */ --static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) --{ -- enqueue_task_head(p, rq->active); -- inc_nr_running(p, rq); -+ if (!rt_task(p)) { -+ if (nice < -6) { -+ rr *= nice * nice; -+ rr /= 40; -+ } else if (nice > 0) -+ rr = rr / 2 ? : 1; -+ } -+ return MS_TO_US(rr); - } - --/* -- * Recalculate p->normal_prio and p->prio after having slept, -- * updating the sleep-average too: -- */ --static int recalc_task_prio(struct task_struct *p, unsigned long long now) -+/* Every time we set the quota we need to set the load weight */ -+static void set_quota(struct task_struct *p) - { -- /* Caller must always ensure 'now >= p->timestamp' */ -- unsigned long sleep_time = now - p->timestamp; -- -- if (batch_task(p)) -- sleep_time = 0; -- -- if (likely(sleep_time > 0)) { -- /* -- * This ceiling is set to the lowest priority that would allow -- * a task to be reinserted into the active array on timeslice -- * completion. -- */ -- unsigned long ceiling = INTERACTIVE_SLEEP(p); -- -- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { -- /* -- * Prevents user tasks from achieving best priority -- * with one single large enough sleep. 
-- */ -- p->sleep_avg = ceiling; -- /* -- * Using INTERACTIVE_SLEEP() as a ceiling places a -- * nice(0) task 1ms sleep away from promotion, and -- * gives it 700ms to round-robin with no chance of -- * being demoted. This is more than generous, so -- * mark this sleep as non-interactive to prevent the -- * on-runqueue bonus logic from intervening should -- * this task not receive cpu immediately. -- */ -- p->sleep_type = SLEEP_NONINTERACTIVE; -- } else { -- /* -- * Tasks waking from uninterruptible sleep are -- * limited in their sleep_avg rise as they -- * are likely to be waiting on I/O -- */ -- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { -- if (p->sleep_avg >= ceiling) -- sleep_time = 0; -- else if (p->sleep_avg + sleep_time >= -- ceiling) { -- p->sleep_avg = ceiling; -- sleep_time = 0; -- } -- } -- -- /* -- * This code gives a bonus to interactive tasks. -- * -- * The boost works by updating the 'average sleep time' -- * value here, based on ->timestamp. The more time a -- * task spends sleeping, the higher the average gets - -- * and the higher the priority boost gets as well. -- */ -- p->sleep_avg += sleep_time; -- -- } -- if (p->sleep_avg > NS_MAX_SLEEP_AVG) -- p->sleep_avg = NS_MAX_SLEEP_AVG; -- } -- -- return effective_prio(p); -+ p->quota = rr_quota(p); -+ set_load_weight(p); - } - - /* - * activate_task - move a task to the runqueue and do priority recalculation -- * -- * Update all the scheduling statistics stuff. (sleep average -- * calculation, priority modifiers, etc.) - */ - static void activate_task(struct task_struct *p, struct rq *rq, int local) - { -- unsigned long long now; -- -- if (rt_task(p)) -- goto out; -+ unsigned long long now = sched_clock(); - -- now = sched_clock(); - #ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ -@@ -977,32 +1138,9 @@ static void activate_task(struct task_st - (now - p->timestamp) >> 20); - } - -- p->prio = recalc_task_prio(p, now); -- -- /* -- * This checks to make sure it's not an uninterruptible task -- * that is now waking up. -- */ -- if (p->sleep_type == SLEEP_NORMAL) { -- /* -- * Tasks which were woken up by interrupts (ie. hw events) -- * are most likely of interactive nature. So we give them -- * the credit of extending their sleep time to the period -- * of time they spend on the runqueue, waiting for execution -- * on a CPU, first time around: -- */ -- if (in_interrupt()) -- p->sleep_type = SLEEP_INTERRUPTED; -- else { -- /* -- * Normal first-time wakeups get a credit too for -- * on-runqueue time, but it will be weighted down: -- */ -- p->sleep_type = SLEEP_INTERACTIVE; -- } -- } -+ set_quota(p); -+ p->prio = effective_prio(p); - p->timestamp = now; --out: - __activate_task(p, rq); - } - -@@ -1012,8 +1150,7 @@ out: - static void deactivate_task(struct task_struct *p, struct rq *rq) - { - dec_nr_running(p, rq); -- dequeue_task(p, p->array); -- p->array = NULL; -+ dequeue_task(p, rq); - } - - /* -@@ -1095,7 +1232,7 @@ migrate_task(struct task_struct *p, int - * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. - */ -- if (!p->array && !task_running(rq, p)) { -+ if (!task_queued(p) && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); - return 0; - } -@@ -1126,7 +1263,7 @@ void wait_task_inactive(struct task_stru - repeat: - rq = task_rq_lock(p, &flags); - /* Must be off runqueue entirely, not preempted. 
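
The nice-to-bandwidth arithmetic introduced above can be checked by hand: rr_quota() gives each task a per-slot quota (with the default rr_interval of 6 ms, nice -10 works out to 6 * 100 / 40 = 15 ms and nice -20 to 6 * 400 / 40 = 60 ms, the 2.5x and 10x figures in its comment), and task_timeslice() then scales that quota by the number of priority slots left to the task in one rotation. Below is a small user-space sketch of just those two formulas, assuming the default rr_interval = 6 and PRIO_RANGE = 40; the sketch_* helpers are made up for illustration and take a plain nice value instead of a task_struct:

#include <stdio.h>

#define PRIO_RANGE 40

static int rr_interval = 6;                     /* ms, default from the patch */

/* Mirrors rr_quota() for a non-rt task: per-priority-slot quota in usecs. */
static int sketch_rr_quota(int nice)
{
        int rr = rr_interval;

        if (nice < -6) {                        /* progressively larger below -6 */
                rr *= nice * nice;
                rr /= 40;
        } else if (nice > 0)                    /* positive nice: half, minimum 1 */
                rr = (rr / 2) ? (rr / 2) : 1;
        return rr * 1000;                       /* MS_TO_US() */
}

/* Mirrors task_timeslice(): total run time per major rotation, in msecs. */
static int sketch_task_timeslice(int nice)
{
        int slice = sketch_rr_quota(nice);      /* quota is in us */
        int user_prio = nice + 20;              /* TASK_USER_PRIO() for this nice */

        slice += (PRIO_RANGE - 1 - user_prio) * slice;
        return slice / 1000;                    /* US_TO_MS() */
}

int main(void)
{
        static const int nices[] = { -20, -10, -6, 0, 10, 19 };
        unsigned int i;

        for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
                printf("nice %3d: quota %6d us, %4d ms per rotation\n",
                       nices[i], sketch_rr_quota(nices[i]),
                       sketch_task_timeslice(nices[i]));
        return 0;
}

Running it shows the intended staircase: about 2400 ms per rotation at nice -20, 120 ms at nice 0 and 3 ms at nice 19.
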
*/ -- if (unlikely(p->array || task_running(rq, p))) { -+ if (unlikely(task_queued(p) || task_running(rq, p))) { - /* If it's preempted, we yield. It could be a while. */ - preempted = !task_running(rq, p); - task_rq_unlock(rq, &flags); -@@ -1391,6 +1528,31 @@ static inline int wake_idle(int cpu, str - } - #endif - -+/* -+ * We need to have a special definition for an idle runqueue when testing -+ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as -+ * a realtime task in sched_idle_next. -+ */ -+#ifdef CONFIG_HOTPLUG_CPU -+#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) -+#else -+#define rq_idle(rq) ((rq)->curr == (rq)->idle) -+#endif -+ -+static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ -+ return ((p->array == task_rq(p)->active && -+ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); -+} -+ -+static inline void try_preempt(struct task_struct *p, struct rq *rq) -+{ -+ if (task_preempts_curr(p, rq)) -+ resched_task(rq->curr); -+} -+ - /*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread -@@ -1422,7 +1584,7 @@ static int try_to_wake_up(struct task_st - if (!(old_state & state)) - goto out; - -- if (p->array) -+ if (task_queued(p)) - goto out_running; - - cpu = task_cpu(p); -@@ -1515,7 +1677,7 @@ out_set_cpu: - old_state = p->state; - if (!(old_state & state)) - goto out; -- if (p->array) -+ if (task_queued(p)) - goto out_running; - - this_cpu = smp_processor_id(); -@@ -1524,25 +1686,9 @@ out_set_cpu: - - out_activate: - #endif /* CONFIG_SMP */ -- if (old_state == TASK_UNINTERRUPTIBLE) { -+ if (old_state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; -- /* -- * Tasks on involuntary sleep don't earn -- * sleep_avg beyond just interactive state. -- */ -- p->sleep_type = SLEEP_NONINTERACTIVE; -- } else -- -- /* -- * Tasks that have marked their sleep as noninteractive get -- * woken up with their sleep average not weighted in an -- * interactive way. -- */ -- if (old_state & TASK_NONINTERACTIVE) -- p->sleep_type = SLEEP_NONINTERACTIVE; - -- -- activate_task(p, rq, cpu == this_cpu); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) -@@ -1551,15 +1697,22 @@ out_activate: - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ -- if (!sync || cpu != this_cpu) { -- if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -- } -+ activate_task(p, rq, cpu == this_cpu); -+ if (!sync || cpu != this_cpu) -+ try_preempt(p, rq); - success = 1; - - out_running: - p->state = TASK_RUNNING; - out: -+ /* -+ * Special case when freezing we need to reschedule idleprio tasks -+ * as SCHED_NORMAL or else they'll never freeze -+ */ -+ if (idleprio_task(p) && freezing(p) && idleprio(p)) { -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); -+ } - task_rq_unlock(rq, &flags); - - return success; -@@ -1577,7 +1730,6 @@ int fastcall wake_up_state(struct task_s - return try_to_wake_up(p, state, 0); - } - --static void task_running_tick(struct rq *rq, struct task_struct *p); - /* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. 
-@@ -1605,7 +1757,6 @@ void fastcall sched_fork(struct task_str - p->prio = current->normal_prio; - - INIT_LIST_HEAD(&p->run_list); -- p->array = NULL; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -@@ -1617,30 +1768,31 @@ void fastcall sched_fork(struct task_str - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; - #endif -+ if (unlikely(p->policy == SCHED_FIFO)) -+ goto out; - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); -- p->time_slice = (current->time_slice + 1) >> 1; -- /* -- * The remainder of the first timeslice might be recovered by -- * the parent if the child exits early enough. -- */ -- p->first_time_slice = 1; -- current->time_slice >>= 1; -- p->timestamp = sched_clock(); -- if (unlikely(!current->time_slice)) { -+ if (current->time_slice > 0) { -+ current->time_slice /= 2; -+ if (current->time_slice) -+ p->time_slice = current->time_slice; -+ else -+ p->time_slice = 1; - /* -- * This case is rare, it happens when the parent has only -- * a single jiffy left from its timeslice. Taking the -- * runqueue lock is not a problem. -+ * The remainder of the first timeslice might be recovered by -+ * the parent if the child exits early enough. - */ -- current->time_slice = 1; -- task_running_tick(cpu_rq(cpu), current); -- } -+ p->first_time_slice = 1; -+ } else -+ p->time_slice = 0; -+ -+ p->timestamp = sched_clock(); - local_irq_enable(); -+out: - put_cpu(); - } - -@@ -1662,38 +1814,16 @@ void fastcall wake_up_new_task(struct ta - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - -- /* -- * We decrease the sleep average of forking parents -- * and children as well, to keep max-interactive tasks -- * from forking tasks that are max-interactive. The parent -- * (current) is done further down, under its lock. -- */ -- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * -- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); -- -- p->prio = effective_prio(p); -- - if (likely(cpu == this_cpu)) { -+ activate_task(p, rq, 1); - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. 
- */ -- if (unlikely(!current->array)) -- __activate_task(p, rq); -- else { -- p->prio = current->prio; -- p->normal_prio = current->normal_prio; -- list_add_tail(&p->run_list, ¤t->run_list); -- p->array = current->array; -- p->array->nr_active++; -- inc_nr_running(p, rq); -- } - set_need_resched(); -- } else -- /* Run child last */ -- __activate_task(p, rq); -+ } - /* - * We skip the following code due to cpu == this_cpu - * -@@ -1710,19 +1840,16 @@ void fastcall wake_up_new_task(struct ta - */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; -- __activate_task(p, rq); -- if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ activate_task(p, rq, 0); -+ try_preempt(p, rq); - - /* - * Parent and child are on different CPUs, now get the -- * parent runqueue to update the parent's ->sleep_avg: -+ * parent runqueue to update the parent's ->flags: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } -- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * -- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); - } - -@@ -1737,23 +1864,17 @@ void fastcall wake_up_new_task(struct ta - */ - void fastcall sched_exit(struct task_struct *p) - { -+ struct task_struct *parent; - unsigned long flags; - struct rq *rq; - -- /* -- * If the child was a (relative-) CPU hog then decrease -- * the sleep_avg of the parent as well. -- */ -- rq = task_rq_lock(p->parent, &flags); -- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { -- p->parent->time_slice += p->time_slice; -- if (unlikely(p->parent->time_slice > task_timeslice(p))) -- p->parent->time_slice = task_timeslice(p); -- } -- if (p->sleep_avg < p->parent->sleep_avg) -- p->parent->sleep_avg = p->parent->sleep_avg / -- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / -- (EXIT_WEIGHT + 1); -+ parent = p->parent; -+ rq = task_rq_lock(parent, &flags); -+ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { -+ parent->time_slice += p->time_slice; -+ if (unlikely(parent->time_slice > parent->quota)) -+ parent->time_slice = parent->quota; -+ } - task_rq_unlock(rq, &flags); - } - -@@ -2085,23 +2206,17 @@ void sched_exec(void) - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ --static void pull_task(struct rq *src_rq, struct prio_array *src_array, -- struct task_struct *p, struct rq *this_rq, -- struct prio_array *this_array, int this_cpu) -+static void pull_task(struct rq *src_rq, struct task_struct *p, -+ struct rq *this_rq, int this_cpu) - { -- dequeue_task(p, src_array); -+ dequeue_task(p, src_rq); - dec_nr_running(p, src_rq); - set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); -- enqueue_task(p, this_array); -+ enqueue_task(p, this_rq); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; -- /* -- * Note that idle threads have a prio of MAX_PRIO, for this test -- * to be always true for them. 
-- */ -- if (TASK_PREEMPTS_CURR(p, this_rq)) -- resched_task(this_rq->curr); -+ try_preempt(p, this_rq); - } - - /* -@@ -2144,7 +2259,16 @@ int can_migrate_task(struct task_struct - return 1; - } - --#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) -+static inline int rq_best_prio(struct rq *rq) -+{ -+ int best_prio, exp_prio; -+ -+ best_prio = sched_find_first_bit(rq->dyn_bitmap); -+ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ if (unlikely(best_prio > exp_prio)) -+ best_prio = exp_prio; -+ return best_prio; -+} - - /* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted -@@ -2160,7 +2284,7 @@ static int move_tasks(struct rq *this_rq - { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; -- struct prio_array *array, *dst_array; -+ struct prio_array *array; - struct list_head *head, *curr; - struct task_struct *tmp; - long rem_load_move; -@@ -2187,31 +2311,29 @@ static int move_tasks(struct rq *this_rq - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ -- if (busiest->expired->nr_active) { -- array = busiest->expired; -- dst_array = this_rq->expired; -- } else { -- array = busiest->active; -- dst_array = this_rq->active; -- } -- -+ array = busiest->expired; - new_array: -- /* Start searching at priority 0: */ -- idx = 0; -+ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ -+ if (array == busiest->expired) -+ idx = MAX_RT_PRIO; -+ else -+ idx = 0; - skip_bitmap: - if (!idx) -- idx = sched_find_first_bit(array->bitmap); -+ idx = sched_find_first_bit(array->prio_bitmap); - else -- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); -- if (idx >= MAX_PRIO) { -- if (array == busiest->expired && busiest->active->nr_active) { -+ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); -+ if (idx == MAX_PRIO) { -+ if (array == busiest->idleprio && busiest->nr_idleprio) -+ goto found_idleprio; -+ if (array == busiest->expired) { - array = busiest->active; -- dst_array = this_rq->active; - goto new_array; - } - goto out; - } - -+found_idleprio: - head = array->queue + idx; - curr = head->prev; - skip_queue: -@@ -2233,11 +2355,22 @@ skip_queue: - best_prio_seen |= idx == best_prio; - if (curr != head) - goto skip_queue; -+ if (idx == MAX_PRIO) { -+ /* -+ * Occurs either when balancing idleprio tasks or -+ * there really are no more tasks to find. -+ */ -+ if (array == busiest->expired) { -+ array = busiest->active; -+ goto new_array; -+ } -+ goto out; -+ } - idx++; - goto skip_bitmap; - } - -- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); -+ pull_task(busiest, tmp, this_rq, this_cpu); - pulled++; - rem_load_move -= tmp->load_weight; - -@@ -2250,6 +2383,13 @@ skip_queue: - this_best_prio = idx; - if (curr != head) - goto skip_queue; -+ if (idx == MAX_PRIO) { -+ if (array == busiest->expired) { -+ array = busiest->active; -+ goto new_array; -+ } -+ goto out; -+ } - idx++; - goto skip_bitmap; - } -@@ -3013,11 +3153,36 @@ EXPORT_PER_CPU_SYMBOL(kstat); - /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ * The value returned from sched_clock() occasionally gives bogus values so -+ * some sanity checking is required. 
- */ --static inline void --update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) -+static void -+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, -+ int tick) - { -- p->sched_time += now - p->last_ran; -+ long time_diff = now - p->last_ran; -+ -+ if (tick) { -+ /* -+ * Called from scheduler_tick() there should be less than two -+ * jiffies worth, and not negative/overflow. -+ */ -+ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) -+ time_diff = JIFFIES_TO_NS(1); -+ } else { -+ /* -+ * Called from context_switch there should be less than one -+ * jiffy worth, and not negative/overflow. There should be -+ * some time banked here so use a nominal 1us. -+ */ -+ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) -+ time_diff = 1000; -+ } -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p != rq->idle && p->policy != SCHED_FIFO) -+ p->time_slice -= time_diff / 1000; -+ p->sched_time += time_diff; - p->last_ran = rq->most_recent_timestamp = now; - } - -@@ -3038,27 +3203,6 @@ unsigned long long current_sched_time(co - } - - /* -- * We place interactive tasks back into the active array, if possible. -- * -- * To guarantee that this does not starve expired tasks we ignore the -- * interactivity of a task if the first expired task had to wait more -- * than a 'reasonable' amount of time. This deadline timeout is -- * load-dependent, as the frequency of array switched decreases with -- * increasing number of running tasks. We also ignore the interactivity -- * if a better static_prio task has expired: -- */ --static inline int expired_starving(struct rq *rq) --{ -- if (rq->curr->static_prio > rq->best_expired_prio) -- return 1; -- if (!STARVATION_LIMIT || !rq->expired_timestamp) -- return 0; -- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) -- return 1; -- return 0; --} -- --/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() -@@ -3073,7 +3217,7 @@ void account_user_time(struct task_struc - - /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); -- if (TASK_NICE(p) > 0) -+ if (TASK_NICE(p) > 0 || idleprio_task(p)) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); -@@ -3131,87 +3275,94 @@ void account_steal_time(struct task_stru - cpustat->steal = cputime64_add(cpustat->steal, tmp); - } - --static void task_running_tick(struct rq *rq, struct task_struct *p) -+/* -+ * The task has used up its quota of running in this prio_level so it must be -+ * dropped a priority level, all managed by recalc_task_prio(). -+ */ -+static void task_expired_entitlement(struct rq *rq, struct task_struct *p) - { -- if (p->array != rq->active) { -- /* Task has expired but was not scheduled yet */ -- set_tsk_need_resched(p); -+ int overrun; -+ -+ reset_first_time_slice(p); -+ if (rt_task(p)) { -+ p->time_slice += p->quota; -+ list_move_tail(&p->run_list, p->array->queue + p->prio); - return; - } -- spin_lock(&rq->lock); -+ overrun = p->time_slice; -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); - /* -- * The task was running during this tick - update the -- * time slice counter. Note: we do not update a thread's -- * priority until it either goes to sleep or uses up its -- * timeslice. This makes it possible for interactive tasks -- * to use up their timeslices at their highest priority levels. 
-+ * Subtract any extra time this task ran over its time_slice; ie -+ * overrun will either be 0 or negative. - */ -- if (rt_task(p)) { -- /* -- * RR tasks need a special form of timeslice management. -- * FIFO tasks have no timeslices. -- */ -- if ((p->policy == SCHED_RR) && !--p->time_slice) { -- p->time_slice = task_timeslice(p); -- p->first_time_slice = 0; -- set_tsk_need_resched(p); -+ p->time_slice += overrun; -+} - -- /* put it at the end of the queue: */ -- requeue_task(p, rq->active); -- } -- goto out_unlock; -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. -+ */ -+static unsigned int test_ret_isorefractory(struct rq *rq) -+{ -+ if (likely(!rq->iso_refractory)) { -+ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) -+ rq->iso_refractory = 1; -+ } else { -+ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) -+ rq->iso_refractory = 0; - } -- if (!--p->time_slice) { -- dequeue_task(p, rq->active); -- set_tsk_need_resched(p); -- p->prio = effective_prio(p); -- p->time_slice = task_timeslice(p); -- p->first_time_slice = 0; -+ return rq->iso_refractory; -+} - -- if (!rq->expired_timestamp) -- rq->expired_timestamp = jiffies; -- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { -- enqueue_task(p, rq->expired); -- if (p->static_prio < rq->best_expired_prio) -- rq->best_expired_prio = p->static_prio; -- } else -- enqueue_task(p, rq->active); -- } else { -- /* -- * Prevent a too long timeslice allowing a task to monopolize -- * the CPU. We do this by splitting up the timeslice into -- * smaller pieces. -- * -- * Note: this does not mean the task's timeslices expire or -- * get lost in any way, they just might be preempted by -- * another task of equal priority. (one with higher -- * priority would have preempted this task already.) We -- * requeue this task to the end of the list on this priority -- * level, which is in essence a round-robin of tasks with -- * equal priority. -- * -- * This only applies to tasks in the interactive -- * delta range with at least TIMESLICE_GRANULARITY to requeue. -- */ -- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - -- p->time_slice) % TIMESLICE_GRANULARITY(p)) && -- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && -- (p->array == rq->active)) { -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+} - -- requeue_task(p, rq->active); -- set_tsk_need_resched(p); -- } -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq, struct task_struct *p) -+{ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { -+ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) -+ rq->iso_ticks += 100; -+ } else -+ no_iso_tick(rq); -+ -+ if (iso_task(p)) { -+ if (unlikely(test_ret_isorefractory(rq))) { -+ if (isoprio_suitable(p)) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. 
Set the PF_ISOREF flag and -+ * force it to reschedule as SCHED_NORMAL -+ * by zeroing its time_slice -+ */ -+ p->flags |= PF_ISOREF; -+ p->time_slice = 0; -+ } -+ } else -+ p->flags &= ~PF_ISOREF; - } --out_unlock: -- spin_unlock(&rq->lock); -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->time_slice > 0 || p->policy == SCHED_FIFO) -+ return; -+ /* p->time_slice <= 0 */ -+ set_tsk_need_resched(p); -+ if (likely(task_queued(p))) -+ task_expired_entitlement(rq, p); - } - - /* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. -- * -- * It also gets called by the fork code, when changing the parent's -- * timeslices. - */ - void scheduler_tick(void) - { -@@ -3220,10 +3371,14 @@ void scheduler_tick(void) - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - -- update_cpu_clock(p, rq, now); -+ update_cpu_clock(p, rq, now, 1); - -+ spin_lock(&rq->lock); - if (p != rq->idle) - task_running_tick(rq, p); -+ else -+ no_iso_tick(rq); -+ spin_unlock(&rq->lock); - #ifdef CONFIG_SMP - update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) -@@ -3269,10 +3424,80 @@ EXPORT_SYMBOL(sub_preempt_count); - - #endif - --static inline int interactive_sleep(enum sleep_type sleep_type) -+static void reset_prio_levels(struct rq *rq) -+{ -+ rq->active->best_static_prio = MAX_PRIO - 1; -+ rq->expired->best_static_prio = MAX_PRIO - 1; -+ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); -+} -+ -+/* -+ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the -+ * idleprio array and if it isn't already active -+ */ -+static struct task_struct *next_idleprio_task(struct rq *rq) - { -- return (sleep_type == SLEEP_INTERACTIVE || -- sleep_type == SLEEP_INTERRUPTED); -+ struct prio_array *array = rq->active; -+ struct list_head *queue; -+ -+ if (array != rq->idleprio) { -+ rq->active = rq->idleprio; -+ rq->expired = array; -+ array = rq->active; -+ rq->exp_bitmap = rq->expired->prio_bitmap; -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ } -+ rq->prio_rotation++; -+ reset_prio_levels(rq); -+ queue = array->queue + MAX_PRIO; -+ return list_entry(queue->next, struct task_struct, run_list); -+} -+ -+/* -+ * next_dynamic_task finds the next suitable dynamic task. -+ */ -+static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) -+{ -+ struct prio_array *array = rq->active; -+ struct task_struct *next; -+ struct list_head *queue; -+ int nstatic; -+ -+retry: -+ if (unlikely(rq->nr_running == rq->nr_idleprio)) -+ return next_idleprio_task(rq); -+ if (idx >= MAX_PRIO) { -+ /* There are no more tasks in the active array. 
Swap arrays */ -+ array = rq->expired; -+ rq->expired = rq->active; -+ rq->active = array; -+ rq->exp_bitmap = rq->expired->prio_bitmap; -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ rq->prio_rotation++; -+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ reset_prio_levels(rq); -+ } -+ queue = array->queue + idx; -+ next = list_entry(queue->next, struct task_struct, run_list); -+ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && -+ isoprio_suitable(next)))) { -+ /* -+ * Unlucky enough that this task ran out of time_slice -+ * before it hit a scheduler_tick so it should have its -+ * priority reassessed and choose another task (possibly -+ * the same one) -+ */ -+ task_expired_entitlement(rq, next); -+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ goto retry; -+ } -+ next->rotation = rq->prio_rotation; -+ nstatic = next->static_prio; -+ if (nstatic < array->best_static_prio) -+ array->best_static_prio = nstatic; -+ if (idx > rq->prio_level[USER_PRIO(nstatic)]) -+ rq->prio_level[USER_PRIO(nstatic)] = idx; -+ return next; - } - - /* -@@ -3281,13 +3506,11 @@ static inline int interactive_sleep(enum - asmlinkage void __sched schedule(void) - { - struct task_struct *prev, *next; -- struct prio_array *array; - struct list_head *queue; - unsigned long long now; -- unsigned long run_time; -- int cpu, idx, new_prio; - long *switch_count; - struct rq *rq; -+ int cpu, idx; - - /* - * Test if we are atomic. Since do_exit() needs to call into -@@ -3323,18 +3546,6 @@ need_resched_nonpreemptible: - - schedstat_inc(rq, sched_cnt); - now = sched_clock(); -- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { -- run_time = now - prev->timestamp; -- if (unlikely((long long)(now - prev->timestamp) < 0)) -- run_time = 0; -- } else -- run_time = NS_MAX_SLEEP_AVG; -- -- /* -- * Tasks charged proportionately less run_time at high sleep_avg to -- * delay them losing their interactive status -- */ -- run_time /= (CURRENT_BONUS(prev) ? : 1); - - spin_lock_irq(&rq->lock); - -@@ -3345,8 +3556,10 @@ need_resched_nonpreemptible: - unlikely(signal_pending(prev)))) - prev->state = TASK_RUNNING; - else { -- if (prev->state == TASK_UNINTERRUPTIBLE) -+ if (prev->state == TASK_UNINTERRUPTIBLE) { -+ prev->flags |= PF_NONSLEEP; - rq->nr_uninterruptible++; -+ } - deactivate_task(prev, rq); - } - } -@@ -3356,59 +3569,29 @@ need_resched_nonpreemptible: - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -- rq->expired_timestamp = 0; - goto switch_tasks; - } - } - -- array = rq->active; -- if (unlikely(!array->nr_active)) { -- /* -- * Switch the active and expired arrays. 
-- */ -- schedstat_inc(rq, sched_switch); -- rq->active = rq->expired; -- rq->expired = array; -- array = rq->active; -- rq->expired_timestamp = 0; -- rq->best_expired_prio = MAX_PRIO; -+ idx = sched_find_first_bit(rq->dyn_bitmap); -+ if (likely(idx > ISO_PRIO)) -+ next = next_dynamic_task(rq, idx); -+ else { -+ queue = rq->active->queue + idx; -+ next = list_entry(queue->next, struct task_struct, run_list); - } -- -- idx = sched_find_first_bit(array->bitmap); -- queue = array->queue + idx; -- next = list_entry(queue->next, struct task_struct, run_list); -- -- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { -- unsigned long long delta = now - next->timestamp; -- if (unlikely((long long)(now - next->timestamp) < 0)) -- delta = 0; -- -- if (next->sleep_type == SLEEP_INTERACTIVE) -- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; -- -- array = next->array; -- new_prio = recalc_task_prio(next, next->timestamp + delta); -- -- if (unlikely(next->prio != new_prio)) { -- dequeue_task(next, array); -- next->prio = new_prio; -- enqueue_task(next, array); -- } -- } -- next->sleep_type = SLEEP_NORMAL; - switch_tasks: -- if (next == rq->idle) -+ if (next == rq->idle) { -+ reset_prio_levels(rq); -+ rq->prio_rotation++; - schedstat_inc(rq, sched_goidle); -+ } - prefetch(next); - prefetch_stack(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - -- update_cpu_clock(prev, rq, now); -- -- prev->sleep_avg -= run_time; -- if ((long)prev->sleep_avg <= 0) -- prev->sleep_avg = 0; -+ update_cpu_clock(prev, rq, now, 0); - prev->timestamp = prev->last_ran = now; - - sched_info_switch(prev, next); -@@ -3844,29 +4027,22 @@ EXPORT_SYMBOL(sleep_on_timeout); - */ - void rt_mutex_setprio(struct task_struct *p, int prio) - { -- struct prio_array *array; - unsigned long flags; -+ int queued, oldprio; - struct rq *rq; -- int oldprio; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_rq_lock(p, &flags); - - oldprio = p->prio; -- array = p->array; -- if (array) -- dequeue_task(p, array); -+ queued = task_queued(p); -+ if (queued) -+ dequeue_task(p, rq); - p->prio = prio; - -- if (array) { -- /* -- * If changing to an RT priority then queue it -- * in the active array! 
-- */ -- if (rt_task(p)) -- array = rq->active; -- enqueue_task(p, array); -+ if (queued) { -+ enqueue_task(p, rq); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on -@@ -3875,8 +4051,8 @@ void rt_mutex_setprio(struct task_struct - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); -- } else if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ } else -+ try_preempt(p, rq); - } - task_rq_unlock(rq, &flags); - } -@@ -3885,8 +4061,7 @@ void rt_mutex_setprio(struct task_struct - - void set_user_nice(struct task_struct *p, long nice) - { -- struct prio_array *array; -- int old_prio, delta; -+ int queued, old_prio,delta; - unsigned long flags; - struct rq *rq; - -@@ -3907,26 +4082,27 @@ void set_user_nice(struct task_struct *p - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } -- array = p->array; -- if (array) { -- dequeue_task(p, array); -+ queued = task_queued(p); -+ if (queued) { -+ dequeue_task(p, rq); - dec_raw_weighted_load(rq, p); - } - - p->static_prio = NICE_TO_PRIO(nice); -- set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); -+ set_quota(p); - delta = p->prio - old_prio; - -- if (array) { -- enqueue_task(p, array); -+ if (queued) { -+ enqueue_task(p, rq); - inc_raw_weighted_load(rq, p); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ -- if (delta < 0 || (delta > 0 && task_running(rq, p))) -+ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && -+ task_running(rq, p))) - resched_task(rq->curr); - } - out_unlock: -@@ -3996,7 +4172,7 @@ asmlinkage long sys_nice(int increment) - * - * This is the priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered -- * around 0, value goes from -16 to +15. -+ * around 0, value goes from 0 to +39. - */ - int task_prio(const struct task_struct *p) - { -@@ -4043,19 +4219,14 @@ static inline struct task_struct *find_p - /* Actually do priority change: must hold rq lock. */ - static void __setscheduler(struct task_struct *p, int policy, int prio) - { -- BUG_ON(p->array); -+ BUG_ON(task_queued(p)); - - p->policy = policy; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); -- /* -- * SCHED_BATCH tasks are treated as perpetual CPU hogs: -- */ -- if (policy == SCHED_BATCH) -- p->sleep_avg = 0; -- set_load_weight(p); -+ set_quota(p); - } - - /** -@@ -4069,19 +4240,27 @@ static void __setscheduler(struct task_s - int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) - { -- int retval, oldprio, oldpolicy = -1; -- struct prio_array *array; -+ struct sched_param zero_param = { .sched_priority = 0 }; -+ int queued, retval, oldprio, oldpolicy = -1; - unsigned long flags; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. 
-+ */ -+ policy = SCHED_ISO; -+ param = &zero_param; -+ } - recheck: - /* double check policy once rq lock held */ - if (policy < 0) - policy = oldpolicy = p->policy; -- else if (policy != SCHED_FIFO && policy != SCHED_RR && -- policy != SCHED_NORMAL && policy != SCHED_BATCH) -+ else if (!SCHED_RANGE(policy)) - return -EINVAL; - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are -@@ -4116,6 +4295,31 @@ recheck: - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy == SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } - } - - /* can't change other user's priorities */ -@@ -4124,6 +4328,11 @@ recheck: - return -EPERM; - } - -+ if (!(p->mm) && policy == SCHED_IDLEPRIO) { -+ /* Don't allow kernel threads to be SCHED_IDLEPRIO. */ -+ return -EINVAL; -+ } -+ - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; -@@ -4144,12 +4353,12 @@ recheck: - spin_unlock_irqrestore(&p->pi_lock, flags); - goto recheck; - } -- array = p->array; -- if (array) -+ queued = task_queued(p); -+ if (queued) - deactivate_task(p, rq); - oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); -- if (array) { -+ if (queued) { - __activate_task(p, rq); - /* - * Reschedule if we are currently running on this runqueue and -@@ -4159,14 +4368,15 @@ recheck: - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); -- } else if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ } else -+ try_preempt(p, rq); - } - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - - rt_mutex_adjust_pi(p); - -+out: - return 0; - } - EXPORT_SYMBOL_GPL(sched_setscheduler); -@@ -4433,41 +4643,34 @@ asmlinkage long sys_sched_getaffinity(pi - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU by moving the calling thread -- * to the expired array. If there are no other threads running on this -- * CPU then this function will return. -+ * to the expired array if SCHED_NORMAL or the end of its current priority -+ * queue if a realtime task. If there are no other threads running on this -+ * cpu this function will return. - */ - asmlinkage long sys_sched_yield(void) - { - struct rq *rq = this_rq_lock(); -- struct prio_array *array = current->array, *target = rq->expired; -+ struct task_struct *p = current; - - schedstat_inc(rq, yld_cnt); -- /* -- * We implement yielding by moving the task into the expired -- * queue. -- * -- * (special rule: RT tasks will just roundrobin in the active -- * array.) -- */ -- if (rt_task(current)) -- target = rq->active; -- -- if (array->nr_active == 1) { -- schedstat_inc(rq, yld_act_empty); -- if (!rq->expired->nr_active) -- schedstat_inc(rq, yld_both_empty); -- } else if (!rq->expired->nr_active) -- schedstat_inc(rq, yld_exp_empty); -- -- if (array != target) { -- dequeue_task(current, array); -- enqueue_task(current, target); -- } else -- /* -- * requeue_task is cheaper so perform that if possible. 
-- */ -- requeue_task(current, array); -+ if (rq->nr_running == 1) -+ schedstat_inc(rq, yld_both_empty); -+ else { -+ struct prio_array *old_array = p->array; -+ int old_prio = p->prio; -+ -+ if (idleprio_task(p)) { -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); -+ goto out_release; -+ } -+ /* p->prio will be updated in requeue_task via queue_expired */ -+ if (!rt_task(p)) -+ p->array = rq->expired; -+ requeue_task(p, rq, old_array, old_prio); -+ } - -+out_release: - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: -@@ -4619,6 +4822,8 @@ asmlinkage long sys_sched_get_priority_m - break; - case SCHED_NORMAL: - case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: - ret = 0; - break; - } -@@ -4643,6 +4848,8 @@ asmlinkage long sys_sched_get_priority_m - break; - case SCHED_NORMAL: - case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: - ret = 0; - } - return ret; -@@ -4676,8 +4883,8 @@ long sys_sched_rr_get_interval(pid_t pid - if (retval) - goto out_unlock; - -- jiffies_to_timespec(p->policy == SCHED_FIFO ? -- 0 : task_timeslice(p), &t); -+ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : -+ MS_TO_NS(task_timeslice(p))); - read_unlock(&tasklist_lock); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - out_nounlock: -@@ -4771,10 +4978,10 @@ void __cpuinit init_idle(struct task_str - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - -- idle->timestamp = sched_clock(); -- idle->sleep_avg = 0; -- idle->array = NULL; -- idle->prio = idle->normal_prio = MAX_PRIO; -+ bitmap_zero(idle->bitmap, PRIO_RANGE); -+ idle->timestamp = idle->last_ran = sched_clock(); -+ idle->array = rq->active; -+ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); - idle->state = TASK_RUNNING; - idle->cpus_allowed = cpumask_of_cpu(cpu); - set_task_cpu(idle, cpu); -@@ -4893,7 +5100,7 @@ static int __migrate_task(struct task_st - goto out; - - set_task_cpu(p, dest_cpu); -- if (p->array) { -+ if (task_queued(p)) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step -@@ -4904,8 +5111,7 @@ static int __migrate_task(struct task_st - + rq_dest->most_recent_timestamp; - deactivate_task(p, rq_src); - __activate_task(p, rq_dest); -- if (TASK_PREEMPTS_CURR(p, rq_dest)) -- resched_task(rq_dest->curr); -+ try_preempt(p, rq_dest); - } - ret = 1; - out: -@@ -5194,7 +5400,7 @@ migration_call(struct notifier_block *nf - /* Idle task back to normal (off runqueue, low prio) */ - rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); -- rq->idle->static_prio = MAX_PRIO; -+ rq->idle->static_prio = NICE_TO_PRIO(0); - __setscheduler(rq->idle, SCHED_NORMAL, 0); - migrate_dead_tasks(cpu); - task_rq_unlock(rq, &flags); -@@ -6706,6 +6912,13 @@ void __init sched_init_smp(void) - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) - BUG(); -+ -+ /* -+ * Assume that every added cpu gives us slightly less overall latency -+ * allowing us to increase the base rr_interval, but in a non linear -+ * fashion. 
-+ */ -+ rr_interval *= 1 + ilog2(num_online_cpus()); - } - #else - void __init sched_init_smp(void) -@@ -6727,6 +6940,16 @@ void __init sched_init(void) - { - int i, j, k; - -+ /* Generate the priority matrix */ -+ for (i = 0; i < PRIO_RANGE; i++) { -+ bitmap_fill(prio_matrix[i], PRIO_RANGE); -+ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); -+ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { -+ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), -+ prio_matrix[i]); -+ } -+ } -+ - for_each_possible_cpu(i) { - struct prio_array *array; - struct rq *rq; -@@ -6734,12 +6957,20 @@ void __init sched_init(void) - rq = cpu_rq(i); - spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); -+ rq->iso_ticks = 0; - rq->nr_running = 0; -+ rq->nr_idleprio = 0; -+ rq->prio_rotation = 0; - rq->active = rq->arrays; -+ rq->idleprio = rq->active; - rq->expired = rq->arrays + 1; -- rq->best_expired_prio = MAX_PRIO; -+ reset_prio_levels(rq); -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ rq->exp_bitmap = rq->expired->prio_bitmap; - - #ifdef CONFIG_SMP -+ rq->active->rq = rq; -+ rq->expired->rq = rq; - rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; -@@ -6752,16 +6983,16 @@ void __init sched_init(void) - atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { -+ - array = rq->arrays + j; -- for (k = 0; k < MAX_PRIO; k++) { -+ for (k = 0; k <= MAX_PRIO; k++) - INIT_LIST_HEAD(array->queue + k); -- __clear_bit(k, array->bitmap); -- } -- // delimiter for bitsearch -- __set_bit(MAX_PRIO, array->bitmap); -+ bitmap_zero(array->prio_bitmap, MAX_PRIO); -+ /* delimiter for bitsearch */ -+ __set_bit(MAX_PRIO, array->prio_bitmap); - } -- } - -+ } - set_load_weight(&init_task); - - #ifdef CONFIG_SMP -@@ -6815,24 +7046,24 @@ EXPORT_SYMBOL(__might_sleep); - #ifdef CONFIG_MAGIC_SYSRQ - void normalize_rt_tasks(void) - { -- struct prio_array *array; - struct task_struct *p; - unsigned long flags; - struct rq *rq; -+ int queued; - - read_lock_irq(&tasklist_lock); - for_each_process(p) { -- if (!rt_task(p)) -+ if (!rt_task(p) && !iso_task(p)) - continue; - - spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); - -- array = p->array; -- if (array) -+ queued = task_queued(p); -+ if (queued) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); -- if (array) { -+ if (queued) { - __activate_task(p, task_rq(p)); - resched_task(rq->curr); - } -Index: linux-2.6.21-ck1/Documentation/sysctl/kernel.txt -=================================================================== ---- linux-2.6.21-ck1.orig/Documentation/sysctl/kernel.txt 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/Documentation/sysctl/kernel.txt 2007-05-04 12:10:55.000000000 +1000 -@@ -25,6 +25,9 @@ show up in /proc/sys/kernel: - - domainname - - hostname - - hotplug -+- interactive -+- iso_cpu -+- iso_period - - java-appletviewer [ binfmt_java, obsolete ] - - java-interpreter [ binfmt_java, obsolete ] - - kstack_depth_to_print [ X86 only ] -@@ -43,6 +46,7 @@ show up in /proc/sys/kernel: - - printk - - real-root-dev ==> Documentation/initrd.txt - - reboot-cmd [ SPARC only ] -+- rr_interval - - rtsig-max - - rtsig-nr - - sem -@@ -164,6 +168,40 @@ Default value is "/sbin/hotplug". 
- - ============================================================== - -+interactive: -+ -+The staircase-deadline cpu scheduler can be set in either purely -+forward-looking mode for absolutely rigid fairness and cpu distribution -+according to nice level, or it can allow a small per-process history -+to smooth out cpu usage perturbations common in interactive tasks by -+enabling this sysctl. While small fairness issues can arise with this -+enabled, overall fairness is usually still strongly maintained and -+starvation is never possible. Enabling this can significantly smooth -+out 3d graphics and games. -+ -+Default value is 1 (enabled). -+ -+============================================================== -+ -+iso_cpu: -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling iso_period -+seconds. -+ -+Set to 80 (percent) by default. -+ -+============================================================== -+ -+iso_period: -+ -+This sets the number of seconds over which SCHED_ISO cpu usage is averaged -+to see if it exceeds its allocated cpu bandwidth. -+ -+Set to 5 (seconds) by default. -+ -+============================================================== -+ - l2cr: (PPC only) - - This flag controls the L2 cache of G3 processor boards. If -@@ -288,6 +326,19 @@ rebooting. ??? - - ============================================================== - -+rr_interval: -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. This value is in milliseconds and the default value chosen -+depends on the number of cpus available at scheduler initialisation -+with a minimum of 8. -+ -+Valid values are from 1-5000. -+ -+============================================================== -+ - rtsig-max & rtsig-nr: - - The file rtsig-max can be used to tune the maximum number -Index: linux-2.6.21-ck1/kernel/sysctl.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/sysctl.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/sysctl.c 2007-05-04 12:24:21.000000000 +1000 -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -70,12 +71,17 @@ extern int suid_dumpable; - extern char core_pattern[]; - extern int pid_max; - extern int min_free_kbytes; -+extern int vm_tail_largefiles; - extern int printk_ratelimit_jiffies; - extern int printk_ratelimit_burst; - extern int pid_max_min, pid_max_max; - extern int sysctl_drop_caches; - extern int percpu_pagelist_fraction; - extern int compat_log; -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_iso_period; - - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ - static int maxolduid = 65535; -@@ -159,6 +165,14 @@ int sysctl_legacy_va_layout; - #endif - - -+/* Constants for minimum and maximum testing. -+ We use these as one-element integer vectors. 
*/ -+static int __read_mostly zero; -+static int __read_mostly one = 1; -+static int __read_mostly one_hundred = 100; -+static int __read_mostly five_thousand = 5000; -+ -+ - /* The default sysctl tables: */ - - static ctl_table root_table[] = { -@@ -499,6 +513,47 @@ static ctl_table kern_table[] = { - .mode = 0444, - .proc_handler = &proc_dointvec, - }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &one, -+ .extra2 = &five_thousand, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "iso_period", -+ .data = &sched_iso_period, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &one, -+ .extra2 = &one_hundred, -+ }, - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, -@@ -607,12 +662,6 @@ static ctl_table kern_table[] = { - { .ctl_name = 0 } - }; - --/* Constants for minimum and maximum testing in vm_table. -- We use these as one-element integer vectors. */ --static int zero; --static int one_hundred = 100; -- -- - static ctl_table vm_table[] = { - { - .ctl_name = VM_OVERCOMMIT_MEMORY, -@@ -693,16 +742,32 @@ static ctl_table vm_table[] = { - .proc_handler = &proc_dointvec, - }, - { -- .ctl_name = VM_SWAPPINESS, -- .procname = "swappiness", -- .data = &vm_swappiness, -- .maxlen = sizeof(vm_swappiness), -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "mapped", -+ .data = &vm_mapped, -+ .maxlen = sizeof(vm_mapped), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "hardmaplimit", -+ .data = &vm_hardmaplimit, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "tail_largefiles", -+ .data = &vm_tail_largefiles, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, - #ifdef CONFIG_HUGETLB_PAGE - { - .ctl_name = VM_HUGETLB_PAGES, -@@ -859,6 +924,16 @@ static ctl_table vm_table[] = { - .extra1 = &zero, - }, - #endif -+#ifdef CONFIG_SWAP_PREFETCH -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "swap_prefetch", -+ .data = &swap_prefetch, -+ .maxlen = sizeof(swap_prefetch), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif - { .ctl_name = 0 } - }; - -Index: linux-2.6.21-ck1/fs/pipe.c -=================================================================== ---- linux-2.6.21-ck1.orig/fs/pipe.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/fs/pipe.c 2007-05-04 12:10:54.000000000 +1000 -@@ -41,12 +41,7 @@ void pipe_wait(struct pipe_inode_info *p - { - DEFINE_WAIT(wait); - -- /* -- * Pipes are system-local resources, so sleeping on them -- * is considered a noninteractive wait: -- */ -- prepare_to_wait(&pipe->wait, &wait, -- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); -+ prepare_to_wait(&pipe->wait, 
&wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); - schedule(); -Index: linux-2.6.21-ck1/Documentation/sched-design.txt -=================================================================== ---- linux-2.6.21-ck1.orig/Documentation/sched-design.txt 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/Documentation/sched-design.txt 2007-05-04 12:10:54.000000000 +1000 -@@ -1,11 +1,14 @@ -- Goals, Design and Implementation of the -- new ultra-scalable O(1) scheduler -+ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by -+ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by -+ Con Kolivas. - - -- This is an edited version of an email Ingo Molnar sent to -- lkml on 4 Jan 2002. It describes the goals, design, and -- implementation of Ingo's new ultra-scalable O(1) scheduler. -- Last Updated: 18 April 2002. -+ This was originally an edited version of an email Ingo Molnar sent to -+ lkml on 4 Jan 2002. It describes the goals, design, and implementation -+ of Ingo's ultra-scalable O(1) scheduler. It now contains a description -+ of the Staircase Deadline priority scheduler that was built on this -+ design. -+ Last Updated: Fri, 4 May 2007 - - - Goal -@@ -163,3 +166,222 @@ certain code paths and data constructs. - code is smaller than the old one. - - Ingo -+ -+ -+Staircase Deadline cpu scheduler policy -+================================================ -+ -+Design summary -+============== -+ -+A novel design which incorporates a foreground-background descending priority -+system (the staircase) via a bandwidth allocation matrix according to nice -+level. -+ -+ -+Features -+======== -+ -+A starvation free, strict fairness O(1) scalable design with interactivity -+as good as the above restrictions can provide. There is no interactivity -+estimator, no sleep/run measurements and only simple fixed accounting. -+The design has strict enough a design and accounting that task behaviour -+can be modelled and maximum scheduling latencies can be predicted by -+the virtual deadline mechanism that manages runqueues. The prime concern -+in this design is to maintain fairness at all costs determined by nice level, -+yet to maintain as good interactivity as can be allowed within the -+constraints of strict fairness. -+ -+ -+Design description -+================== -+ -+SD works off the principle of providing each task a quota of runtime that it is -+allowed to run at a number of priority levels determined by its static priority -+(ie. its nice level). If the task uses up its quota it has its priority -+decremented to the next level determined by a priority matrix. Once every -+runtime quota has been consumed of every priority level, a task is queued on the -+"expired" array. When no other tasks exist with quota, the expired array is -+activated and fresh quotas are handed out. This is all done in O(1). -+ -+Design details -+============== -+ -+Each task keeps a record of its own entitlement of cpu time. Most of the rest of -+these details apply to non-realtime tasks as rt task management is straight -+forward. -+ -+Each runqueue keeps a record of what major epoch it is up to in the -+rq->prio_rotation field which is incremented on each major epoch. It also -+keeps a record of the current prio_level for each static priority task. -+ -+Each task keeps a record of what major runqueue epoch it was last running -+on in p->rotation. 
It also keeps a record of what priority levels it has -+already been allocated quota from during this epoch in a bitmap p->bitmap. -+ -+The only tunable that determines all other details is the RR_INTERVAL. This -+is set to 8ms, and is scaled gently upwards with more cpus. This value is -+tunable via a /proc interface. -+ -+All tasks are initially given a quota based on RR_INTERVAL. This is equal to -+RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and -+progressively larger for nice values from -1 to -20. This is assigned to -+p->quota and only changes with changes in nice level. -+ -+As a task is first queued, it checks in recalc_task_prio to see if it has run at -+this runqueue's current priority rotation. If it has not, it will have its -+p->prio level set according to the first slot in a "priority matrix" and will be -+given a p->time_slice equal to the p->quota, and has its allocation bitmap bit -+set in p->bitmap for this prio level. It is then queued on the current active -+priority array. -+ -+If a task has already been running during this major epoch, and it has -+p->time_slice left and the rq->prio_quota for the task's p->prio still -+has quota, it will be placed back on the active array, but no more quota -+will be added. -+ -+If a task has been running during this major epoch, but does not have -+p->time_slice left, it will find the next lowest priority in its bitmap that it -+has not been allocated quota from. It then gets the a full quota in -+p->time_slice. It is then queued on the current active priority array at the -+newly determined lower priority. -+ -+If a task has been running during this major epoch, and does not have -+any entitlement left in p->bitmap and no time_slice left, it will have its -+bitmap cleared, and be queued at its best prio again, but on the expired -+priority array. -+ -+When a task is queued, it has its relevant bit set in the array->prio_bitmap. -+ -+p->time_slice is stored in nanosconds and is updated via update_cpu_clock on -+schedule() and scheduler_tick. If p->time_slice is below zero then the -+recalc_task_prio is readjusted and the task rescheduled. -+ -+ -+Priority Matrix -+=============== -+ -+In order to minimise the latencies between tasks of different nice levels -+running concurrently, the dynamic priority slots where different nice levels -+are queued are dithered instead of being sequential. What this means is that -+there are 40 priority slots where a task may run during one major rotation, -+and the allocation of slots is dependant on nice level. In the -+following table, a zero represents a slot where the task may run. -+ -+PRIORITY:0..................20.................39 -+nice -20 0000000000000000000000000000000000000000 -+nice -10 1000100010001000100010001000100010010000 -+nice 0 1010101010101010101010101010101010101010 -+nice 5 1011010110110101101101011011010110110110 -+nice 10 1110111011101110111011101110111011101110 -+nice 15 1111111011111110111111101111111011111110 -+nice 19 1111111111111111111111111111111111111110 -+ -+As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 -+task only runs one slot per major rotation. This dithered table allows for the -+smallest possible maximum latencies between tasks of varying nice levels, thus -+allowing vastly different nice levels to be used. -+ -+SCHED_BATCH tasks are managed slightly differently, receiving only the top -+slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but -+slightly higher latencies. 
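[Editorial note] The dithered matrix above is not hand-written; it is produced by a short loop this patch adds to sched_init() (see the "Generate the priority matrix" hunk in kernel/sched.c earlier in this patch). The following stand-alone user-space sketch mirrors that loop, using a plain int array in place of the kernel bitmap and assuming PRIO_RANGE is 40 as the 40-slot table implies, and prints the same rows. It may help when checking the table for nice levels not listed.

#include <stdio.h>

#define PRIO_RANGE 40	/* 40 dynamic priority slots per major rotation */

int main(void)
{
	int matrix[PRIO_RANGE][PRIO_RANGE];
	int i, j, k;

	for (i = 0; i < PRIO_RANGE; i++) {
		/* Start with every slot blocked ('1')... */
		for (k = 0; k < PRIO_RANGE; k++)
			matrix[i][k] = 1;
		/*
		 * ...then clear evenly spaced slots; a smaller i (better
		 * nice level) clears more of them.  This mirrors the
		 * __clear_bit() loop added to sched_init().
		 */
		j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
		for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
			matrix[i][PRIO_RANGE - 1 - (k / PRIO_RANGE)] = 0;
	}

	for (i = 0; i < PRIO_RANGE; i++) {
		printf("nice %3d ", i - 20);	/* row 0 is nice -20 */
		for (k = 0; k < PRIO_RANGE; k++)
			printf("%d", matrix[i][k]);
		printf("\n");
	}
	return 0;
}

The spacing j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i) is what spreads a given nice level's slots as evenly as possible across the rotation, which is where the small maximum latencies between tasks of different nice levels come from.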
-+ -+ -+Modelling deadline behaviour -+============================ -+ -+As the accounting in this design is hard and not modified by sleep average -+calculations or interactivity modifiers, it is possible to accurately -+predict the maximum latency that a task may experience under different -+conditions. This is a virtual deadline mechanism enforced by mandatory -+timeslice expiration and not outside bandwidth measurement. -+ -+The maximum duration a task can run during one major epoch is determined by its -+nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL -+duration during each epoch. Nice 10 tasks can run at 9 priority levels for each -+epoch, and so on. The table in the priority matrix above demonstrates how this -+is enforced. -+ -+Therefore the maximum duration a runqueue epoch can take is determined by -+the number of tasks running, and their nice level. After that, the maximum -+duration it can take before a task can wait before it get scheduled is -+determined by the position of its first slot on the matrix. -+ -+In the following examples, these are _worst case scenarios_ and would rarely -+occur, but can be modelled nonetheless to determine the maximum possible -+latency. -+ -+So for example, if two nice 0 tasks are running, and one has just expired as -+another is activated for the first time receiving a full quota for this -+runqueue rotation, the first task will wait: -+ -+nr_tasks * max_duration + nice_difference * rr_interval -+1 * 19 * RR_INTERVAL + 0 = 152ms -+ -+In the presence of a nice 10 task, a nice 0 task would wait a maximum of -+1 * 10 * RR_INTERVAL + 0 = 80ms -+ -+In the presence of a nice 0 task, a nice 10 task would wait a maximum of -+1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms -+ -+More useful than these values, though, are the average latencies which are -+a matter of determining the average distance between priority slots of -+different nice values and multiplying them by the tasks' quota. For example -+in the presence of a nice -10 task, a nice 0 task will wait either one or -+two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, -+this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or -+20 and 40ms respectively (on uniprocessor at 1000HZ). -+ -+ -+Achieving interactivity -+======================= -+ -+A requirement of this scheduler design was to achieve good interactivity -+despite being a completely fair deadline based design. The disadvantage of -+designs that try to achieve interactivity is that they usually do so at -+the expense of maintaining fairness. As cpu speeds increase, the requirement -+for some sort of metered unfairness towards interactive tasks becomes a less -+desirable phenomenon, but low latency and fairness remains mandatory to -+good interactive performance. -+ -+This design relies on the fact that interactive tasks, by their nature, -+sleep often. Most fair scheduling designs end up penalising such tasks -+indirectly giving them less than their fair possible share because of the -+sleep, and have to use a mechanism of bonusing their priority to offset -+this based on the duration they sleep. This becomes increasingly inaccurate -+as the number of running tasks rises and more tasks spend time waiting on -+runqueues rather than sleeping, and it is impossible to tell whether the -+task that's waiting on a runqueue only intends to run for a short period and -+then sleep again after than runqueue wait. 
Furthermore, all such designs rely -+on a period of time to pass to accumulate some form of statistic on the task -+before deciding on how much to give them preference. The shorter this period, -+the more rapidly bursts of cpu ruin the interactive tasks behaviour. The -+longer this period, the longer it takes for interactive tasks to get low -+scheduling latencies and fair cpu. -+ -+This design does not measure sleep time at all. Interactive tasks that sleep -+often will wake up having consumed very little if any of their quota for -+the current major priority rotation. The longer they have slept, the less -+likely they are to even be on the current major priority rotation. Once -+woken up, though, they get to use up a their full quota for that epoch, -+whether part of a quota remains or a full quota. Overall, however, they -+can still only run as much cpu time for that epoch as any other task of the -+same nice level. This means that two tasks behaving completely differently -+from fully cpu bound to waking/sleeping extremely frequently will still -+get the same quota of cpu, but the latter will be using its quota for that -+epoch in bursts rather than continuously. This guarantees that interactive -+tasks get the same amount of cpu as cpu bound ones. -+ -+The other requirement of interactive tasks is also to obtain low latencies -+for when they are scheduled. Unlike fully cpu bound tasks and the maximum -+latencies possible described in the modelling deadline behaviour section -+above, tasks that sleep will wake up with quota available usually at the -+current runqueue's priority_level or better. This means that the most latency -+they are likely to see is one RR_INTERVAL, and often they will preempt the -+current task if it is not of a sleeping nature. This then guarantees very -+low latency for interactive tasks, and the lowest latencies for the least -+cpu bound tasks. 
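[Editorial note] As a concrete illustration of the entitlement bookkeeping described under "Design details" above, and of why a sleeping task keeps its unused quota within an epoch, here is a simplified user-space sketch. The field names mirror the ones used in this document (p->rotation, p->bitmap, p->time_slice, p->quota), but the 0..39 slot index, the best_prio field and the helper itself are illustrative stand-ins only: the real recalc_task_prio() also consults the per-nice priority matrix when choosing the next slot, handles RT, SCHED_ISO and SCHED_BATCH tasks, and runs under the runqueue lock.

#include <stdbool.h>
#include <stdio.h>

#define PRIO_RANGE 40	/* dynamic slots per major rotation, as in the table */

struct sd_task {			/* illustrative stand-in for task_struct */
	unsigned long rotation;		/* last major epoch this task ran in */
	bool bitmap[PRIO_RANGE];	/* slots already drawn from this epoch */
	long time_slice;		/* remaining entitlement (arbitrary units) */
	long quota;			/* per-slot entitlement set by nice level */
	int prio;			/* current dynamic slot, 0..PRIO_RANGE-1 */
	int best_prio;			/* first slot the priority matrix allows */
};

struct sd_rq {				/* illustrative stand-in for struct rq */
	unsigned long prio_rotation;	/* current major epoch of the runqueue */
};

/* Returns true if the task must be queued on the expired array. */
bool requeue_entitlement(struct sd_task *p, struct sd_rq *rq)
{
	int idx;

	if (p->rotation != rq->prio_rotation) {
		/*
		 * First run this epoch: fresh quota at the best matrix slot.
		 * A task that slept across epochs always lands here, so it
		 * never loses entitlement for having slept.
		 */
		for (idx = 0; idx < PRIO_RANGE; idx++)
			p->bitmap[idx] = false;
		p->rotation = rq->prio_rotation;
		p->prio = p->best_prio;
		p->bitmap[p->prio] = true;
		p->time_slice = p->quota;
		return false;
	}
	if (p->time_slice > 0)
		return false;	/* same epoch, entitlement left: keep it */
	/*
	 * Quota for this slot used up: drop to the next unused slot (the
	 * real code picks the next slot its priority matrix row allows).
	 */
	for (idx = p->prio + 1; idx < PRIO_RANGE; idx++) {
		if (!p->bitmap[idx]) {
			p->prio = idx;
			p->bitmap[idx] = true;
			p->time_slice = p->quota;
			return false;
		}
	}
	/* No entitlement left this epoch: back to best prio, expired array. */
	for (idx = 0; idx < PRIO_RANGE; idx++)
		p->bitmap[idx] = false;
	p->prio = p->best_prio;
	p->bitmap[p->prio] = true;
	p->time_slice = p->quota;
	return true;
}

int main(void)
{
	struct sd_task p = { .rotation = 3, .time_slice = 0, .quota = 6,
			     .prio = 30, .best_prio = 20 };
	struct sd_rq rq = { .prio_rotation = 4 };

	/* The task slept while the runqueue moved on to epoch 4. */
	bool expired = requeue_entitlement(&p, &rq);

	printf("expired=%d prio=%d time_slice=%ld\n",
	       expired, p.prio, p.time_slice);
	return 0;
}

Running it shows a task that slept across an epoch boundary waking with a full quota at its best slot, which is exactly the property the interactivity argument above relies on.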
-+ -+ -+Fri, 4 May 2007 -+Con Kolivas -Index: linux-2.6.21-ck1/kernel/softirq.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/softirq.c 2007-05-04 12:10:52.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/softirq.c 2007-05-04 12:10:54.000000000 +1000 -@@ -488,7 +488,7 @@ void __init softirq_init(void) - - static int ksoftirqd(void * __bind_cpu) - { -- set_user_nice(current, 19); -+ set_user_nice(current, 15); - current->flags |= PF_NOFREEZE; - - set_current_state(TASK_INTERRUPTIBLE); -Index: linux-2.6.21-ck1/kernel/fork.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/fork.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/fork.c 2007-05-04 12:24:19.000000000 +1000 -@@ -1060,6 +1060,7 @@ static struct task_struct *copy_process( - p->io_context = NULL; - p->io_wait = NULL; - p->audit_context = NULL; -+ p->mutexes_held = 0; - cpuset_fork(p); - #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); -Index: linux-2.6.21-ck1/kernel/mutex.c -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/mutex.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/mutex.c 2007-05-04 12:24:19.000000000 +1000 -@@ -60,6 +60,16 @@ EXPORT_SYMBOL(__mutex_init); - static void fastcall noinline __sched - __mutex_lock_slowpath(atomic_t *lock_count); - -+static inline void inc_mutex_count(void) -+{ -+ current->mutexes_held++; -+} -+ -+static inline void dec_mutex_count(void) -+{ -+ current->mutexes_held--; -+} -+ - /*** - * mutex_lock - acquire the mutex - * @lock: the mutex to be acquired -@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock( - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); -+ inc_mutex_count(); - } - - EXPORT_SYMBOL(mutex_lock); -@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc - * into 'unlocked' state: - */ - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); -+ dec_mutex_count(); - } - - EXPORT_SYMBOL(mutex_unlock); -@@ -283,9 +295,14 @@ __mutex_lock_interruptible_slowpath(atom - */ - int fastcall __sched mutex_lock_interruptible(struct mutex *lock) - { -+ int ret; -+ - might_sleep(); -- return __mutex_fastpath_lock_retval -+ ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); -+ if (likely(!ret)) -+ inc_mutex_count(); -+ return ret; - } - - EXPORT_SYMBOL(mutex_lock_interruptible); -@@ -340,8 +357,12 @@ static inline int __mutex_trylock_slowpa - */ - int fastcall __sched mutex_trylock(struct mutex *lock) - { -- return __mutex_fastpath_trylock(&lock->count, -+ int ret = __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); -+ -+ if (likely(ret)) -+ inc_mutex_count(); -+ return ret; - } - - EXPORT_SYMBOL(mutex_trylock); -Index: linux-2.6.21-ck1/block/cfq-iosched.c -=================================================================== ---- linux-2.6.21-ck1.orig/block/cfq-iosched.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/block/cfq-iosched.c 2007-05-04 12:24:19.000000000 +1000 -@@ -1258,10 +1258,12 @@ static void cfq_init_prio_data(struct cf - printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* -- * no prio set, place us in the middle of the BE classes -+ * Select class and ioprio according to policy and nice - */ -+ cfqq->ioprio_class = task_policy_ioprio_class(tsk); - cfqq->ioprio = task_nice_ioprio(tsk); -- cfqq->ioprio_class = 
IOPRIO_CLASS_BE; -+ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) -+ cfq_clear_cfqq_idle_window(cfqq); - break; - case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(tsk); -Index: linux-2.6.21-ck1/include/linux/ioprio.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/ioprio.h 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/ioprio.h 2007-05-04 12:24:19.000000000 +1000 -@@ -22,7 +22,7 @@ - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. - */ --enum { -+enum ioprio_class { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, -@@ -51,8 +51,25 @@ static inline int task_ioprio(struct tas - return IOPRIO_PRIO_DATA(task->ioprio); - } - -+static inline enum ioprio_class -+ task_policy_ioprio_class(struct task_struct *task) -+{ -+ if (rt_task(task)) -+ return IOPRIO_CLASS_RT; -+ if (idleprio_task(task)) -+ return IOPRIO_CLASS_IDLE; -+ return IOPRIO_CLASS_BE; -+} -+ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (rt_task(task)) -+ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / -+ (MAX_RT_PRIO + 1); -+ if (iso_task(task)) -+ return 0; -+ if (idleprio_task(task)) -+ return IOPRIO_BE_NR - 1; - return (task_nice(task) + 20) / 5; - } - -Index: linux-2.6.21-ck1/Documentation/sysctl/vm.txt -=================================================================== ---- linux-2.6.21-ck1.orig/Documentation/sysctl/vm.txt 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/Documentation/sysctl/vm.txt 2007-05-04 12:24:21.000000000 +1000 -@@ -22,6 +22,8 @@ Currently, these files are in /proc/sys/ - - dirty_background_ratio - - dirty_expire_centisecs - - dirty_writeback_centisecs -+- hardmaplimit -+- mapped - - max_map_count - - min_free_kbytes - - laptop_mode -@@ -31,12 +33,13 @@ Currently, these files are in /proc/sys/ - - min_unmapped_ratio - - min_slab_ratio - - panic_on_oom -+- swap_prefetch - - ============================================================== - - dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, - dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, --block_dump, swap_token_timeout, drop-caches: -+block_dump, swap_token_timeout, drop-caches, tail_largefiles: - - See Documentation/filesystems/proc.txt - -@@ -86,6 +89,27 @@ for swap because we only cluster swap da - - ============================================================== - -+hardmaplimit: -+ -+This flag makes the vm adhere to the mapped value as closely as possible -+except in the most extreme vm stress where doing so would provoke an out -+of memory condition (see mapped below). -+ -+Enabled by default. -+ -+============================================================== -+ -+mapped: -+ -+This is the percentage ram that is filled with mapped pages (applications) -+before the vm will start reclaiming mapped pages by moving them to swap. -+It is altered by the relative stress of the vm at the time so is not -+strictly adhered to to prevent provoking out of memory kills. -+ -+Set to 66 by default. -+ -+============================================================== -+ - max_map_count: - - This file contains the maximum number of memory map areas a process -@@ -205,3 +229,14 @@ rather than killing rogue processes, set - - The default value is 0. - -+============================================================== -+ -+swap_prefetch -+ -+This enables or disables the swap prefetching feature. 
When the virtual -+memory subsystem has been extremely idle for at least 5 seconds it will start -+copying back pages from swap into the swapcache and keep a copy in swap. In -+practice it can take many minutes before the vm is idle enough. -+ -+The default value is 1. -+ -Index: linux-2.6.21-ck1/include/linux/swap.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/swap.h 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/swap.h 2007-05-04 12:24:20.000000000 +1000 -@@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa - /* linux/mm/swap.c */ - extern void FASTCALL(lru_cache_add(struct page *)); - extern void FASTCALL(lru_cache_add_active(struct page *)); -+extern void FASTCALL(lru_cache_add_tail(struct page *)); - extern void FASTCALL(activate_page(struct page *)); - extern void FASTCALL(mark_page_accessed(struct page *)); - extern void lru_add_drain(void); -@@ -188,9 +189,11 @@ extern int rotate_reclaimable_page(struc - extern void swap_setup(void); - - /* linux/mm/vmscan.c */ --extern unsigned long try_to_free_pages(struct zone **, gfp_t); -+extern unsigned long try_to_free_pages(struct zone **, gfp_t, -+ struct task_struct *p); - extern unsigned long shrink_all_memory(unsigned long nr_pages); --extern int vm_swappiness; -+extern int vm_mapped; -+extern int vm_hardmaplimit; - extern int remove_mapping(struct address_space *mapping, struct page *page); - extern long vm_total_pages; - -@@ -237,6 +240,7 @@ extern void free_pages_and_swap_cache(st - extern struct page * lookup_swap_cache(swp_entry_t); - extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, - unsigned long addr); -+extern int add_to_swap_cache(struct page *page, swp_entry_t entry); - /* linux/mm/swapfile.c */ - extern long total_swap_pages; - extern unsigned int nr_swapfiles; -Index: linux-2.6.21-ck1/init/Kconfig -=================================================================== ---- linux-2.6.21-ck1.orig/init/Kconfig 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/init/Kconfig 2007-05-04 12:24:20.000000000 +1000 -@@ -101,6 +101,28 @@ config SWAP - used to provide more virtual memory than the actual RAM present - in your computer. If unsure say Y. - -+config SWAP_PREFETCH -+ bool "Support for prefetching swapped memory" -+ depends on SWAP -+ default y -+ ---help--- -+ This option will allow the kernel to prefetch swapped memory pages -+ when idle. The pages will be kept on both swap and in swap_cache -+ thus avoiding the need for further I/O if either ram or swap space -+ is required. -+ -+ What this will do on workstations is slowly bring back applications -+ that have swapped out after memory intensive workloads back into -+ physical ram if you have free ram at a later stage and the machine -+ is relatively idle. This means that when you come back to your -+ computer after leaving it idle for a while, applications will come -+ to life faster. Note that your swap usage will appear to increase -+ but these are cached pages, can be dropped freely by the vm, and it -+ should stabilise around 50% swap usage maximum. -+ -+ Workstations and multiuser workstation servers will most likely want -+ to say Y. 
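[Editorial note] The swap_prefetch knob documented above is exposed under /proc/sys/vm/ via the vm_table entry this patch adds to kernel/sysctl.c. A minimal user-space sketch of flipping it at runtime (requires root; writing "0" disables prefetching, "1" restores the default):

#include <stdio.h>

int main(void)
{
	/* Path follows from the "swap_prefetch" vm_table entry added above. */
	FILE *f = fopen("/proc/sys/vm/swap_prefetch", "w");

	if (!f) {
		perror("/proc/sys/vm/swap_prefetch");
		return 1;
	}
	fputs("0\n", f);	/* disable swap prefetching */
	fclose(f);
	return 0;
}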
-+ - config SYSVIPC - bool "System V IPC" - ---help--- -Index: linux-2.6.21-ck1/mm/Makefile -=================================================================== ---- linux-2.6.21-ck1.orig/mm/Makefile 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/Makefile 2007-05-04 12:24:20.000000000 +1000 -@@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) - obj-y += bounce.o - endif - obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o -+obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o - obj-$(CONFIG_HUGETLBFS) += hugetlb.o - obj-$(CONFIG_NUMA) += mempolicy.o - obj-$(CONFIG_SPARSEMEM) += sparse.o -Index: linux-2.6.21-ck1/mm/swap.c -=================================================================== ---- linux-2.6.21-ck1.orig/mm/swap.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/swap.c 2007-05-04 12:24:21.000000000 +1000 -@@ -17,6 +17,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed); - */ - static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; - static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; -+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; - - void fastcall lru_cache_add(struct page *page) - { -@@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc - put_cpu_var(lru_add_active_pvecs); - } - -+static void __pagevec_lru_add_tail(struct pagevec *pvec) -+{ -+ int i; -+ struct zone *zone = NULL; -+ -+ for (i = 0; i < pagevec_count(pvec); i++) { -+ struct page *page = pvec->pages[i]; -+ struct zone *pagezone = page_zone(page); -+ -+ if (pagezone != zone) { -+ if (zone) -+ spin_unlock_irq(&zone->lru_lock); -+ zone = pagezone; -+ spin_lock_irq(&zone->lru_lock); -+ } -+ BUG_ON(PageLRU(page)); -+ SetPageLRU(page); -+ add_page_to_inactive_list_tail(zone, page); -+ } -+ if (zone) -+ spin_unlock_irq(&zone->lru_lock); -+ release_pages(pvec->pages, pvec->nr, pvec->cold); -+ pagevec_reinit(pvec); -+} -+ - static void __lru_add_drain(int cpu) - { - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); -@@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); -+ pvec = &per_cpu(lru_add_tail_pvecs, cpu); -+ if (pagevec_count(pvec)) -+ __pagevec_lru_add_tail(pvec); - } - - void lru_add_drain(void) -@@ -403,6 +433,20 @@ void __pagevec_lru_add_active(struct pag - } - - /* -+ * Function used uniquely to put pages back to the lru at the end of the -+ * inactive list to preserve the lru order. 
-+ */ -+void fastcall lru_cache_add_tail(struct page *page) -+{ -+ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); -+ -+ page_cache_get(page); -+ if (!pagevec_add(pvec, page)) -+ __pagevec_lru_add_tail(pvec); -+ put_cpu_var(lru_add_pvecs); -+} -+ -+/* - * Try to drop buffers from the pages in a pagevec - */ - void pagevec_strip(struct pagevec *pvec) -@@ -514,6 +558,9 @@ void __init swap_setup(void) - * Right now other parts of the system means that we - * _really_ don't want to cluster much more - */ -+ -+ prepare_swap_prefetch(); -+ - #ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); - #endif -Index: linux-2.6.21-ck1/mm/swap_prefetch.c -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-2.6.21-ck1/mm/swap_prefetch.c 2007-05-04 12:24:20.000000000 +1000 -@@ -0,0 +1,581 @@ -+/* -+ * linux/mm/swap_prefetch.c -+ * -+ * Copyright (C) 2005-2006 Con Kolivas -+ * -+ * Written by Con Kolivas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There -+ * needs to be at least this duration of idle time meaning in practice it can -+ * be much longer -+ */ -+#define PREFETCH_DELAY (HZ * 5) -+ -+/* sysctl - enable/disable swap prefetching */ -+int swap_prefetch __read_mostly = 1; -+ -+struct swapped_root { -+ unsigned long busy; /* vm busy */ -+ spinlock_t lock; /* protects all data */ -+ struct list_head list; /* MRU list of swapped pages */ -+ struct radix_tree_root swap_tree; /* Lookup tree of pages */ -+ unsigned int count; /* Number of entries */ -+ unsigned int maxcount; /* Maximum entries allowed */ -+ struct kmem_cache *cache; /* Of struct swapped_entry */ -+}; -+ -+static struct swapped_root swapped = { -+ .lock = SPIN_LOCK_UNLOCKED, -+ .list = LIST_HEAD_INIT(swapped.list), -+ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), -+}; -+ -+static struct task_struct *kprefetchd_task; -+ -+/* -+ * We check to see no part of the vm is busy. If it is this will interrupt -+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. -+ */ -+inline void delay_swap_prefetch(void) -+{ -+ if (!test_bit(0, &swapped.busy)) -+ __set_bit(0, &swapped.busy); -+} -+ -+/* -+ * Drop behind accounting which keeps a list of the most recently used swap -+ * entries. -+ */ -+void add_to_swapped_list(struct page *page) -+{ -+ struct swapped_entry *entry; -+ unsigned long index, flags; -+ int wakeup; -+ -+ if (!swap_prefetch) -+ return; -+ -+ wakeup = 0; -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ if (swapped.count >= swapped.maxcount) { -+ /* -+ * We limit the number of entries to 2/3 of physical ram. -+ * Once the number of entries exceeds this we start removing -+ * the least recently used entries. 
-+ */ -+ entry = list_entry(swapped.list.next, -+ struct swapped_entry, swapped_list); -+ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); -+ list_del(&entry->swapped_list); -+ swapped.count--; -+ } else { -+ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); -+ if (unlikely(!entry)) -+ /* bad, can't allocate more mem */ -+ goto out_locked; -+ } -+ -+ index = page_private(page); -+ entry->swp_entry.val = index; -+ /* -+ * On numa we need to store the node id to ensure that we prefetch to -+ * the same node it came from. -+ */ -+ store_swap_entry_node(entry, page); -+ -+ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { -+ /* -+ * If this is the first entry, kprefetchd needs to be -+ * (re)started. -+ */ -+ if (!swapped.count) -+ wakeup = 1; -+ list_add(&entry->swapped_list, &swapped.list); -+ swapped.count++; -+ } -+ -+out_locked: -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ -+ /* Do the wakeup outside the lock to shorten lock hold time. */ -+ if (wakeup) -+ wake_up_process(kprefetchd_task); -+ -+ return; -+} -+ -+/* -+ * Removes entries from the swapped_list. The radix tree allows us to quickly -+ * look up the entry from the index without having to iterate over the whole -+ * list. -+ */ -+void remove_from_swapped_list(const unsigned long index) -+{ -+ struct swapped_entry *entry; -+ unsigned long flags; -+ -+ if (list_empty(&swapped.list)) -+ return; -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ entry = radix_tree_delete(&swapped.swap_tree, index); -+ if (likely(entry)) { -+ list_del_init(&entry->swapped_list); -+ swapped.count--; -+ kmem_cache_free(swapped.cache, entry); -+ } -+ spin_unlock_irqrestore(&swapped.lock, flags); -+} -+ -+enum trickle_return { -+ TRICKLE_SUCCESS, -+ TRICKLE_FAILED, -+ TRICKLE_DELAY, -+}; -+ -+struct node_stats { -+ unsigned long last_free; -+ /* Free ram after a cycle of prefetching */ -+ unsigned long current_free; -+ /* Free ram on this cycle of checking prefetch_suitable */ -+ unsigned long prefetch_watermark; -+ /* Maximum amount we will prefetch to */ -+ unsigned long highfree[MAX_NR_ZONES]; -+ /* The amount of free ram before we start prefetching */ -+ unsigned long lowfree[MAX_NR_ZONES]; -+ /* The amount of free ram where we will stop prefetching */ -+ unsigned long *pointfree[MAX_NR_ZONES]; -+ /* highfree or lowfree depending on whether we've hit a watermark */ -+}; -+ -+/* -+ * prefetch_stats stores the free ram data of each node and this is used to -+ * determine if a node is suitable for prefetching into. -+ */ -+struct prefetch_stats { -+ nodemask_t prefetch_nodes; -+ /* Which nodes are currently suited to prefetching */ -+ unsigned long prefetched_pages; -+ /* Total pages we've prefetched on this wakeup of kprefetchd */ -+ struct node_stats node[MAX_NUMNODES]; -+}; -+ -+static struct prefetch_stats sp_stat; -+ -+/* -+ * This tries to read a swp_entry_t into swap cache for swap prefetching. -+ * If it returns TRICKLE_DELAY we should delay further prefetching. -+ */ -+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, -+ const int node) -+{ -+ enum trickle_return ret = TRICKLE_FAILED; -+ struct page *page; -+ -+ read_lock_irq(&swapper_space.tree_lock); -+ /* Entry may already exist */ -+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); -+ read_unlock_irq(&swapper_space.tree_lock); -+ if (page) { -+ remove_from_swapped_list(entry.val); -+ goto out; -+ } -+ -+ /* -+ * Get a new page to read from swap. 
We have already checked the -+ * watermarks so __alloc_pages will not call on reclaim. -+ */ -+ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); -+ if (unlikely(!page)) { -+ ret = TRICKLE_DELAY; -+ goto out; -+ } -+ -+ if (add_to_swap_cache(page, entry)) { -+ /* Failed to add to swap cache */ -+ goto out_release; -+ } -+ -+ /* Add them to the tail of the inactive list to preserve LRU order */ -+ lru_cache_add_tail(page); -+ if (unlikely(swap_readpage(NULL, page))) { -+ ret = TRICKLE_DELAY; -+ goto out_release; -+ } -+ -+ sp_stat.prefetched_pages++; -+ sp_stat.node[node].last_free--; -+ -+ ret = TRICKLE_SUCCESS; -+out_release: -+ page_cache_release(page); -+out: -+ return ret; -+} -+ -+static void clear_last_prefetch_free(void) -+{ -+ int node; -+ -+ /* -+ * Reset the nodes suitable for prefetching to all nodes. We could -+ * update the data to take into account memory hotplug if desired.. -+ */ -+ sp_stat.prefetch_nodes = node_online_map; -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ ns->last_free = 0; -+ } -+} -+ -+static void clear_current_prefetch_free(void) -+{ -+ int node; -+ -+ sp_stat.prefetch_nodes = node_online_map; -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ ns->current_free = 0; -+ } -+} -+ -+/* -+ * This updates the high and low watermarks of amount of free ram in each -+ * node used to start and stop prefetching. We prefetch from pages_high * 4 -+ * down to pages_high * 3. -+ */ -+static void examine_free_limits(void) -+{ -+ struct zone *z; -+ -+ for_each_zone(z) { -+ struct node_stats *ns; -+ int idx; -+ -+ if (!populated_zone(z)) -+ continue; -+ -+ ns = &sp_stat.node[z->zone_pgdat->node_id]; -+ idx = zone_idx(z); -+ ns->lowfree[idx] = z->pages_high * 3; -+ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; -+ -+ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { -+ /* -+ * We've gotten above the high watermark of free pages -+ * so we can start prefetching till we get to the low -+ * watermark. -+ */ -+ ns->pointfree[idx] = &ns->lowfree[idx]; -+ } -+ } -+} -+ -+/* -+ * We want to be absolutely certain it's ok to start prefetching. -+ */ -+static int prefetch_suitable(void) -+{ -+ unsigned long limit; -+ struct zone *z; -+ int node, ret = 0, test_pagestate = 0; -+ -+ /* Purposefully racy */ -+ if (test_bit(0, &swapped.busy)) { -+ __clear_bit(0, &swapped.busy); -+ goto out; -+ } -+ -+ /* -+ * get_page_state and above_background_load are expensive so we only -+ * perform them every SWAP_CLUSTER_MAX prefetched_pages. -+ * We test to see if we're above_background_load as disk activity -+ * even at low priority can cause interrupt induced scheduling -+ * latencies. -+ */ -+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { -+ if (above_background_load()) -+ goto out; -+ test_pagestate = 1; -+ } -+ -+ clear_current_prefetch_free(); -+ -+ /* -+ * Have some hysteresis between where page reclaiming and prefetching -+ * will occur to prevent ping-ponging between them. -+ */ -+ for_each_zone(z) { -+ struct node_stats *ns; -+ unsigned long free; -+ int idx; -+ -+ if (!populated_zone(z)) -+ continue; -+ -+ node = z->zone_pgdat->node_id; -+ ns = &sp_stat.node[node]; -+ idx = zone_idx(z); -+ -+ free = zone_page_state(z, NR_FREE_PAGES); -+ if (free < *ns->pointfree[idx]) { -+ /* -+ * Free pages have dropped below the low watermark so -+ * we won't start prefetching again till we hit the -+ * high watermark of free pages. 
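The lowfree/highfree pair above gives each zone a hysteresis band: prefetching stops once free pages fall under the low mark and does not resume until they climb past the high mark, so kprefetchd and reclaim do not ping-pong. A small standalone sketch of that trip-point logic, with thresholds taken from the comment (low = pages_high * 3, high = low + pages_high) and all other names invented:

#include <stdio.h>

struct zone_state {
	unsigned long lowfree;      /* stop prefetching below this */
	unsigned long highfree;     /* resume prefetching above this */
	unsigned long *pointfree;   /* current trip point: &lowfree or &highfree */
};

static int may_prefetch(struct zone_state *z, unsigned long free_pages)
{
	if (free_pages < *z->pointfree) {
		/* Dropped below the trip point: wait for the high mark. */
		z->pointfree = &z->highfree;
		return 0;
	}
	/* Above the trip point: prefetch until we fall under the low mark. */
	z->pointfree = &z->lowfree;
	return 1;
}

int main(void)
{
	unsigned long pages_high = 100;
	struct zone_state z = {
		.lowfree = pages_high * 3,
		.highfree = pages_high * 4,
	};
	unsigned long samples[] = { 450, 320, 290, 350, 390, 410, 330 };
	unsigned int i;

	z.pointfree = &z.highfree;
	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("free=%lu prefetch=%d\n",
		       samples[i], may_prefetch(&z, samples[i]));
	return 0;
}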
-+ */ -+ ns->pointfree[idx] = &ns->highfree[idx]; -+ node_clear(node, sp_stat.prefetch_nodes); -+ continue; -+ } -+ ns->current_free += free; -+ } -+ -+ /* -+ * We iterate over each node testing to see if it is suitable for -+ * prefetching and clear the nodemask if it is not. -+ */ -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ /* -+ * We check to see that pages are not being allocated -+ * elsewhere at any significant rate implying any -+ * degree of memory pressure (eg during file reads) -+ */ -+ if (ns->last_free) { -+ if (ns->current_free + SWAP_CLUSTER_MAX < -+ ns->last_free) { -+ ns->last_free = ns->current_free; -+ node_clear(node, -+ sp_stat.prefetch_nodes); -+ continue; -+ } -+ } else -+ ns->last_free = ns->current_free; -+ -+ if (!test_pagestate) -+ continue; -+ -+ /* We shouldn't prefetch when we are doing writeback */ -+ if (node_page_state(node, NR_WRITEBACK)) { -+ node_clear(node, sp_stat.prefetch_nodes); -+ continue; -+ } -+ -+ /* -+ * >2/3 of the ram on this node is mapped, slab, swapcache or -+ * dirty, we need to leave some free for pagecache. -+ */ -+ limit = node_page_state(node, NR_FILE_PAGES); -+ limit += node_page_state(node, NR_SLAB_RECLAIMABLE); -+ limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); -+ limit += node_page_state(node, NR_FILE_DIRTY); -+ limit += node_page_state(node, NR_UNSTABLE_NFS); -+ limit += total_swapcache_pages; -+ if (limit > ns->prefetch_watermark) { -+ node_clear(node, sp_stat.prefetch_nodes); -+ continue; -+ } -+ } -+ -+ if (nodes_empty(sp_stat.prefetch_nodes)) -+ goto out; -+ -+ /* Survived all that? Hooray we can prefetch! */ -+ ret = 1; -+out: -+ return ret; -+} -+ -+/* -+ * Get previous swapped entry when iterating over all entries. swapped.lock -+ * should be held and we should already ensure that entry exists. -+ */ -+static inline struct swapped_entry *prev_swapped_entry -+ (struct swapped_entry *entry) -+{ -+ return list_entry(entry->swapped_list.prev->prev, -+ struct swapped_entry, swapped_list); -+} -+ -+/* -+ * trickle_swap is the main function that initiates the swap prefetching. It -+ * first checks to see if the busy flag is set, and does not prefetch if it -+ * is, as the flag implied we are low on memory or swapping in currently. -+ * Otherwise it runs until prefetch_suitable fails which occurs when the -+ * vm is busy, we prefetch to the watermark, or the list is empty or we have -+ * iterated over all entries -+ */ -+static enum trickle_return trickle_swap(void) -+{ -+ enum trickle_return ret = TRICKLE_DELAY; -+ struct swapped_entry *entry; -+ unsigned long flags; -+ -+ /* -+ * If laptop_mode is enabled don't prefetch to avoid hard drives -+ * doing unnecessary spin-ups -+ */ -+ if (!swap_prefetch || laptop_mode) -+ return ret; -+ -+ examine_free_limits(); -+ entry = NULL; -+ -+ for ( ; ; ) { -+ swp_entry_t swp_entry; -+ int node; -+ -+ if (!prefetch_suitable()) -+ break; -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ if (list_empty(&swapped.list)) { -+ ret = TRICKLE_FAILED; -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ break; -+ } -+ -+ if (!entry) { -+ /* -+ * This sets the entry for the first iteration. It -+ * also is a safeguard against the entry disappearing -+ * while the lock is not held. 
-+ */ -+ entry = list_entry(swapped.list.prev, -+ struct swapped_entry, swapped_list); -+ } else if (entry->swapped_list.prev == swapped.list.next) { -+ /* -+ * If we have iterated over all entries and there are -+ * still entries that weren't swapped out there may -+ * be a reason we could not swap them back in so -+ * delay attempting further prefetching. -+ */ -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ break; -+ } -+ -+ node = get_swap_entry_node(entry); -+ if (!node_isset(node, sp_stat.prefetch_nodes)) { -+ /* -+ * We found an entry that belongs to a node that is -+ * not suitable for prefetching so skip it. -+ */ -+ entry = prev_swapped_entry(entry); -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ continue; -+ } -+ swp_entry = entry->swp_entry; -+ entry = prev_swapped_entry(entry); -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ -+ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) -+ break; -+ } -+ -+ if (sp_stat.prefetched_pages) { -+ lru_add_drain(); -+ sp_stat.prefetched_pages = 0; -+ } -+ return ret; -+} -+ -+static int kprefetchd(void *__unused) -+{ -+ struct sched_param param = { .sched_priority = 0 }; -+ -+ sched_setscheduler(current, SCHED_BATCH, ¶m); -+ set_user_nice(current, 19); -+ /* Set ioprio to lowest if supported by i/o scheduler */ -+ sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); -+ -+ /* kprefetchd has nothing to do until it is woken up the first time */ -+ set_current_state(TASK_INTERRUPTIBLE); -+ schedule(); -+ -+ do { -+ try_to_freeze(); -+ -+ /* -+ * TRICKLE_FAILED implies no entries left - we do not schedule -+ * a wakeup, and further delay the next one. -+ */ -+ if (trickle_swap() == TRICKLE_FAILED) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ schedule(); -+ } -+ clear_last_prefetch_free(); -+ schedule_timeout_interruptible(PREFETCH_DELAY); -+ } while (!kthread_should_stop()); -+ -+ return 0; -+} -+ -+/* -+ * Create kmem cache for swapped entries -+ */ -+void __init prepare_swap_prefetch(void) -+{ -+ struct zone *zone; -+ -+ swapped.cache = kmem_cache_create("swapped_entry", -+ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); -+ -+ /* -+ * Set max number of entries to 2/3 the size of physical ram as we -+ * only ever prefetch to consume 2/3 of the ram. 
-+ */ -+ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; -+ -+ for_each_zone(zone) { -+ unsigned long present; -+ struct node_stats *ns; -+ int idx; -+ -+ present = zone->present_pages; -+ if (!present) -+ continue; -+ -+ ns = &sp_stat.node[zone->zone_pgdat->node_id]; -+ ns->prefetch_watermark += present / 3 * 2; -+ idx = zone_idx(zone); -+ ns->pointfree[idx] = &ns->highfree[idx]; -+ } -+} -+ -+static int __init kprefetchd_init(void) -+{ -+ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); -+ -+ return 0; -+} -+ -+static void __exit kprefetchd_exit(void) -+{ -+ kthread_stop(kprefetchd_task); -+} -+ -+module_init(kprefetchd_init); -+module_exit(kprefetchd_exit); -Index: linux-2.6.21-ck1/mm/swap_state.c -=================================================================== ---- linux-2.6.21-ck1.orig/mm/swap_state.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/swap_state.c 2007-05-04 12:24:20.000000000 +1000 -@@ -10,6 +10,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa - error = radix_tree_insert(&swapper_space.page_tree, - entry.val, page); - if (!error) { -+ remove_from_swapped_list(entry.val); - page_cache_get(page); - SetPageLocked(page); - SetPageSwapCache(page); -@@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa - return error; - } - --static int add_to_swap_cache(struct page *page, swp_entry_t entry) -+int add_to_swap_cache(struct page *page, swp_entry_t entry) - { - int error; - - if (!swap_duplicate(entry)) { -+ remove_from_swapped_list(entry.val); - INC_CACHE_INFO(noent_race); - return -ENOENT; - } -@@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_ - swp_entry_t entry; - int err; - -+ /* Swap prefetching is delayed if we're swapping pages */ -+ delay_swap_prefetch(); -+ - BUG_ON(!PageLocked(page)); - - for (;;) { -@@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e - struct page *found_page, *new_page = NULL; - int err; - -+ /* Swap prefetching is delayed if we're already reading from swap */ -+ delay_swap_prefetch(); -+ - do { - /* - * First check the swap cache. Since this is normally -Index: linux-2.6.21-ck1/mm/vmscan.c -=================================================================== ---- linux-2.6.21-ck1.orig/mm/vmscan.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/vmscan.c 2007-05-04 12:24:21.000000000 +1000 -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -36,6 +37,7 @@ - #include - #include - #include -+#include - #include - - #include -@@ -63,7 +65,7 @@ struct scan_control { - * whole list at once. */ - int swap_cluster_max; - -- int swappiness; -+ int mapped; - - int all_unreclaimable; - }; -@@ -110,9 +112,10 @@ struct shrinker { - #endif - - /* -- * From 0 .. 100. Higher means more swappy. -+ * From 0 .. 100. Lower means more swappy. 
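With vm_swappiness replaced by vm_mapped, the tendency value crosses the reclaim-mapped threshold roughly when the mapped ratio reaches the sysctl value, which is why a lower setting is more swappy. A standalone sketch of the replacement formula for a few sample ratios, assuming the usual comparison against 100 in the surrounding shrink_active_list() code, vm_hardmaplimit enabled and no distress:

#include <stdio.h>

static int swap_tendency(int mapped_ratio, int distress,
			 int vm_mapped, int hardmaplimit)
{
	int tendency = mapped_ratio * 100 / (vm_mapped + 1);

	if (!hardmaplimit || distress == 100)
		tendency += distress;
	return tendency;
}

int main(void)
{
	int ratios[] = { 30, 50, 67, 90 };
	unsigned int i;

	for (i = 0; i < sizeof(ratios) / sizeof(ratios[0]); i++) {
		int t = swap_tendency(ratios[i], 0, 66, 1);

		printf("mapped_ratio=%2d%% -> tendency=%3d (%s)\n",
		       ratios[i], t,
		       t >= 100 ? "reclaim mapped" : "keep mapped");
	}
	return 0;
}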
- */ --int vm_swappiness = 60; -+int vm_mapped __read_mostly = 66; -+int vm_hardmaplimit __read_mostly = 1; - long vm_total_pages; /* The total number of pages which the VM controls */ - - static LIST_HEAD(shrinker_list); -@@ -424,6 +427,7 @@ int remove_mapping(struct address_space - - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; -+ add_to_swapped_list(page); - __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); - swap_free(swap); -@@ -807,10 +811,14 @@ static void shrink_active_list(unsigned - * The distress ratio is important - we don't want to start - * going oom. - * -- * A 100% value of vm_swappiness overrides this algorithm -- * altogether. -+ * This distress value is ignored if we apply a hardmaplimit except -+ * in extreme distress. -+ * -+ * A 0% value of vm_mapped overrides this algorithm altogether. - */ -- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; -+ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); -+ if (!vm_hardmaplimit || distress == 100) -+ swap_tendency += distress; - - /* - * Now use this metric to decide whether to start moving mapped -@@ -959,6 +967,41 @@ static unsigned long shrink_zone(int pri - } - - /* -+ * Helper functions to adjust nice level of kswapd, based on the priority of -+ * the task (p) that called it. If it is already higher priority we do not -+ * demote its nice level since it is still working on behalf of a higher -+ * priority task. With kernel threads we leave it at nice 0. -+ * -+ * We don't ever run kswapd real time, so if a real time task calls kswapd we -+ * set it to highest SCHED_NORMAL priority. -+ */ -+static int effective_sc_prio(struct task_struct *p) -+{ -+ if (likely(p->mm)) { -+ if (rt_task(p)) -+ return -20; -+ if (idleprio_task(p)) -+ return 19; -+ return task_nice(p); -+ } -+ return 0; -+} -+ -+static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, -+ int active) -+{ -+ long nice = effective_sc_prio(p); -+ -+ if (task_nice(kswapd) > nice || !active) -+ set_user_nice(kswapd, nice); -+} -+ -+static int sc_priority(struct task_struct *p) -+{ -+ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); -+} -+ -+/* - * This is the direct reclaim path, for page-allocating processes. We only - * try to reclaim pages from zones which will satisfy the caller's allocation - * request. -@@ -1015,7 +1058,8 @@ static unsigned long shrink_zones(int pr - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. 
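sc_priority() above scales the starting reclaim priority by the caller's effective nice level around DEF_PRIORITY. A standalone sketch of that arithmetic, assuming DEF_PRIORITY is 12 as in this kernel series:

#include <stdio.h>

#define DEF_PRIORITY 12

static int sc_priority(int nice)
{
	return DEF_PRIORITY + (DEF_PRIORITY * nice / 40);
}

int main(void)
{
	int nices[] = { -20, -10, 0, 10, 19 };
	unsigned int i;

	for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
		printf("nice %3d -> scan_priority %2d\n",
		       nices[i], sc_priority(nices[i]));
	/* A higher starting priority number means reclaim begins by scanning
	 * a smaller slice of the LRU, so nicer callers reclaim more gently. */
	return 0;
}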
- */ --unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) -+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, -+ struct task_struct *p) - { - int priority; - int ret = 0; -@@ -1023,15 +1067,20 @@ unsigned long try_to_free_pages(struct z - unsigned long nr_reclaimed = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; -- int i; -+ int i, scan_priority = DEF_PRIORITY; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - -+ if (p) -+ scan_priority = sc_priority(p); -+ -+ delay_swap_prefetch(); -+ - count_vm_event(ALLOCSTALL); - - for (i = 0; zones[i] != NULL; i++) { -@@ -1044,7 +1093,7 @@ unsigned long try_to_free_pages(struct z - + zone_page_state(zone, NR_INACTIVE); - } - -- for (priority = DEF_PRIORITY; priority >= 0; priority--) { -+ for (priority = scan_priority; priority >= 0; priority--) { - sc.nr_scanned = 0; - if (!priority) - disable_swap_token(); -@@ -1074,7 +1123,7 @@ unsigned long try_to_free_pages(struct z - } - - /* Take a nap, wait for some writeback to complete */ -- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) -+ if (sc.nr_scanned && priority < scan_priority - 2) - congestion_wait(WRITE, HZ/10); - } - /* top priority shrink_caches still had more to do? don't OOM, then */ -@@ -1124,9 +1173,9 @@ out: - */ - static unsigned long balance_pgdat(pg_data_t *pgdat, int order) - { -- int all_zones_ok; -+ int all_zones_ok = 0; - int priority; -- int i; -+ int i, scan_priority; - unsigned long total_scanned; - unsigned long nr_reclaimed; - struct reclaim_state *reclaim_state = current->reclaim_state; -@@ -1134,7 +1183,7 @@ static unsigned long balance_pgdat(pg_da - .gfp_mask = GFP_KERNEL, - .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - /* - * temp_priority is used to remember the scanning priority at which -@@ -1142,6 +1191,8 @@ static unsigned long balance_pgdat(pg_da - */ - int temp_priority[MAX_NR_ZONES]; - -+ scan_priority = sc_priority(pgdat->kswapd); -+ - loop_again: - total_scanned = 0; - nr_reclaimed = 0; -@@ -1149,9 +1200,9 @@ loop_again: - count_vm_event(PAGEOUTRUN); - - for (i = 0; i < pgdat->nr_zones; i++) -- temp_priority[i] = DEF_PRIORITY; -+ temp_priority[i] = scan_priority; - -- for (priority = DEF_PRIORITY; priority >= 0; priority--) { -+ for (priority = scan_priority; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long lru_pages = 0; - -@@ -1167,15 +1218,22 @@ loop_again: - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; -+ unsigned long watermark; - - if (!populated_zone(zone)) - continue; - -- if (zone->all_unreclaimable && priority != DEF_PRIORITY) -+ if (zone->all_unreclaimable && priority != scan_priority) - continue; - -- if (!zone_watermark_ok(zone, order, zone->pages_high, -- 0, 0)) { -+ /* -+ * The watermark is relaxed depending on the -+ * level of "priority" till it drops to -+ * pages_high. 
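The relaxed watermark described above starts at twice pages_high on the first pass and decays to plain pages_high as the scan priority counts down; the hunk below computes it as pages_high + pages_high * priority / scan_priority. A standalone sketch with invented sample numbers:

#include <stdio.h>

int main(void)
{
	unsigned long pages_high = 128;
	int scan_priority = 12;    /* DEF_PRIORITY for a nice-0 caller */
	int priority;

	for (priority = scan_priority; priority >= 0; priority--) {
		unsigned long watermark = pages_high +
			pages_high * priority / scan_priority;

		printf("priority %2d -> watermark %lu pages\n",
		       priority, watermark);
	}
	return 0;
}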
-+ */ -+ watermark = zone->pages_high + (zone->pages_high * -+ priority / scan_priority); -+ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { - end_zone = i; - break; - } -@@ -1202,14 +1260,18 @@ loop_again: - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - int nr_slab; -+ unsigned long watermark; - - if (!populated_zone(zone)) - continue; - -- if (zone->all_unreclaimable && priority != DEF_PRIORITY) -+ if (zone->all_unreclaimable && priority != scan_priority) - continue; - -- if (!zone_watermark_ok(zone, order, zone->pages_high, -+ watermark = zone->pages_high + (zone->pages_high * -+ priority / scan_priority); -+ -+ if (!zone_watermark_ok(zone, order, watermark, - end_zone, 0)) - all_zones_ok = 0; - temp_priority[i] = priority; -@@ -1242,7 +1304,7 @@ loop_again: - * OK, kswapd is getting into trouble. Take a nap, then take - * another pass across the zones. - */ -- if (total_scanned && priority < DEF_PRIORITY - 2) -+ if (total_scanned && priority < scan_priority - 2) - congestion_wait(WRITE, HZ/10); - - /* -@@ -1276,6 +1338,8 @@ out: - return nr_reclaimed; - } - -+#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ -+ - /* - * The background pageout daemon, started as a kernel thread - * from the init process. -@@ -1325,6 +1389,8 @@ static int kswapd(void *p) - - try_to_freeze(); - -+ /* kswapd has been busy so delay watermark_timer */ -+ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - new_order = pgdat->kswapd_max_order; - pgdat->kswapd_max_order = 0; -@@ -1335,6 +1401,7 @@ static int kswapd(void *p) - */ - order = new_order; - } else { -+ set_user_nice(tsk, 0); - schedule(); - order = pgdat->kswapd_max_order; - } -@@ -1348,9 +1415,10 @@ static int kswapd(void *p) - /* - * A zone is low on free memory, so wake its kswapd task to service it. 
- */ --void wakeup_kswapd(struct zone *zone, int order) -+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) - { - pg_data_t *pgdat; -+ int active; - - if (!populated_zone(zone)) - return; -@@ -1362,7 +1430,9 @@ void wakeup_kswapd(struct zone *zone, in - pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; -- if (!waitqueue_active(&pgdat->kswapd_wait)) -+ active = waitqueue_active(&pgdat->kswapd_wait); -+ set_kswapd_nice(pgdat->kswapd, p, active); -+ if (!active) - return; - wake_up_interruptible(&pgdat->kswapd_wait); - } -@@ -1381,6 +1451,8 @@ static unsigned long shrink_all_zones(un - struct zone *zone; - unsigned long nr_to_scan, ret = 0; - -+ delay_swap_prefetch(); -+ - for_each_zone(zone) { - - if (!populated_zone(zone)) -@@ -1440,7 +1512,7 @@ unsigned long shrink_all_memory(unsigned - .may_swap = 0, - .swap_cluster_max = nr_pages, - .may_writepage = 1, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - - current->reclaim_state = &reclaim_state; -@@ -1475,7 +1547,7 @@ unsigned long shrink_all_memory(unsigned - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; -- sc.swappiness = 100; -+ sc.mapped = 0; - } - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { -@@ -1539,20 +1611,57 @@ static int __devinit cpu_callback(struct - } - - /* -+ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots -+ */ -+static void watermark_wakeup(unsigned long data) -+{ -+ pg_data_t *pgdat = (pg_data_t *)data; -+ struct timer_list *wt = &pgdat->watermark_timer; -+ int i; -+ -+ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) -+ goto out; -+ for (i = pgdat->nr_zones - 1; i >= 0; i--) { -+ struct zone *z = pgdat->node_zones + i; -+ -+ if (!populated_zone(z) || is_highmem(z)) { -+ /* We are better off leaving highmem full */ -+ continue; -+ } -+ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { -+ wake_up_interruptible(&pgdat->kswapd_wait); -+ goto out; -+ } -+ } -+out: -+ mod_timer(wt, jiffies + WT_EXPIRY); -+ return; -+} -+ -+/* - * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
- */ - int kswapd_run(int nid) - { - pg_data_t *pgdat = NODE_DATA(nid); -+ struct timer_list *wt; - int ret = 0; - - if (pgdat->kswapd) - return 0; - -+ wt = &pgdat->watermark_timer; -+ init_timer(wt); -+ wt->data = (unsigned long)pgdat; -+ wt->function = watermark_wakeup; -+ wt->expires = jiffies + WT_EXPIRY; -+ add_timer(wt); -+ - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ -+ del_timer(wt); - BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); - ret = -1; -@@ -1623,7 +1732,7 @@ static int __zone_reclaim(struct zone *z - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - unsigned long slab_reclaimable; - -Index: linux-2.6.21-ck1/include/linux/mm_inline.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/mm_inline.h 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/mm_inline.h 2007-05-04 12:24:20.000000000 +1000 -@@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z - } - - static inline void -+add_page_to_inactive_list_tail(struct zone *zone, struct page *page) -+{ -+ list_add_tail(&page->lru, &zone->inactive_list); -+ __inc_zone_state(zone, NR_INACTIVE); -+} -+ -+static inline void - del_page_from_active_list(struct zone *zone, struct page *page) - { - list_del(&page->lru); -Index: linux-2.6.21-ck1/include/linux/swap-prefetch.h -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-2.6.21-ck1/include/linux/swap-prefetch.h 2007-05-04 12:24:20.000000000 +1000 -@@ -0,0 +1,55 @@ -+#ifndef SWAP_PREFETCH_H_INCLUDED -+#define SWAP_PREFETCH_H_INCLUDED -+ -+#ifdef CONFIG_SWAP_PREFETCH -+/* mm/swap_prefetch.c */ -+extern int swap_prefetch; -+struct swapped_entry { -+ swp_entry_t swp_entry; /* The actual swap entry */ -+ struct list_head swapped_list; /* Linked list of entries */ -+#if MAX_NUMNODES > 1 -+ int node; /* Node id */ -+#endif -+} __attribute__((packed)); -+ -+static inline void store_swap_entry_node(struct swapped_entry *entry, -+ struct page *page) -+{ -+#if MAX_NUMNODES > 1 -+ entry->node = page_to_nid(page); -+#endif -+} -+ -+static inline int get_swap_entry_node(struct swapped_entry *entry) -+{ -+#if MAX_NUMNODES > 1 -+ return entry->node; -+#else -+ return 0; -+#endif -+} -+ -+extern void add_to_swapped_list(struct page *page); -+extern void remove_from_swapped_list(const unsigned long index); -+extern void delay_swap_prefetch(void); -+extern void prepare_swap_prefetch(void); -+ -+#else /* CONFIG_SWAP_PREFETCH */ -+static inline void add_to_swapped_list(struct page *__unused) -+{ -+} -+ -+static inline void prepare_swap_prefetch(void) -+{ -+} -+ -+static inline void remove_from_swapped_list(const unsigned long __unused) -+{ -+} -+ -+static inline void delay_swap_prefetch(void) -+{ -+} -+#endif /* CONFIG_SWAP_PREFETCH */ -+ -+#endif /* SWAP_PREFETCH_H_INCLUDED */ -Index: linux-2.6.21-ck1/include/linux/sysctl.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/sysctl.h 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/sysctl.h 2007-05-04 12:24:20.000000000 +1000 -@@ -190,7 +190,7 @@ enum - VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ - VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ - 
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ -- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ -+ VM_MAPPED=19, /* percent mapped min while evicting cache */ - VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ - VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ - VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ -Index: linux-2.6.21-ck1/include/linux/mmzone.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/mmzone.h 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/mmzone.h 2007-05-04 12:24:21.000000000 +1000 -@@ -13,6 +13,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -178,7 +179,7 @@ enum zone_type { - - struct zone { - /* Fields commonly accessed by the page allocator */ -- unsigned long pages_min, pages_low, pages_high; -+ unsigned long pages_min, pages_low, pages_high, pages_lots; - /* - * We don't know if the memory that we're going to allocate will be freeable - * or/and it will be released eventually, so to avoid totally wasting several -@@ -449,6 +450,7 @@ typedef struct pglist_data { - wait_queue_head_t kswapd_wait; - struct task_struct *kswapd; - int kswapd_max_order; -+ struct timer_list watermark_timer; - } pg_data_t; - - #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -@@ -465,7 +467,7 @@ typedef struct pglist_data { - void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); - void build_all_zonelists(void); --void wakeup_kswapd(struct zone *zone, int order); -+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); - int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); - enum memmap_context { -Index: linux-2.6.21-ck1/mm/page_alloc.c -=================================================================== ---- linux-2.6.21-ck1.orig/mm/page_alloc.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/page_alloc.c 2007-05-04 12:24:20.000000000 +1000 -@@ -1277,7 +1277,7 @@ restart: - goto nopage; - - for (z = zonelist->zones; *z; z++) -- wakeup_kswapd(*z, order); -+ wakeup_kswapd(*z, order, p); - - /* - * OK, we're below the kswapd watermark and have kicked background -@@ -1341,7 +1341,7 @@ nofail_alloc: - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - -- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); -+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); - - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; -@@ -1597,6 +1597,7 @@ void show_free_areas(void) - " min:%lukB" - " low:%lukB" - " high:%lukB" -+ " lots:%lukB" - " active:%lukB" - " inactive:%lukB" - " present:%lukB" -@@ -1608,6 +1609,7 @@ void show_free_areas(void) - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), -+ K(zone->pages_lots), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), - K(zone->present_pages), -@@ -3146,6 +3148,7 @@ void setup_per_zone_pages_min(void) - - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); -+ zone->pages_lots = zone->pages_min + tmp; - spin_unlock_irqrestore(&zone->lru_lock, flags); - } - -Index: linux-2.6.21-ck1/fs/buffer.c -=================================================================== ---- linux-2.6.21-ck1.orig/fs/buffer.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/fs/buffer.c 
2007-05-04 12:24:20.000000000 +1000 -@@ -363,7 +363,7 @@ static void free_more_memory(void) - for_each_online_pgdat(pgdat) { - zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; - if (*zones) -- try_to_free_pages(zones, GFP_NOFS); -+ try_to_free_pages(zones, GFP_NOFS, NULL); - } - } - -Index: linux-2.6.21-ck1/mm/filemap.c -=================================================================== ---- linux-2.6.21-ck1.orig/mm/filemap.c 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/mm/filemap.c 2007-05-04 12:24:21.000000000 +1000 -@@ -466,6 +466,16 @@ int add_to_page_cache_lru(struct page *p - return ret; - } - -+int add_to_page_cache_lru_tail(struct page *page, -+ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) -+{ -+ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); -+ -+ if (ret == 0) -+ lru_cache_add_tail(page); -+ return ret; -+} -+ - #ifdef CONFIG_NUMA - struct page *__page_cache_alloc(gfp_t gfp) - { -@@ -836,6 +846,34 @@ static void shrink_readahead_size_eio(st - ra->ra_pages /= 4; - } - -+/* -+ * Sysctl which determines whether we should read from large files to the -+ * tail of the inactive lru list. -+ */ -+int vm_tail_largefiles __read_mostly = 1; -+ -+static inline int nr_mapped(void) -+{ -+ return global_page_state(NR_FILE_MAPPED) + -+ global_page_state(NR_ANON_PAGES); -+} -+ -+/* -+ * This examines how large in pages a file size is and returns 1 if it is -+ * more than half the unmapped ram. Avoid doing read_page_state which is -+ * expensive unless we already know it is likely to be large enough. -+ */ -+static int large_isize(unsigned long nr_pages) -+{ -+ if (nr_pages * 6 > vm_total_pages) { -+ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); -+ -+ if (nr_pages * 2 > unmapped_ram) -+ return 1; -+ } -+ return 0; -+} -+ - /** - * do_generic_mapping_read - generic file read routine - * @mapping: address_space to be read -@@ -1044,8 +1082,19 @@ no_cached_page: - goto out; - } - } -- error = add_to_page_cache_lru(cached_page, mapping, -- index, GFP_KERNEL); -+ -+ /* -+ * If we know the file is large we add the pages read to the -+ * end of the lru as we're unlikely to be able to cache the -+ * whole file in ram so make those pages the first to be -+ * dropped if not referenced soon. -+ */ -+ if (vm_tail_largefiles && large_isize(end_index)) -+ error = add_to_page_cache_lru_tail(cached_page, -+ mapping, index, GFP_KERNEL); -+ else -+ error = add_to_page_cache_lru(cached_page, mapping, -+ index, GFP_KERNEL); - if (error) { - if (error == -EEXIST) - goto find_page; -Index: linux-2.6.21-ck1/Documentation/filesystems/proc.txt -=================================================================== ---- linux-2.6.21-ck1.orig/Documentation/filesystems/proc.txt 2007-05-04 12:24:01.000000000 +1000 -+++ linux-2.6.21-ck1/Documentation/filesystems/proc.txt 2007-05-04 12:24:21.000000000 +1000 -@@ -1325,6 +1325,14 @@ To free pagecache, dentries and inodes: - As this is a non-destructive operation and dirty objects are not freeable, the - user should run `sync' first. - -+tail_largefiles -+--------------- -+ -+When enabled reads from large files to the tail end of the inactive lru list. -+This means that any cache from reading large files is dropped very quickly, -+preventing loss of mapped ram and useful pagecache when large files are read. -+This does, however, make caching less effective when working with large files. 
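The large_isize()/nr_mapped() pair added above treats a file as large when it exceeds a sixth of total RAM and half of the currently unmapped RAM, and only then routes its pages to the tail of the inactive list. A standalone sketch of that test with plain parameters and invented sample sizes:

#include <stdio.h>

static int large_isize(unsigned long file_pages, unsigned long total_pages,
		       unsigned long mapped_pages)
{
	/* Cheap pre-filter: only bother if the file covers > 1/6 of RAM. */
	if (file_pages * 6 > total_pages) {
		unsigned long unmapped = total_pages - mapped_pages;

		/* Large if it would also cover more than half of unmapped RAM. */
		if (file_pages * 2 > unmapped)
			return 1;
	}
	return 0;
}

int main(void)
{
	/* 1 GiB of 4 KiB pages, with 40% of it mapped by processes. */
	unsigned long total = 262144, mapped = total * 2 / 5;
	unsigned long sizes[] = { 8192, 65536, 131072, 200000 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("file of %6lu pages -> %s\n", sizes[i],
		       large_isize(sizes[i], total, mapped) ?
		       "read to tail of inactive list" : "normal LRU placement");
	return 0;
}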
-+ - - 2.5 /proc/sys/dev - Device specific parameters - ---------------------------------------------- -Index: linux-2.6.21-ck1/arch/i386/Kconfig -=================================================================== ---- linux-2.6.21-ck1.orig/arch/i386/Kconfig 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/arch/i386/Kconfig 2007-05-04 12:24:21.000000000 +1000 -@@ -546,7 +546,7 @@ endchoice - - choice - depends on EXPERIMENTAL -- prompt "Memory split" if EMBEDDED -+ prompt "Memory split" - default VMSPLIT_3G - help - Select the desired split between kernel and user memory. -@@ -565,14 +565,14 @@ choice - option alone! - - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !HIGHMEM -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -Index: linux-2.6.21-ck1/kernel/Kconfig.hz -=================================================================== ---- linux-2.6.21-ck1.orig/kernel/Kconfig.hz 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/kernel/Kconfig.hz 2007-05-04 12:24:21.000000000 +1000 -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_1000 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -13,8 +13,7 @@ choice - contention and cacheline bounces as a result of timer interrupts. - Note that the timer interrupt occurs on each processor in an SMP - environment leading to NR_CPUS * HZ number of timer interrupts -- per second. -- -+ per second.Laptops may also show improved battery life. - - config HZ_100 - bool "100 HZ" -@@ -23,13 +22,14 @@ choice - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEFAULT - bool "250 HZ" - help -- 250 Hz is a good compromise choice allowing server performance -- while also showing good interactive responsiveness even -- on SMP and NUMA systems. If you are going to be using NTSC video -- or multimedia, selected 300Hz instead. -+ 250 HZ is a lousy compromise choice allowing server interactivity -+ while also showing desktop throughput and no extra power saving on -+ laptops. Good for when you can't make up your mind. -+ -+ Recommend 100 or 1000 instead. - - config HZ_300 - bool "300 HZ" -@@ -45,12 +45,76 @@ choice - 1000 Hz is the preferred choice for desktop systems and other - systems requiring fast interactive responses to events. - -+ config HZ_1500 -+ bool "1500 HZ" -+ help -+ 1500 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_2000 -+ bool "2000 HZ" -+ help -+ 2000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_3000 -+ bool "3000 HZ" -+ help -+ 3000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_4000 -+ bool "4000 HZ" -+ help -+ 4000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. 
-+ -+ config HZ_5000 -+ bool "5000 HZ" -+ help -+ 5000 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_7500 -+ bool "7500 HZ" -+ help -+ 7500 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_10000 -+ bool "10000 HZ" -+ help -+ 10000 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ - endchoice - - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -+ default 250 if HZ_250_NODEFAULT - default 300 if HZ_300 - default 1000 if HZ_1000 -+ default 1500 if HZ_1500 -+ default 2000 if HZ_2000 -+ default 3000 if HZ_3000 -+ default 4000 if HZ_4000 -+ default 5000 if HZ_5000 -+ default 7500 if HZ_7500 -+ default 10000 if HZ_10000 - -Index: linux-2.6.21-ck1/arch/i386/defconfig -=================================================================== ---- linux-2.6.21-ck1.orig/arch/i386/defconfig 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/arch/i386/defconfig 2007-05-04 12:24:21.000000000 +1000 -@@ -214,10 +214,10 @@ CONFIG_MTRR=y - # CONFIG_IRQBALANCE is not set - CONFIG_SECCOMP=y - # CONFIG_HZ_100 is not set --CONFIG_HZ_250=y -+# CONFIG_HZ_250 is not set - # CONFIG_HZ_300 is not set --# CONFIG_HZ_1000 is not set --CONFIG_HZ=250 -+CONFIG_HZ_1000=y -+CONFIG_HZ=1000 - # CONFIG_KEXEC is not set - # CONFIG_CRASH_DUMP is not set - CONFIG_PHYSICAL_START=0x100000 -Index: linux-2.6.21-ck1/arch/x86_64/defconfig -=================================================================== ---- linux-2.6.21-ck1.orig/arch/x86_64/defconfig 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/arch/x86_64/defconfig 2007-05-04 12:24:21.000000000 +1000 -@@ -178,10 +178,10 @@ CONFIG_PHYSICAL_START=0x200000 - CONFIG_SECCOMP=y - # CONFIG_CC_STACKPROTECTOR is not set - # CONFIG_HZ_100 is not set --CONFIG_HZ_250=y -+# CONFIG_HZ_250 is not set - # CONFIG_HZ_300 is not set --# CONFIG_HZ_1000 is not set --CONFIG_HZ=250 -+CONFIG_HZ_1000=y -+CONFIG_HZ=1000 - # CONFIG_REORDER is not set - CONFIG_K8_NB=y - CONFIG_GENERIC_HARDIRQS=y -Index: linux-2.6.21-ck1/include/linux/jiffies.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/jiffies.h 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/jiffies.h 2007-05-04 12:24:21.000000000 +1000 -@@ -29,6 +29,12 @@ - # define SHIFT_HZ 9 - #elif HZ >= 768 && HZ < 1536 - # define SHIFT_HZ 10 -+#elif HZ >= 1536 && HZ < 3072 -+# define SHIFT_HZ 11 -+#elif HZ >= 3072 && HZ < 6144 -+# define SHIFT_HZ 12 -+#elif HZ >= 6144 && HZ < 12288 -+# define SHIFT_HZ 13 - #else - # error You lose. - #endif -Index: linux-2.6.21-ck1/include/net/inet_timewait_sock.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/net/inet_timewait_sock.h 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/include/net/inet_timewait_sock.h 2007-05-04 12:24:21.000000000 +1000 -@@ -38,8 +38,8 @@ struct inet_hashinfo; - * If time > 4sec, it is "slow" path, no recycling is required, - * so that we select tick to get range about 4 seconds. 
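The new SHIFT_HZ brackets above keep the existing rule of roughly rounding HZ to the nearest power of two, each bracket spanning [3*2^(s-2), 3*2^(s-1)). A standalone sketch that recomputes the shift for the newly selectable HZ values:

#include <stdio.h>

static int shift_hz(unsigned int hz)
{
	int s = 4;                     /* HZ in [12, 24) maps to 4 */

	while (hz >= 3u << (s - 1))    /* upper bound of bracket s */
		s++;
	return s;
}

int main(void)
{
	unsigned int hzvals[] = { 100, 250, 300, 1000, 1500, 2000, 3000,
				  4000, 5000, 7500, 10000 };
	unsigned int i;

	for (i = 0; i < sizeof(hzvals) / sizeof(hzvals[0]); i++)
		printf("HZ=%5u -> SHIFT_HZ=%d\n", hzvals[i], shift_hz(hzvals[i]));
	return 0;
}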
- */ --#if HZ <= 16 || HZ > 4096 --# error Unsupported: HZ <= 16 or HZ > 4096 -+#if HZ <= 16 || HZ > 16384 -+# error Unsupported: HZ <= 16 or HZ > 16384 - #elif HZ <= 32 - # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #elif HZ <= 64 -@@ -54,8 +54,12 @@ struct inet_hashinfo; - # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #elif HZ <= 2048 - # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) --#else -+#elif HZ <= 4096 - # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -+#elif HZ <= 8192 -+# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -+#else -+# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #endif - - /* TIME_WAIT reaping mechanism. */ -Index: linux-2.6.21-ck1/include/net/pkt_sched.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/net/pkt_sched.h 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/include/net/pkt_sched.h 2007-05-04 12:24:21.000000000 +1000 -@@ -78,8 +78,14 @@ typedef long psched_tdiff_t; - #define PSCHED_JSCALE 12 - #elif HZ >= 384 && HZ < 768 - #define PSCHED_JSCALE 11 --#elif HZ >= 768 -+#elif HZ >= 768 && HZ < 1536 - #define PSCHED_JSCALE 10 -+#elif HZ >= 1536 && HZ < 3072 -+#define PSCHED_JSCALE 9 -+#elif HZ >= 3072 && HZ < 6144 -+#define PSCHED_JSCALE 8 -+#else -+#define PSCHED_JSCALE 7 - #endif - - #define PSCHED_GET_TIME(stamp) ((stamp) = (get_jiffies_64()<loops_per_jiffy/(500000/HZ), -- (c->loops_per_jiffy/(5000/HZ)) % 100); -+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); - seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); - - return 0; -Index: linux-2.6.21-ck1/arch/i386/kernel/smpboot.c -=================================================================== ---- linux-2.6.21-ck1.orig/arch/i386/kernel/smpboot.c 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/arch/i386/kernel/smpboot.c 2007-05-04 12:24:21.000000000 +1000 -@@ -1134,7 +1134,7 @@ static void __init smp_boot_cpus(unsigne - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), -- (bogosum/(5000/HZ))%100); -+ (bogosum * 10/(50000/HZ))%100); - - Dprintk("Before bogocount - setting activated=1.\n"); - -Index: linux-2.6.21-ck1/include/linux/nfsd/stats.h -=================================================================== ---- linux-2.6.21-ck1.orig/include/linux/nfsd/stats.h 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/include/linux/nfsd/stats.h 2007-05-04 12:24:21.000000000 +1000 -@@ -35,8 +35,8 @@ struct nfsd_stats { - - }; - --/* thread usage wraps very million seconds (approx one fortnight) */ --#define NFSD_USAGE_WRAP (HZ*1000000) -+/* thread usage wraps every one hundred thousand seconds (approx one day) */ -+#define NFSD_USAGE_WRAP (HZ*100000) - - #ifdef __KERNEL__ - -Index: linux-2.6.21-ck1/arch/x86_64/kernel/setup.c -=================================================================== ---- linux-2.6.21-ck1.orig/arch/x86_64/kernel/setup.c 2007-05-04 12:24:00.000000000 +1000 -+++ linux-2.6.21-ck1/arch/x86_64/kernel/setup.c 2007-05-04 12:24:22.000000000 +1000 -@@ -1053,7 +1053,7 @@ static int show_cpuinfo(struct seq_file - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), -- (c->loops_per_jiffy/(5000/HZ)) % 100); -+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); diff --git 
a/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 deleted file mode 100644 index 81fa14e2abe4..000000000000 --- a/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 +++ /dev/null @@ -1,5167 +0,0 @@ -Index: linux-2.6.22-ck1/include/linux/sched.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000 -@@ -34,9 +34,14 @@ - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO 5 - - #ifdef __KERNEL__ - -+#define SCHED_MAX SCHED_IDLEPRIO -+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) -+ - struct sched_param { - int sched_priority; - }; -@@ -129,7 +134,7 @@ - extern unsigned long nr_active(void); - extern unsigned long nr_iowait(void); - extern unsigned long weighted_cpuload(const int cpu); -- -+extern int above_background_load(void); - - /* - * Task state bitmask. NOTE! These bits are also -@@ -150,8 +155,7 @@ - #define EXIT_ZOMBIE 16 - #define EXIT_DEAD 32 - /* in tsk->state again */ --#define TASK_NONINTERACTIVE 64 --#define TASK_DEAD 128 -+#define TASK_DEAD 64 - - #define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -@@ -537,14 +541,19 @@ - - #define MAX_USER_RT_PRIO 100 - #define MAX_RT_PRIO MAX_USER_RT_PRIO -+#define PRIO_RANGE (40) -+#define ISO_PRIO (MAX_RT_PRIO - 1) - --#define MAX_PRIO (MAX_RT_PRIO + 40) -+#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) - --#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_prio(prio) unlikely((prio) < ISO_PRIO) - #define rt_task(p) rt_prio((p)->prio) - #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) --#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) - #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+#define iso_task(p) unlikely((p)->policy == SCHED_ISO) -+#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) - - /* - * Some day this will be a full-fledged user tracking system.. -@@ -809,13 +818,6 @@ - struct pipe_inode_info; - struct uts_namespace; - --enum sleep_type { -- SLEEP_NORMAL, -- SLEEP_NONINTERACTIVE, -- SLEEP_INTERACTIVE, -- SLEEP_INTERRUPTED, --}; -- - struct prio_array; - - struct task_struct { -@@ -835,20 +837,33 @@ - int load_weight; /* for niceness load balancing purposes */ - int prio, static_prio, normal_prio; - struct list_head run_list; -+ /* -+ * This bitmap shows what priorities this task has received quota -+ * from for this major priority rotation on its current runqueue. -+ */ -+ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); - struct prio_array *array; -+ /* Which major runqueue rotation did this task run */ -+ unsigned long rotation; - - unsigned short ioprio; - #ifdef CONFIG_BLK_DEV_IO_TRACE - unsigned int btrace_seq; - #endif -- unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - unsigned long long sched_time; /* sched_clock time spent running */ -- enum sleep_type sleep_type; - - unsigned int policy; - cpumask_t cpus_allowed; -- unsigned int time_slice, first_time_slice; -+ /* -+ * How much this task is entitled to run at the current priority -+ * before being requeued at a lower priority. -+ */ -+ int time_slice; -+ /* Is this the very first time_slice this task has ever run. 
*/ -+ unsigned int first_time_slice; -+ /* How much this task receives at each priority level */ -+ int quota; - - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - struct sched_info sched_info; -@@ -1013,6 +1028,7 @@ - struct held_lock held_locks[MAX_LOCK_DEPTH]; - unsigned int lockdep_recursion; - #endif -+ unsigned long mutexes_held; - - /* journalling filesystem info */ - void *journal_info; -@@ -1181,9 +1197,11 @@ - #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ - #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ - #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -+#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ - #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ - #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ - #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ -+#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */ - - /* - * Only the _current_ task can read/write to tsk->flags, but other -@@ -1253,7 +1271,7 @@ - #endif - - extern void set_user_nice(struct task_struct *p, long nice); --extern int task_prio(const struct task_struct *p); -+extern int task_prio(struct task_struct *p); - extern int task_nice(const struct task_struct *p); - extern int can_nice(const struct task_struct *p, const int nice); - extern int task_curr(const struct task_struct *p); -Index: linux-2.6.22-ck1/kernel/sched.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000 -@@ -16,6 +16,7 @@ - * by Davide Libenzi, preemptible kernel bits by Robert Love. - * 2003-09-03 Interactivity tuning by Con Kolivas. - * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas - */ - - #include -@@ -53,8 +54,9 @@ - #include - #include - #include -- -+#include - #include -+ - #include - - /* -@@ -84,147 +86,85 @@ - #define USER_PRIO(p) ((p)-MAX_RT_PRIO) - #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) - #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -+#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) - --/* -- * Some helpers for converting nanosecond timing to jiffy resolution -- */ --#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -+/* Some helpers for converting to/from various scales.*/ - #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -- --/* -- * These are the 'tuning knobs' of the scheduler: -- * -- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), -- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. -- * Timeslices get refilled after they expire. -- */ --#define MIN_TIMESLICE max(5 * HZ / 1000, 1) --#define DEF_TIMESLICE (100 * HZ / 1000) --#define ON_RUNQUEUE_WEIGHT 30 --#define CHILD_PENALTY 95 --#define PARENT_PENALTY 100 --#define EXIT_WEIGHT 3 --#define PRIO_BONUS_RATIO 25 --#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) --#define INTERACTIVE_DELTA 2 --#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) --#define STARVATION_LIMIT (MAX_SLEEP_AVG) --#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -- --/* -- * If a task is 'interactive' then we reinsert it in the active -- * array after it has expired its current timeslice. (it will not -- * continue to run immediately, it will still roundrobin with -- * other interactive tasks.) 
-- * -- * This part scales the interactivity limit depending on niceness. -- * -- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. -- * Here are a few examples of different nice levels: -- * -- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] -- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] -- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] -- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] -- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] -- * -- * (the X axis represents the possible -5 ... 0 ... +5 dynamic -- * priority range a task can explore, a value of '1' means the -- * task is rated interactive.) -- * -- * Ie. nice +19 tasks can never get 'interactive' enough to be -- * reinserted into the active array. And only heavily CPU-hog nice -20 -- * tasks will be expired. Default nice 0 tasks are somewhere between, -- * it takes some effort for them to get interactive, but it's not -- * too hard. -- */ -- --#define CURRENT_BONUS(p) \ -- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ -- MAX_SLEEP_AVG) -- --#define GRANULARITY (10 * HZ / 1000 ? : 1) -- --#ifdef CONFIG_SMP --#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ -- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ -- num_online_cpus()) --#else --#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ -- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) --#endif -- --#define SCALE(v1,v1_max,v2_max) \ -- (v1) * (v2_max) / (v1_max) -- --#define DELTA(p) \ -- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ -- INTERACTIVE_DELTA) -- --#define TASK_INTERACTIVE(p) \ -- ((p)->prio <= (p)->static_prio - DELTA(p)) -- --#define INTERACTIVE_SLEEP(p) \ -- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ -- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) -- --#define TASK_PREEMPTS_CURR(p, rq) \ -- ((p)->prio < (rq)->curr->prio) -- --#define SCALE_PRIO(x, prio) \ -- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) -- --static unsigned int static_prio_timeslice(int static_prio) --{ -- if (static_prio < NICE_TO_PRIO(0)) -- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); -- else -- return SCALE_PRIO(DEF_TIMESLICE, static_prio); --} -- --#ifdef CONFIG_SMP --/* -- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) -- * Since cpu_power is a 'constant', we can use a reciprocal divide. -+#define MS_TO_NS(TIME) ((TIME) * 1000000) -+#define MS_TO_US(TIME) ((TIME) * 1000) -+#define US_TO_MS(TIME) ((TIME) / 1000) -+ -+#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) -+ -+/* -+ * This is the time all tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. -+ * sched_iso_period - sysctl which determines the number of seconds over -+ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are -+ * exceeding their allowable bandwidth. -+*/ -+int sched_iso_cpu __read_mostly = 80; -+int sched_iso_period __read_mostly = 5; -+ -+#define ISO_PERIOD ((sched_iso_period * HZ) + 1) -+ -+/* -+ * This contains a bitmap for each dynamic priority level with empty slots -+ * for the valid priorities each different nice level can have. 
It allows -+ * us to stagger the slots where differing priorities run in a way that -+ * keeps latency differences between different nice levels at a minimum. -+ * The purpose of a pre-generated matrix is for rapid lookup of next slot in -+ * O(1) time without having to recalculate every time priority gets demoted. -+ * All nice levels use priority slot 39 as this allows less niced tasks to -+ * get all priority slots better than that before expiration is forced. -+ * ie, where 0 means a slot for that priority, priority running from left to -+ * right is from prio 0 to prio 39: -+ * nice -20 0000000000000000000000000000000000000000 -+ * nice -10 1000100010001000100010001000100010010000 -+ * nice 0 1010101010101010101010101010101010101010 -+ * nice 5 1011010110110101101101011011010110110110 -+ * nice 10 1110111011101110111011101110111011101110 -+ * nice 15 1111111011111110111111101111111011111110 -+ * nice 19 1111111111111111111111111111111111111110 - */ --static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) --{ -- return reciprocal_divide(load, sg->reciprocal_cpu_power); --} -+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] -+ __read_mostly; - --/* -- * Each time a sched group cpu_power is changed, -- * we must compute its reciprocal value -- */ --static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) --{ -- sg->__cpu_power += val; -- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); --} --#endif -+struct rq; - - /* -- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] -- * to time slice values: [800ms ... 100ms ... 5ms] -- * -- * The higher a thread's priority, the bigger timeslices -- * it gets during one round of execution. But even the lowest -- * priority thread gets MIN_TIMESLICE worth of execution time. -+ * These are the runqueue data structures: - */ -+struct prio_array { -+ /* Tasks queued at each priority */ -+ struct list_head queue[MAX_PRIO + 1]; - --static inline unsigned int task_timeslice(struct task_struct *p) --{ -- return static_prio_timeslice(p->static_prio); --} -+ /* -+ * The bitmap of priorities queued for this array. While the expired -+ * array will never have realtime tasks on it, it is simpler to have -+ * equal sized bitmaps for a cheap array swap. Include 1 bit for -+ * delimiter. -+ */ -+ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); - --/* -- * These are the runqueue data structures: -- */ -+ /* -+ * The best static priority (of the dynamic priority tasks) queued -+ * this array. -+ */ -+ int best_static_prio; - --struct prio_array { -- unsigned int nr_active; -- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ -- struct list_head queue[MAX_PRIO]; -+#ifdef CONFIG_SMP -+ /* For convenience looks back at rq */ -+ struct rq *rq; -+#endif - }; - - /* -@@ -260,14 +200,28 @@ - */ - unsigned long nr_uninterruptible; - -- unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ - unsigned long long most_recent_timestamp; - struct task_struct *curr, *idle; - unsigned long next_balance; - struct mm_struct *prev_mm; -- struct prio_array *active, *expired, arrays[2]; -- int best_expired_prio; -+ -+ struct prio_array *active, *expired, *idleprio, arrays[2]; -+ unsigned long *dyn_bitmap, *exp_bitmap; -+ -+ /* -+ * The current dynamic priority level this runqueue is at per static -+ * priority level. 
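The prio_matrix rows quoted above give nice level u (0 for nice -20 up to 39 for nice +19) 40 - u usable slots spread evenly over the 40 dynamic priorities, with slot 39 kept by every level. The patch's own generator is not part of this hunk; the sketch below reproduces only those proportions with a simple even-spread rule, so individual slot positions may differ from the table shown.

#include <stdio.h>

#define PRIO_RANGE 40

int main(void)
{
	int u, p;

	for (u = 0; u < PRIO_RANGE; u += 5) {
		int usable = PRIO_RANGE - u;   /* slots this nice level may use */

		printf("nice %3d  ", u - 20);
		for (p = 0; p < PRIO_RANGE; p++) {
			/* Mark slot p usable when the running share of
			 * usable slots advances at this position. */
			int slot = ((p + 1) * usable / PRIO_RANGE) >
				   (p * usable / PRIO_RANGE);

			/* As in the patch comment, '0' means a usable slot. */
			putchar(slot ? '0' : '1');
		}
		putchar('\n');
	}
	return 0;
}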
-+ */ -+ int prio_level[PRIO_RANGE]; -+ -+ /* How many times we have rotated the priority queue */ -+ unsigned long prio_rotation; -+ unsigned long iso_ticks; -+ unsigned short iso_refractory; -+ -+ /* Number of idleprio tasks running */ -+ unsigned long nr_idleprio; - atomic_t nr_iowait; - - #ifdef CONFIG_SMP -@@ -606,12 +560,9 @@ - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - /* - * Called when a process is dequeued from the active array and given -- * the cpu. We should note that with the exception of interactive -- * tasks, the expired queue will become the active queue after the active -- * queue is empty, without explicitly dequeuing and requeuing tasks in the -- * expired queue. (Interactive tasks may be requeued directly to the -- * active queue, thus delaying tasks in the expired queue from running; -- * see scheduler_tick()). -+ * the cpu. We should note that the expired queue will become the active -+ * queue after the active queue is empty, without explicitly dequeuing and -+ * requeuing tasks in the expired queue. - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple -@@ -709,71 +660,304 @@ - #define sched_info_switch(t, next) do { } while (0) - #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ - -+static int idleprio_suitable(struct task_struct *p) -+{ -+ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && -+ !(p->flags & (PF_NONSLEEP | PF_EXITING))); -+} -+ -+static int idleprio(const struct task_struct *p) -+{ -+ return (p->prio == MAX_PRIO); -+} -+ -+static inline int task_queued(struct task_struct *task) -+{ -+ return !list_empty(&task->run_list); -+} -+ -+static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) -+{ -+ __set_bit(p->prio, p->array->prio_bitmap); -+} -+ - /* -- * Adding/removing a task to/from a priority array: -+ * Removing from a runqueue. - */ --static void dequeue_task(struct task_struct *p, struct prio_array *array) -+static void dequeue_task(struct task_struct *p, struct rq *rq) - { -- array->nr_active--; -- list_del(&p->run_list); -- if (list_empty(array->queue + p->prio)) -- __clear_bit(p->prio, array->bitmap); -+ list_del_init(&p->run_list); -+ if (idleprio_task(p) && idleprio(p)) -+ rq->nr_idleprio--; -+ else if (list_empty(p->array->queue + p->prio)) -+ __clear_bit(p->prio, p->array->prio_bitmap); - } - --static void enqueue_task(struct task_struct *p, struct prio_array *array) -+static void reset_first_time_slice(struct task_struct *p) - { -- sched_info_queued(p); -- list_add_tail(&p->run_list, array->queue + p->prio); -- __set_bit(p->prio, array->bitmap); -- array->nr_active++; -+ if (unlikely(p->first_time_slice)) -+ p->first_time_slice = 0; -+} -+ -+/* -+ * The task is being queued on a fresh array so it has its entitlement -+ * bitmap cleared. 
-+ */ -+static void task_new_array(struct task_struct *p, struct rq *rq, -+ struct prio_array *array) -+{ -+ bitmap_zero(p->bitmap, PRIO_RANGE); -+ p->rotation = rq->prio_rotation; -+ p->time_slice = p->quota; - p->array = array; -+ reset_first_time_slice(p); -+} -+ -+/* Find the first slot from the relevant prio_matrix entry */ -+static int first_prio_slot(struct task_struct *p) -+{ -+ if (unlikely(p->policy == SCHED_BATCH)) -+ return p->static_prio; -+ return SCHED_PRIO(find_first_zero_bit( -+ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); - } - - /* -- * Put task to the end of the run list without the overhead of dequeue -- * followed by enqueue. -+ * In sched_interactive mode priority allocation occurs per process per rq -+ * array swap. In !sched_interactive mode all waking tasks must obey the -+ * current prio level of all other tasks running per array swap. - */ --static void requeue_task(struct task_struct *p, struct prio_array *array) -+static int minprio(struct rq *rq, int uprio) - { -- list_move_tail(&p->run_list, array->queue + p->prio); -+ if (sched_interactive) -+ return MAX_RT_PRIO; -+ return rq->prio_level[uprio]; - } - --static inline void --enqueue_task_head(struct task_struct *p, struct prio_array *array) -+/* -+ * Find the first unused slot by this task that is also in its prio_matrix -+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take -+ * priority slots from their static_prio and above. -+ */ -+static int next_entitled_slot(struct task_struct *p, struct rq *rq) - { -- list_add(&p->run_list, array->queue + p->prio); -- __set_bit(p->prio, array->bitmap); -- array->nr_active++; -- p->array = array; -+ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); -+ struct prio_array *array = rq->active; -+ DECLARE_BITMAP(tmp, PRIO_RANGE); -+ -+ /* -+ * Go straight to expiration if there are higher priority tasks -+ * already expired. -+ */ -+ if (p->static_prio > rq->expired->best_static_prio) -+ return MAX_PRIO; -+ if (!rq->prio_level[uprio]) -+ rq->prio_level[uprio] = MAX_RT_PRIO; -+ /* -+ * Only priorities equal to the prio_level and above for their -+ * static_prio are acceptable, and only if it's not better than -+ * a queued better static_prio's prio_level. 
-+ */ -+ if (p->static_prio < array->best_static_prio) { -+ if (likely(p->policy != SCHED_BATCH)) -+ array->best_static_prio = p->static_prio; -+ } else if (p->static_prio == array->best_static_prio) { -+ search_prio = minprio(rq, uprio); -+ } else { -+ int i; -+ -+ search_prio = minprio(rq, uprio); -+ /* A bound O(n) function, worst case n is 40 */ -+ for (i = array->best_static_prio; i <= p->static_prio ; i++) { -+ if (!rq->prio_level[USER_PRIO(i)]) -+ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; -+ search_prio = max(search_prio, -+ rq->prio_level[USER_PRIO(i)]); -+ } -+ } -+ if (unlikely(p->policy == SCHED_BATCH)) { -+ search_prio = max(search_prio, p->static_prio); -+ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, -+ USER_PRIO(search_prio))); -+ } -+ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); -+ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, -+ USER_PRIO(search_prio))); -+} -+ -+static void queue_expired(struct task_struct *p, struct rq *rq) -+{ -+ task_new_array(p, rq, rq->expired); -+ p->prio = p->normal_prio = first_prio_slot(p); -+ if (p->static_prio < rq->expired->best_static_prio) -+ rq->expired->best_static_prio = p->static_prio; -+ reset_first_time_slice(p); - } - -+#ifdef CONFIG_SMP - /* -- * __normal_prio - return the priority that is based on the static -- * priority but is modified by bonuses/penalties. -- * -- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] -- * into the -5 ... 0 ... +5 bonus/penalty range. -- * -- * We use 25% of the full 0...39 priority range so that: -- * -- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. -- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. -- * -- * Both properties are important to certain workloads. -+ * If we're waking up a task that was previously on a different runqueue, -+ * update its data appropriately. Note we may be reading data from src_rq-> -+ * outside of lock, but the occasional inaccurate result should be harmless. - */ -+ static void update_if_moved(struct task_struct *p, struct rq *rq) -+{ -+ struct rq *src_rq = p->array->rq; -+ -+ if (src_rq == rq) -+ return; -+ /* -+ * Only need to set p->array when p->rotation == rq->prio_rotation as -+ * they will be set in recalc_task_prio when != rq->prio_rotation. -+ */ -+ if (p->rotation == src_rq->prio_rotation) { -+ p->rotation = rq->prio_rotation; -+ if (p->array == src_rq->expired) -+ p->array = rq->expired; -+ else -+ p->array = rq->active; -+ } else -+ p->rotation = 0; -+} -+#else -+static inline void update_if_moved(struct task_struct *p, struct rq *rq) -+{ -+} -+#endif - --static inline int __normal_prio(struct task_struct *p) -+static inline int isoprio_suitable(struct task_struct *p) - { -- int bonus, prio; -+ return !(p->flags & PF_ISOREF); -+} - -- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; -+static int task_timeslice(struct task_struct *p); - -- prio = p->static_prio - bonus; -- if (prio < MAX_RT_PRIO) -- prio = MAX_RT_PRIO; -- if (prio > MAX_PRIO-1) -- prio = MAX_PRIO-1; -- return prio; -+/* -+ * recalc_task_prio determines what priority a non rt_task will be -+ * queued at. If the task has already been running during this runqueue's -+ * major rotation (rq->prio_rotation) then it continues at the same -+ * priority if it has tick entitlement left. If it does not have entitlement -+ * left, it finds the next priority slot according to its nice value that it -+ * has not extracted quota from. 
If it has not run during this major -+ * rotation, it starts at the next_entitled_slot and has its bitmap quota -+ * cleared. If it does not have any slots left it has all its slots reset and -+ * is queued on the expired at its first_prio_slot. -+ */ -+static void recalc_task_prio(struct task_struct *p, struct rq *rq) -+{ -+ struct prio_array *array = rq->active; -+ int queue_prio; -+ -+ if (iso_task(p)) { -+ if (isoprio_suitable(p)) { -+ /* -+ * If SCHED_ISO tasks have not used up their real time -+ * quota they have run just better than highest -+ * SCHED_NORMAL priority. Otherwise they run as -+ * SCHED_NORMAL. -+ */ -+ p->prio = p->normal_prio = ISO_PRIO; -+ p->array = rq->active; -+ if (p->time_slice <= 0) -+ p->time_slice = p->quota; -+ return; -+ } else if (p->prio == ISO_PRIO) { -+ /* Just about to be demoted to SCHED_NORMAL */ -+ p->time_slice = 0; -+ } -+ } else if (idleprio_task(p)) { -+ if (idleprio_suitable(p)) { -+ /* -+ * If suitable idleprio_tasks are queued at MAX_PRIO -+ * only on the idleprio array. Their time_slice is -+ * their full task_timeslice as they cooperatively -+ * multitask. -+ */ -+ p->prio = p->normal_prio = MAX_PRIO; -+ p->array = rq->idleprio; -+ if (p->time_slice <= 0) -+ p->time_slice = task_timeslice(p); -+ return; -+ } -+ /* -+ * If unsuitable idleprio_tasks are queued equivalent to -+ * nice 19 tasks on the expired array. -+ */ -+ p->flags &= ~PF_NONSLEEP; -+ p->prio = p->normal_prio = MAX_PRIO - 1; -+ p->array = rq->expired; -+ if (p->time_slice <= 0 || p->time_slice > p->quota) -+ p->time_slice = p->quota; -+ return; -+ } -+ -+ update_if_moved(p, rq); -+ if (p->rotation == rq->prio_rotation) { -+ if (p->array == array) { -+ if (p->time_slice > 0) -+ return; -+ p->time_slice = p->quota; -+ } else if (p->array == rq->expired) { -+ queue_expired(p, rq); -+ return; -+ } else -+ task_new_array(p, rq, array); -+ } else -+ task_new_array(p, rq, array); -+ -+ queue_prio = next_entitled_slot(p, rq); -+ if (queue_prio >= MAX_PRIO) { -+ queue_expired(p, rq); -+ return; -+ } -+ p->prio = p->normal_prio = queue_prio; -+ __set_bit(USER_PRIO(p->prio), p->bitmap); -+} -+ -+/* -+ * Adding to a runqueue. The dynamic priority queue that it is added to is -+ * determined by recalc_task_prio() above. -+ */ -+static inline void __enqueue_task(struct task_struct *p, struct rq *rq) -+{ -+ if (rt_task(p)) -+ p->array = rq->active; -+ else -+ recalc_task_prio(p, rq); -+ -+ if (idleprio_task(p) && idleprio(p)) -+ rq->nr_idleprio++; -+ sched_info_queued(p); -+ set_dynamic_bit(p, rq); -+} -+ -+static void enqueue_task(struct task_struct *p, struct rq *rq) -+{ -+ __enqueue_task(p, rq); -+ list_add_tail(&p->run_list, p->array->queue + p->prio); -+} -+ -+static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) -+{ -+ __enqueue_task(p, rq); -+ list_add(&p->run_list, p->array->queue + p->prio); -+} -+ -+/* -+ * requeue_task is only called when p->static_prio does not change. p->prio -+ * can change with dynamic tasks. 
-+ */ -+static void requeue_task(struct task_struct *p, struct rq *rq, -+ struct prio_array *old_array, int old_prio) -+{ -+ if (p->array == rq->expired) -+ queue_expired(p, rq); -+ list_move_tail(&p->run_list, p->array->queue + p->prio); -+ if (!rt_task(p)) { -+ if (list_empty(old_array->queue + old_prio)) -+ __clear_bit(old_prio, old_array->prio_bitmap); -+ set_dynamic_bit(p, rq); -+ } - } - - /* -@@ -786,20 +970,29 @@ - */ - - /* -- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE -- * If static_prio_timeslice() is ever changed to break this assumption then -- * this code will need modification -- */ --#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE --#define LOAD_WEIGHT(lp) \ -- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) --#define PRIO_TO_LOAD_WEIGHT(prio) \ -- LOAD_WEIGHT(static_prio_timeslice(prio)) --#define RTPRIO_TO_LOAD_WEIGHT(rp) \ -- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) -+ * task_timeslice - the total duration a task can run during one major -+ * rotation. Returns value in milliseconds as the smallest value can be 1. -+ */ -+static int task_timeslice(struct task_struct *p) -+{ -+ int slice = p->quota; /* quota is in us */ -+ -+ if (!rt_task(p)) -+ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; -+ return US_TO_MS(slice); -+} -+ -+/* -+ * The load weight is basically the task_timeslice in ms. Realtime tasks are -+ * special cased to be proportionately larger than nice -20 by their -+ * rt_priority. The weight for rt tasks can only be arbitrary at best. -+ */ -+#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) - - static void set_load_weight(struct task_struct *p) - { -+ int load_weight; -+ - if (has_rt_policy(p)) { - #ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) -@@ -808,12 +1001,19 @@ - * Giving its load any weight will skew balancing - * adversely. - */ -- p->load_weight = 0; -+ load_weight = 0; - else - #endif -- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); -+ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else -- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); -+ load_weight = task_timeslice(p); -+ /* -+ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but -+ * still need to be weighted to allow balancing to occur. -+ */ -+ if (likely(!idleprio_task(p))) -+ load_weight *= PRIO_RANGE; -+ p->load_weight = load_weight; - } - - static inline void -@@ -841,28 +1041,38 @@ - } - - /* -- * Calculate the expected normal priority: i.e. priority -- * without taking RT-inheritance into account. Might be -- * boosted by interactivity modifiers. Changes upon fork, -- * setprio syscalls, and whenever the interactivity -- * estimator recalculates. -+ * __activate_task - move a task to the runqueue. - */ --static inline int normal_prio(struct task_struct *p) -+static inline void __activate_task(struct task_struct *p, struct rq *rq) - { -- int prio; -+ enqueue_task(p, rq); -+ inc_nr_running(p, rq); -+} - -+/* -+ * __activate_idle_task - move idle task to the _front_ of runqueue. 
-+ */ -+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task_head(p, rq); -+ inc_nr_running(p, rq); -+} -+ -+static inline int normal_prio(struct task_struct *p) -+{ - if (has_rt_policy(p)) -- prio = MAX_RT_PRIO-1 - p->rt_priority; -+ return MAX_RT_PRIO-1 - p->rt_priority; -+ /* Other tasks all have normal_prio set in recalc_task_prio */ -+ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) -+ return p->prio; - else -- prio = __normal_prio(p); -- return prio; -+ return p->static_prio; - } - - /* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might -- * be boosted by RT tasks, or might be boosted by -- * interactivity modifiers. Will be RT if the task got -+ * be boosted by RT tasks as it will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ - static int effective_prio(struct task_struct *p) -@@ -878,112 +1088,70 @@ - return p->prio; - } - --/* -- * __activate_task - move a task to the runqueue. -- */ --static void __activate_task(struct task_struct *p, struct rq *rq) -+static inline unsigned int nice_quota_ms(int nice) - { -- struct prio_array *target = rq->active; -+ unsigned int rr = rr_interval; - -- if (batch_task(p)) -- target = rq->expired; -- enqueue_task(p, target); -- inc_nr_running(p, rq); -+ if (nice < -6) { -+ rr *= nice * nice; -+ rr /= 40; -+ } else if (nice > 0) -+ rr = rr / 2 ? : 1; -+ return rr; - } - -+#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE) -+ - /* -- * __activate_idle_task - move idle task to the _front_ of runqueue. -+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of -+ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a -+ * task of nice 0 or enough lower priority tasks to bring up the -+ * weighted_cpuload - */ --static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) -+int above_background_load(void) - { -- enqueue_task_head(p, rq->active); -- inc_nr_running(p, rq); -+ unsigned long cpu; -+ -+ for_each_online_cpu(cpu) { -+ if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT) -+ return 1; -+ } -+ return 0; - } - - /* -- * Recalculate p->normal_prio and p->prio after having slept, -- * updating the sleep-average too: -+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. -+ * From nice 1 to 19 they are smaller than it only if they are at least one -+ * tick still. Below nice 0 they get progressively larger. -+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval -+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. -+ * Value returned is in microseconds. - */ --static int recalc_task_prio(struct task_struct *p, unsigned long long now) -+static inline unsigned int rr_quota(struct task_struct *p) - { -- /* Caller must always ensure 'now >= p->timestamp' */ -- unsigned long sleep_time = now - p->timestamp; -+ unsigned int quota; - -- if (batch_task(p)) -- sleep_time = 0; -- -- if (likely(sleep_time > 0)) { -- /* -- * This ceiling is set to the lowest priority that would allow -- * a task to be reinserted into the active array on timeslice -- * completion. -- */ -- unsigned long ceiling = INTERACTIVE_SLEEP(p); -- -- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { -- /* -- * Prevents user tasks from achieving best priority -- * with one single large enough sleep. 
-- */ -- p->sleep_avg = ceiling; -- /* -- * Using INTERACTIVE_SLEEP() as a ceiling places a -- * nice(0) task 1ms sleep away from promotion, and -- * gives it 700ms to round-robin with no chance of -- * being demoted. This is more than generous, so -- * mark this sleep as non-interactive to prevent the -- * on-runqueue bonus logic from intervening should -- * this task not receive cpu immediately. -- */ -- p->sleep_type = SLEEP_NONINTERACTIVE; -- } else { -- /* -- * Tasks waking from uninterruptible sleep are -- * limited in their sleep_avg rise as they -- * are likely to be waiting on I/O -- */ -- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { -- if (p->sleep_avg >= ceiling) -- sleep_time = 0; -- else if (p->sleep_avg + sleep_time >= -- ceiling) { -- p->sleep_avg = ceiling; -- sleep_time = 0; -- } -- } -- -- /* -- * This code gives a bonus to interactive tasks. -- * -- * The boost works by updating the 'average sleep time' -- * value here, based on ->timestamp. The more time a -- * task spends sleeping, the higher the average gets - -- * and the higher the priority boost gets as well. -- */ -- p->sleep_avg += sleep_time; -- -- } -- if (p->sleep_avg > NS_MAX_SLEEP_AVG) -- p->sleep_avg = NS_MAX_SLEEP_AVG; -- } -+ if (rt_task(p)) -+ quota = rr_interval; -+ else -+ quota = nice_quota_ms(TASK_NICE(p)); -+ return MS_TO_US(quota); -+} - -- return effective_prio(p); -+/* Every time we set the quota we need to set the load weight */ -+static void set_quota(struct task_struct *p) -+{ -+ p->quota = rr_quota(p); -+ set_load_weight(p); - } - - /* - * activate_task - move a task to the runqueue and do priority recalculation -- * -- * Update all the scheduling statistics stuff. (sleep average -- * calculation, priority modifiers, etc.) - */ - static void activate_task(struct task_struct *p, struct rq *rq, int local) - { -- unsigned long long now; -- -- if (rt_task(p)) -- goto out; -+ unsigned long long now = sched_clock(); - -- now = sched_clock(); - #ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ -@@ -1004,32 +1172,9 @@ - (now - p->timestamp) >> 20); - } - -- p->prio = recalc_task_prio(p, now); -- -- /* -- * This checks to make sure it's not an uninterruptible task -- * that is now waking up. -- */ -- if (p->sleep_type == SLEEP_NORMAL) { -- /* -- * Tasks which were woken up by interrupts (ie. hw events) -- * are most likely of interactive nature. So we give them -- * the credit of extending their sleep time to the period -- * of time they spend on the runqueue, waiting for execution -- * on a CPU, first time around: -- */ -- if (in_interrupt()) -- p->sleep_type = SLEEP_INTERRUPTED; -- else { -- /* -- * Normal first-time wakeups get a credit too for -- * on-runqueue time, but it will be weighted down: -- */ -- p->sleep_type = SLEEP_INTERACTIVE; -- } -- } -+ set_quota(p); -+ p->prio = effective_prio(p); - p->timestamp = now; --out: - __activate_task(p, rq); - } - -@@ -1039,8 +1184,7 @@ - static void deactivate_task(struct task_struct *p, struct rq *rq) - { - dec_nr_running(p, rq); -- dequeue_task(p, p->array); -- p->array = NULL; -+ dequeue_task(p, rq); - } - - /* -@@ -1133,7 +1277,7 @@ - * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. 
- */ -- if (!p->array && !task_running(rq, p)) { -+ if (!task_queued(p) && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); - return 0; - } -@@ -1159,7 +1303,6 @@ - { - unsigned long flags; - struct rq *rq; -- struct prio_array *array; - int running; - - repeat: -@@ -1192,7 +1335,6 @@ - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); -- array = p->array; - task_rq_unlock(rq, &flags); - - /* -@@ -1215,7 +1357,7 @@ - * running right now), it's preempted, and we should - * yield - it could be a while. - */ -- if (unlikely(array)) { -+ if (unlikely(task_queued(p))) { - yield(); - goto repeat; - } -@@ -1294,6 +1436,25 @@ - } - - /* -+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) -+ * Since cpu_power is a 'constant', we can use a reciprocal divide. -+ */ -+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -+{ -+ return reciprocal_divide(load, sg->reciprocal_cpu_power); -+} -+ -+/* -+ * Each time a sched group cpu_power is changed, -+ * we must compute its reciprocal value -+ */ -+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -+{ -+ sg->__cpu_power += val; -+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -+} -+ -+/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -@@ -1490,6 +1651,31 @@ - } - #endif - -+/* -+ * We need to have a special definition for an idle runqueue when testing -+ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as -+ * a realtime task in sched_idle_next. -+ */ -+#ifdef CONFIG_HOTPLUG_CPU -+#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) -+#else -+#define rq_idle(rq) ((rq)->curr == (rq)->idle) -+#endif -+ -+static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ -+ return ((p->array == task_rq(p)->active && -+ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); -+} -+ -+static inline void try_preempt(struct task_struct *p, struct rq *rq) -+{ -+ if (task_preempts_curr(p, rq)) -+ resched_task(rq->curr); -+} -+ - /*** - * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread -@@ -1521,7 +1707,7 @@ - if (!(old_state & state)) - goto out; - -- if (p->array) -+ if (task_queued(p)) - goto out_running; - - cpu = task_cpu(p); -@@ -1614,7 +1800,7 @@ - old_state = p->state; - if (!(old_state & state)) - goto out; -- if (p->array) -+ if (task_queued(p)) - goto out_running; - - this_cpu = smp_processor_id(); -@@ -1623,25 +1809,9 @@ - - out_activate: - #endif /* CONFIG_SMP */ -- if (old_state == TASK_UNINTERRUPTIBLE) { -+ if (old_state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; -- /* -- * Tasks on involuntary sleep don't earn -- * sleep_avg beyond just interactive state. -- */ -- p->sleep_type = SLEEP_NONINTERACTIVE; -- } else -- -- /* -- * Tasks that have marked their sleep as noninteractive get -- * woken up with their sleep average not weighted in an -- * interactive way. -- */ -- if (old_state & TASK_NONINTERACTIVE) -- p->sleep_type = SLEEP_NONINTERACTIVE; -- - -- activate_task(p, rq, cpu == this_cpu); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) -@@ -1650,15 +1820,22 @@ - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) 
- */ -- if (!sync || cpu != this_cpu) { -- if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -- } -+ activate_task(p, rq, cpu == this_cpu); -+ if (!sync || cpu != this_cpu) -+ try_preempt(p, rq); - success = 1; - - out_running: - p->state = TASK_RUNNING; - out: -+ /* -+ * Special case when freezing we need to reschedule idleprio tasks -+ * as SCHED_NORMAL or else they'll never freeze -+ */ -+ if (idleprio_task(p) && freezing(p) && idleprio(p)) { -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); -+ } - task_rq_unlock(rq, &flags); - - return success; -@@ -1676,7 +1853,6 @@ - return try_to_wake_up(p, state, 0); - } - --static void task_running_tick(struct rq *rq, struct task_struct *p); - /* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. -@@ -1704,7 +1880,6 @@ - p->prio = current->normal_prio; - - INIT_LIST_HEAD(&p->run_list); -- p->array = NULL; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (unlikely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -@@ -1716,30 +1891,31 @@ - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; - #endif -+ if (unlikely(p->policy == SCHED_FIFO)) -+ goto out; - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); -- p->time_slice = (current->time_slice + 1) >> 1; -- /* -- * The remainder of the first timeslice might be recovered by -- * the parent if the child exits early enough. -- */ -- p->first_time_slice = 1; -- current->time_slice >>= 1; -- p->timestamp = sched_clock(); -- if (unlikely(!current->time_slice)) { -+ if (current->time_slice > 0) { -+ current->time_slice /= 2; -+ if (current->time_slice) -+ p->time_slice = current->time_slice; -+ else -+ p->time_slice = 1; - /* -- * This case is rare, it happens when the parent has only -- * a single jiffy left from its timeslice. Taking the -- * runqueue lock is not a problem. -+ * The remainder of the first timeslice might be recovered by -+ * the parent if the child exits early enough. - */ -- current->time_slice = 1; -- task_running_tick(cpu_rq(cpu), current); -- } -+ p->first_time_slice = 1; -+ } else -+ p->time_slice = 0; -+ -+ p->timestamp = sched_clock(); - local_irq_enable(); -+out: - put_cpu(); - } - -@@ -1761,38 +1937,16 @@ - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - -- /* -- * We decrease the sleep average of forking parents -- * and children as well, to keep max-interactive tasks -- * from forking tasks that are max-interactive. The parent -- * (current) is done further down, under its lock. -- */ -- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * -- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); -- -- p->prio = effective_prio(p); -- - if (likely(cpu == this_cpu)) { -+ activate_task(p, rq, 1); - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. 
- */ -- if (unlikely(!current->array)) -- __activate_task(p, rq); -- else { -- p->prio = current->prio; -- p->normal_prio = current->normal_prio; -- list_add_tail(&p->run_list, ¤t->run_list); -- p->array = current->array; -- p->array->nr_active++; -- inc_nr_running(p, rq); -- } - set_need_resched(); -- } else -- /* Run child last */ -- __activate_task(p, rq); -+ } - /* - * We skip the following code due to cpu == this_cpu - * -@@ -1809,19 +1963,16 @@ - */ - p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) - + rq->most_recent_timestamp; -- __activate_task(p, rq); -- if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ activate_task(p, rq, 0); -+ try_preempt(p, rq); - - /* - * Parent and child are on different CPUs, now get the -- * parent runqueue to update the parent's ->sleep_avg: -+ * parent runqueue to update the parent's ->flags: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } -- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * -- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); - } - -@@ -1836,23 +1987,17 @@ - */ - void fastcall sched_exit(struct task_struct *p) - { -+ struct task_struct *parent; - unsigned long flags; - struct rq *rq; - -- /* -- * If the child was a (relative-) CPU hog then decrease -- * the sleep_avg of the parent as well. -- */ -- rq = task_rq_lock(p->parent, &flags); -- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { -- p->parent->time_slice += p->time_slice; -- if (unlikely(p->parent->time_slice > task_timeslice(p))) -- p->parent->time_slice = task_timeslice(p); -- } -- if (p->sleep_avg < p->parent->sleep_avg) -- p->parent->sleep_avg = p->parent->sleep_avg / -- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / -- (EXIT_WEIGHT + 1); -+ parent = p->parent; -+ rq = task_rq_lock(parent, &flags); -+ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { -+ parent->time_slice += p->time_slice; -+ if (unlikely(parent->time_slice > parent->quota)) -+ parent->time_slice = parent->quota; -+ } - task_rq_unlock(rq, &flags); - } - -@@ -2184,23 +2329,17 @@ - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. - */ --static void pull_task(struct rq *src_rq, struct prio_array *src_array, -- struct task_struct *p, struct rq *this_rq, -- struct prio_array *this_array, int this_cpu) -+static void pull_task(struct rq *src_rq, struct task_struct *p, -+ struct rq *this_rq, int this_cpu) - { -- dequeue_task(p, src_array); -+ dequeue_task(p, src_rq); - dec_nr_running(p, src_rq); - set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); -- enqueue_task(p, this_array); -+ enqueue_task(p, this_rq); - p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) - + this_rq->most_recent_timestamp; -- /* -- * Note that idle threads have a prio of MAX_PRIO, for this test -- * to be always true for them. 
-- */ -- if (TASK_PREEMPTS_CURR(p, this_rq)) -- resched_task(this_rq->curr); -+ try_preempt(p, this_rq); - } - - /* -@@ -2243,7 +2382,16 @@ - return 1; - } - --#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) -+static inline int rq_best_prio(struct rq *rq) -+{ -+ int best_prio, exp_prio; -+ -+ best_prio = sched_find_first_bit(rq->dyn_bitmap); -+ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ if (unlikely(best_prio > exp_prio)) -+ best_prio = exp_prio; -+ return best_prio; -+} - - /* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted -@@ -2259,7 +2407,7 @@ - { - int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, - best_prio_seen, skip_for_load; -- struct prio_array *array, *dst_array; -+ struct prio_array *array; - struct list_head *head, *curr; - struct task_struct *tmp; - long rem_load_move; -@@ -2286,31 +2434,29 @@ - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ -- if (busiest->expired->nr_active) { -- array = busiest->expired; -- dst_array = this_rq->expired; -- } else { -- array = busiest->active; -- dst_array = this_rq->active; -- } -- -+ array = busiest->expired; - new_array: -- /* Start searching at priority 0: */ -- idx = 0; -+ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ -+ if (array == busiest->expired) -+ idx = MAX_RT_PRIO; -+ else -+ idx = 0; - skip_bitmap: - if (!idx) -- idx = sched_find_first_bit(array->bitmap); -+ idx = sched_find_first_bit(array->prio_bitmap); - else -- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); -- if (idx >= MAX_PRIO) { -- if (array == busiest->expired && busiest->active->nr_active) { -+ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); -+ if (idx == MAX_PRIO) { -+ if (array == busiest->idleprio && busiest->nr_idleprio) -+ goto found_idleprio; -+ if (array == busiest->expired) { - array = busiest->active; -- dst_array = this_rq->active; - goto new_array; - } - goto out; - } - -+found_idleprio: - head = array->queue + idx; - curr = head->prev; - skip_queue: -@@ -2332,11 +2478,22 @@ - best_prio_seen |= idx == best_prio; - if (curr != head) - goto skip_queue; -+ if (idx == MAX_PRIO) { -+ /* -+ * Occurs either when balancing idleprio tasks or -+ * there really are no more tasks to find. -+ */ -+ if (array == busiest->expired) { -+ array = busiest->active; -+ goto new_array; -+ } -+ goto out; -+ } - idx++; - goto skip_bitmap; - } - -- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); -+ pull_task(busiest, tmp, this_rq, this_cpu); - pulled++; - rem_load_move -= tmp->load_weight; - -@@ -2349,6 +2506,13 @@ - this_best_prio = idx; - if (curr != head) - goto skip_queue; -+ if (idx == MAX_PRIO) { -+ if (array == busiest->expired) { -+ array = busiest->active; -+ goto new_array; -+ } -+ goto out; -+ } - idx++; - goto skip_bitmap; - } -@@ -3297,11 +3461,36 @@ - /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ * The value returned from sched_clock() occasionally gives bogus values so -+ * some sanity checking is required. 
- */ --static inline void --update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) -+static void -+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, -+ int tick) - { -- p->sched_time += now - p->last_ran; -+ long time_diff = now - p->last_ran; -+ -+ if (tick) { -+ /* -+ * Called from scheduler_tick() there should be less than two -+ * jiffies worth, and not negative/overflow. -+ */ -+ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) -+ time_diff = JIFFIES_TO_NS(1); -+ } else { -+ /* -+ * Called from context_switch there should be less than one -+ * jiffy worth, and not negative/overflow. There should be -+ * some time banked here so use a nominal 1us. -+ */ -+ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) -+ time_diff = 1000; -+ } -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p != rq->idle && p->policy != SCHED_FIFO) -+ p->time_slice -= time_diff / 1000; -+ p->sched_time += time_diff; - p->last_ran = rq->most_recent_timestamp = now; - } - -@@ -3322,27 +3511,6 @@ - } - - /* -- * We place interactive tasks back into the active array, if possible. -- * -- * To guarantee that this does not starve expired tasks we ignore the -- * interactivity of a task if the first expired task had to wait more -- * than a 'reasonable' amount of time. This deadline timeout is -- * load-dependent, as the frequency of array switched decreases with -- * increasing number of running tasks. We also ignore the interactivity -- * if a better static_prio task has expired: -- */ --static inline int expired_starving(struct rq *rq) --{ -- if (rq->curr->static_prio > rq->best_expired_prio) -- return 1; -- if (!STARVATION_LIMIT || !rq->expired_timestamp) -- return 0; -- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) -- return 1; -- return 0; --} -- --/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() -@@ -3357,7 +3525,7 @@ - - /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); -- if (TASK_NICE(p) > 0) -+ if (TASK_NICE(p) > 0 || idleprio_task(p)) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); -@@ -3415,87 +3583,94 @@ - cpustat->steal = cputime64_add(cpustat->steal, tmp); - } - --static void task_running_tick(struct rq *rq, struct task_struct *p) -+/* -+ * The task has used up its quota of running in this prio_level so it must be -+ * dropped a priority level, all managed by recalc_task_prio(). -+ */ -+static void task_expired_entitlement(struct rq *rq, struct task_struct *p) - { -- if (p->array != rq->active) { -- /* Task has expired but was not scheduled yet */ -- set_tsk_need_resched(p); -+ int overrun; -+ -+ reset_first_time_slice(p); -+ if (rt_task(p)) { -+ p->time_slice += p->quota; -+ list_move_tail(&p->run_list, p->array->queue + p->prio); - return; - } -- spin_lock(&rq->lock); -+ overrun = p->time_slice; -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); - /* -- * The task was running during this tick - update the -- * time slice counter. Note: we do not update a thread's -- * priority until it either goes to sleep or uses up its -- * timeslice. This makes it possible for interactive tasks -- * to use up their timeslices at their highest priority levels. -+ * Subtract any extra time this task ran over its time_slice; ie -+ * overrun will either be 0 or negative. 
- */ -- if (rt_task(p)) { -- /* -- * RR tasks need a special form of timeslice management. -- * FIFO tasks have no timeslices. -- */ -- if ((p->policy == SCHED_RR) && !--p->time_slice) { -- p->time_slice = task_timeslice(p); -- p->first_time_slice = 0; -- set_tsk_need_resched(p); -+ p->time_slice += overrun; -+} - -- /* put it at the end of the queue: */ -- requeue_task(p, rq->active); -- } -- goto out_unlock; -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. -+ */ -+static unsigned int test_ret_isorefractory(struct rq *rq) -+{ -+ if (likely(!rq->iso_refractory)) { -+ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) -+ rq->iso_refractory = 1; -+ } else { -+ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) -+ rq->iso_refractory = 0; - } -- if (!--p->time_slice) { -- dequeue_task(p, rq->active); -- set_tsk_need_resched(p); -- p->prio = effective_prio(p); -- p->time_slice = task_timeslice(p); -- p->first_time_slice = 0; -+ return rq->iso_refractory; -+} - -- if (!rq->expired_timestamp) -- rq->expired_timestamp = jiffies; -- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { -- enqueue_task(p, rq->expired); -- if (p->static_prio < rq->best_expired_prio) -- rq->best_expired_prio = p->static_prio; -- } else -- enqueue_task(p, rq->active); -- } else { -- /* -- * Prevent a too long timeslice allowing a task to monopolize -- * the CPU. We do this by splitting up the timeslice into -- * smaller pieces. -- * -- * Note: this does not mean the task's timeslices expire or -- * get lost in any way, they just might be preempted by -- * another task of equal priority. (one with higher -- * priority would have preempted this task already.) We -- * requeue this task to the end of the list on this priority -- * level, which is in essence a round-robin of tasks with -- * equal priority. -- * -- * This only applies to tasks in the interactive -- * delta range with at least TIMESLICE_GRANULARITY to requeue. -- */ -- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - -- p->time_slice) % TIMESLICE_GRANULARITY(p)) && -- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && -- (p->array == rq->active)) { -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+} - -- requeue_task(p, rq->active); -- set_tsk_need_resched(p); -- } -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq, struct task_struct *p) -+{ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { -+ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) -+ rq->iso_ticks += 100; -+ } else -+ no_iso_tick(rq); -+ -+ if (iso_task(p)) { -+ if (unlikely(test_ret_isorefractory(rq))) { -+ if (isoprio_suitable(p)) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Set the PF_ISOREF flag and -+ * force it to reschedule as SCHED_NORMAL -+ * by zeroing its time_slice -+ */ -+ p->flags |= PF_ISOREF; -+ p->time_slice = 0; -+ } -+ } else -+ p->flags &= ~PF_ISOREF; - } --out_unlock: -- spin_unlock(&rq->lock); -+ /* SCHED_FIFO tasks never run out of timeslice. 
*/ -+ if (p->time_slice > 0 || p->policy == SCHED_FIFO) -+ return; -+ /* p->time_slice <= 0 */ -+ set_tsk_need_resched(p); -+ if (likely(task_queued(p))) -+ task_expired_entitlement(rq, p); - } - - /* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. -- * -- * It also gets called by the fork code, when changing the parent's -- * timeslices. - */ - void scheduler_tick(void) - { -@@ -3505,10 +3680,14 @@ - int idle_at_tick = idle_cpu(cpu); - struct rq *rq = cpu_rq(cpu); - -- update_cpu_clock(p, rq, now); -+ update_cpu_clock(p, rq, now, 1); - -+ spin_lock(&rq->lock); - if (!idle_at_tick) - task_running_tick(rq, p); -+ else -+ no_iso_tick(rq); -+ spin_unlock(&rq->lock); - #ifdef CONFIG_SMP - update_load(rq); - rq->idle_at_tick = idle_at_tick; -@@ -3554,10 +3733,80 @@ - - #endif - --static inline int interactive_sleep(enum sleep_type sleep_type) -+static void reset_prio_levels(struct rq *rq) - { -- return (sleep_type == SLEEP_INTERACTIVE || -- sleep_type == SLEEP_INTERRUPTED); -+ rq->active->best_static_prio = MAX_PRIO - 1; -+ rq->expired->best_static_prio = MAX_PRIO - 1; -+ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); -+} -+ -+/* -+ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the -+ * idleprio array and if it isn't already active -+ */ -+static struct task_struct *next_idleprio_task(struct rq *rq) -+{ -+ struct prio_array *array = rq->active; -+ struct list_head *queue; -+ -+ if (array != rq->idleprio) { -+ rq->active = rq->idleprio; -+ rq->expired = array; -+ array = rq->active; -+ rq->exp_bitmap = rq->expired->prio_bitmap; -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ } -+ rq->prio_rotation++; -+ reset_prio_levels(rq); -+ queue = array->queue + MAX_PRIO; -+ return list_entry(queue->next, struct task_struct, run_list); -+} -+ -+/* -+ * next_dynamic_task finds the next suitable dynamic task. -+ */ -+static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) -+{ -+ struct prio_array *array = rq->active; -+ struct task_struct *next; -+ struct list_head *queue; -+ int nstatic; -+ -+retry: -+ if (unlikely(rq->nr_running == rq->nr_idleprio)) -+ return next_idleprio_task(rq); -+ if (idx >= MAX_PRIO) { -+ /* There are no more tasks in the active array. 
Swap arrays */ -+ array = rq->expired; -+ rq->expired = rq->active; -+ rq->active = array; -+ rq->exp_bitmap = rq->expired->prio_bitmap; -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ rq->prio_rotation++; -+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ reset_prio_levels(rq); -+ } -+ queue = array->queue + idx; -+ next = list_entry(queue->next, struct task_struct, run_list); -+ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && -+ isoprio_suitable(next)))) { -+ /* -+ * Unlucky enough that this task ran out of time_slice -+ * before it hit a scheduler_tick so it should have its -+ * priority reassessed and choose another task (possibly -+ * the same one) -+ */ -+ task_expired_entitlement(rq, next); -+ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); -+ goto retry; -+ } -+ next->rotation = rq->prio_rotation; -+ nstatic = next->static_prio; -+ if (nstatic < array->best_static_prio) -+ array->best_static_prio = nstatic; -+ if (idx > rq->prio_level[USER_PRIO(nstatic)]) -+ rq->prio_level[USER_PRIO(nstatic)] = idx; -+ return next; - } - - /* -@@ -3566,13 +3815,11 @@ - asmlinkage void __sched schedule(void) - { - struct task_struct *prev, *next; -- struct prio_array *array; - struct list_head *queue; - unsigned long long now; -- unsigned long run_time; -- int cpu, idx, new_prio; - long *switch_count; - struct rq *rq; -+ int cpu, idx; - - /* - * Test if we are atomic. Since do_exit() needs to call into -@@ -3608,18 +3855,6 @@ - - schedstat_inc(rq, sched_cnt); - now = sched_clock(); -- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { -- run_time = now - prev->timestamp; -- if (unlikely((long long)(now - prev->timestamp) < 0)) -- run_time = 0; -- } else -- run_time = NS_MAX_SLEEP_AVG; -- -- /* -- * Tasks charged proportionately less run_time at high sleep_avg to -- * delay them losing their interactive status -- */ -- run_time /= (CURRENT_BONUS(prev) ? : 1); - - spin_lock_irq(&rq->lock); - -@@ -3630,8 +3865,10 @@ - unlikely(signal_pending(prev)))) - prev->state = TASK_RUNNING; - else { -- if (prev->state == TASK_UNINTERRUPTIBLE) -+ if (prev->state == TASK_UNINTERRUPTIBLE) { -+ prev->flags |= PF_NONSLEEP; - rq->nr_uninterruptible++; -+ } - deactivate_task(prev, rq); - } - } -@@ -3641,59 +3878,29 @@ - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -- rq->expired_timestamp = 0; - goto switch_tasks; - } - } - -- array = rq->active; -- if (unlikely(!array->nr_active)) { -- /* -- * Switch the active and expired arrays. 
-- */ -- schedstat_inc(rq, sched_switch); -- rq->active = rq->expired; -- rq->expired = array; -- array = rq->active; -- rq->expired_timestamp = 0; -- rq->best_expired_prio = MAX_PRIO; -- } -- -- idx = sched_find_first_bit(array->bitmap); -- queue = array->queue + idx; -- next = list_entry(queue->next, struct task_struct, run_list); -- -- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { -- unsigned long long delta = now - next->timestamp; -- if (unlikely((long long)(now - next->timestamp) < 0)) -- delta = 0; -- -- if (next->sleep_type == SLEEP_INTERACTIVE) -- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; -- -- array = next->array; -- new_prio = recalc_task_prio(next, next->timestamp + delta); -- -- if (unlikely(next->prio != new_prio)) { -- dequeue_task(next, array); -- next->prio = new_prio; -- enqueue_task(next, array); -- } -+ idx = sched_find_first_bit(rq->dyn_bitmap); -+ if (likely(idx > ISO_PRIO)) -+ next = next_dynamic_task(rq, idx); -+ else { -+ queue = rq->active->queue + idx; -+ next = list_entry(queue->next, struct task_struct, run_list); - } -- next->sleep_type = SLEEP_NORMAL; - switch_tasks: -- if (next == rq->idle) -+ if (next == rq->idle) { -+ reset_prio_levels(rq); -+ rq->prio_rotation++; - schedstat_inc(rq, sched_goidle); -+ } - prefetch(next); - prefetch_stack(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - -- update_cpu_clock(prev, rq, now); -- -- prev->sleep_avg -= run_time; -- if ((long)prev->sleep_avg <= 0) -- prev->sleep_avg = 0; -+ update_cpu_clock(prev, rq, now, 0); - prev->timestamp = prev->last_ran = now; - - sched_info_switch(prev, next); -@@ -4129,29 +4336,22 @@ - */ - void rt_mutex_setprio(struct task_struct *p, int prio) - { -- struct prio_array *array; - unsigned long flags; -+ int queued, oldprio; - struct rq *rq; -- int oldprio; - - BUG_ON(prio < 0 || prio > MAX_PRIO); - - rq = task_rq_lock(p, &flags); - - oldprio = p->prio; -- array = p->array; -- if (array) -- dequeue_task(p, array); -+ queued = task_queued(p); -+ if (queued) -+ dequeue_task(p, rq); - p->prio = prio; - -- if (array) { -- /* -- * If changing to an RT priority then queue it -- * in the active array! 
-- */ -- if (rt_task(p)) -- array = rq->active; -- enqueue_task(p, array); -+ if (queued) { -+ enqueue_task(p, rq); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on -@@ -4160,8 +4360,8 @@ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); -- } else if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ } else -+ try_preempt(p, rq); - } - task_rq_unlock(rq, &flags); - } -@@ -4170,8 +4370,7 @@ - - void set_user_nice(struct task_struct *p, long nice) - { -- struct prio_array *array; -- int old_prio, delta; -+ int queued, old_prio,delta; - unsigned long flags; - struct rq *rq; - -@@ -4192,26 +4391,27 @@ - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; - } -- array = p->array; -- if (array) { -- dequeue_task(p, array); -+ queued = task_queued(p); -+ if (queued) { -+ dequeue_task(p, rq); - dec_raw_weighted_load(rq, p); - } - - p->static_prio = NICE_TO_PRIO(nice); -- set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); -+ set_quota(p); - delta = p->prio - old_prio; - -- if (array) { -- enqueue_task(p, array); -+ if (queued) { -+ enqueue_task(p, rq); - inc_raw_weighted_load(rq, p); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ -- if (delta < 0 || (delta > 0 && task_running(rq, p))) -+ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && -+ task_running(rq, p))) - resched_task(rq->curr); - } - out_unlock: -@@ -4281,11 +4481,23 @@ - * - * This is the priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered -- * around 0, value goes from -16 to +15. -+ * around 1, value goes from 0 to +79. Values higher than -+ * 39 indicate task is on the expired array. This is done -+ * lockless and may rarely return an active instead of -+ * expired value. - */ --int task_prio(const struct task_struct *p) -+int task_prio(struct task_struct *p) - { -- return p->prio - MAX_RT_PRIO; -+ int prio = p->prio - MAX_RT_PRIO; -+ -+ if (task_queued(p)) { -+ struct rq *rq = task_rq(p); -+ struct prio_array *array = p->array; -+ -+ if (rq && rq->expired == array) -+ prio += PRIO_RANGE; -+ } -+ return prio; - } - - /** -@@ -4328,19 +4540,14 @@ - /* Actually do priority change: must hold rq lock. 
*/ - static void __setscheduler(struct task_struct *p, int policy, int prio) - { -- BUG_ON(p->array); -+ BUG_ON(task_queued(p)); - - p->policy = policy; - p->rt_priority = prio; - p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); -- /* -- * SCHED_BATCH tasks are treated as perpetual CPU hogs: -- */ -- if (policy == SCHED_BATCH) -- p->sleep_avg = 0; -- set_load_weight(p); -+ set_quota(p); - } - - /** -@@ -4354,19 +4561,36 @@ - int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) - { -- int retval, oldprio, oldpolicy = -1; -- struct prio_array *array; -+ struct sched_param zero_param = { .sched_priority = 0 }; -+ int queued, retval, oldprio, oldpolicy = -1; -+ unsigned long rlim_rtprio = 0; - unsigned long flags; - struct rq *rq; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. -+ */ -+ policy = SCHED_ISO; -+ param = &zero_param; -+ } - recheck: - /* double check policy once rq lock held */ - if (policy < 0) - policy = oldpolicy = p->policy; -- else if (policy != SCHED_FIFO && policy != SCHED_RR && -- policy != SCHED_NORMAL && policy != SCHED_BATCH) -+ else if (!SCHED_RANGE(policy)) - return -EINVAL; - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are -@@ -4385,14 +4609,6 @@ - */ - if (!capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { -- unsigned long rlim_rtprio; -- unsigned long flags; -- -- if (!lock_task_sighand(p, &flags)) -- return -ESRCH; -- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; -- unlock_task_sighand(p, &flags); -- - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; -@@ -4401,6 +4617,31 @@ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) - return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy == SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } - } - - /* can't change other user's priorities */ -@@ -4409,6 +4650,11 @@ - return -EPERM; - } - -+ if (!(p->mm) && policy == SCHED_IDLEPRIO) { -+ /* Don't allow kernel threads to be SCHED_IDLEPRIO. 
*/ -+ return -EINVAL; -+ } -+ - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; -@@ -4429,12 +4675,12 @@ - spin_unlock_irqrestore(&p->pi_lock, flags); - goto recheck; - } -- array = p->array; -- if (array) -+ queued = task_queued(p); -+ if (queued) - deactivate_task(p, rq); - oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); -- if (array) { -+ if (queued) { - __activate_task(p, rq); - /* - * Reschedule if we are currently running on this runqueue and -@@ -4444,14 +4690,15 @@ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); -- } else if (TASK_PREEMPTS_CURR(p, rq)) -- resched_task(rq->curr); -+ } else -+ try_preempt(p, rq); - } - __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); - - rt_mutex_adjust_pi(p); - -+out: - return 0; - } - EXPORT_SYMBOL_GPL(sched_setscheduler); -@@ -4718,41 +4965,34 @@ - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU by moving the calling thread -- * to the expired array. If there are no other threads running on this -- * CPU then this function will return. -+ * to the expired array if SCHED_NORMAL or the end of its current priority -+ * queue if a realtime task. If there are no other threads running on this -+ * cpu this function will return. - */ - asmlinkage long sys_sched_yield(void) - { - struct rq *rq = this_rq_lock(); -- struct prio_array *array = current->array, *target = rq->expired; -+ struct task_struct *p = current; - - schedstat_inc(rq, yld_cnt); -- /* -- * We implement yielding by moving the task into the expired -- * queue. -- * -- * (special rule: RT tasks will just roundrobin in the active -- * array.) -- */ -- if (rt_task(current)) -- target = rq->active; -- -- if (array->nr_active == 1) { -- schedstat_inc(rq, yld_act_empty); -- if (!rq->expired->nr_active) -- schedstat_inc(rq, yld_both_empty); -- } else if (!rq->expired->nr_active) -- schedstat_inc(rq, yld_exp_empty); -- -- if (array != target) { -- dequeue_task(current, array); -- enqueue_task(current, target); -- } else -- /* -- * requeue_task is cheaper so perform that if possible. -- */ -- requeue_task(current, array); -+ if (rq->nr_running == 1) -+ schedstat_inc(rq, yld_both_empty); -+ else { -+ struct prio_array *old_array = p->array; -+ int old_prio = p->prio; -+ -+ if (idleprio_task(p)) { -+ dequeue_task(p, rq); -+ enqueue_task(p, rq); -+ goto out_release; -+ } -+ /* p->prio will be updated in requeue_task via queue_expired */ -+ if (!rt_task(p)) -+ p->array = rq->expired; -+ requeue_task(p, rq, old_array, old_prio); -+ } - -+out_release: - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: -@@ -4902,6 +5142,8 @@ - break; - case SCHED_NORMAL: - case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: - ret = 0; - break; - } -@@ -4926,6 +5168,8 @@ - break; - case SCHED_NORMAL: - case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: - ret = 0; - } - return ret; -@@ -4959,8 +5203,8 @@ - if (retval) - goto out_unlock; - -- jiffies_to_timespec(p->policy == SCHED_FIFO ? -- 0 : task_timeslice(p), &t); -+ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : -+ MS_TO_NS(task_timeslice(p))); - read_unlock(&tasklist_lock); - retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; - out_nounlock: -@@ -5056,10 +5300,10 @@ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - -- idle->timestamp = sched_clock(); -- idle->sleep_avg = 0; -- idle->array = NULL; -- idle->prio = idle->normal_prio = MAX_PRIO; -+ bitmap_zero(idle->bitmap, PRIO_RANGE); -+ idle->timestamp = idle->last_ran = sched_clock(); -+ idle->array = rq->active; -+ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); - idle->state = TASK_RUNNING; - idle->cpus_allowed = cpumask_of_cpu(cpu); - set_task_cpu(idle, cpu); -@@ -5178,7 +5422,7 @@ - goto out; - - set_task_cpu(p, dest_cpu); -- if (p->array) { -+ if (task_queued(p)) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step -@@ -5189,8 +5433,7 @@ - + rq_dest->most_recent_timestamp; - deactivate_task(p, rq_src); - __activate_task(p, rq_dest); -- if (TASK_PREEMPTS_CURR(p, rq_dest)) -- resched_task(rq_dest->curr); -+ try_preempt(p, rq_dest); - } - ret = 1; - out: -@@ -5487,7 +5730,7 @@ - /* Idle task back to normal (off runqueue, low prio) */ - rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); -- rq->idle->static_prio = MAX_PRIO; -+ rq->idle->static_prio = NICE_TO_PRIO(0); - __setscheduler(rq->idle, SCHED_NORMAL, 0); - migrate_dead_tasks(cpu); - task_rq_unlock(rq, &flags); -@@ -7013,6 +7256,13 @@ - /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) - BUG(); -+ -+ /* -+ * Assume that every added cpu gives us slightly less overall latency -+ * allowing us to increase the base rr_interval, but in a non linear -+ * fashion. -+ */ -+ rr_interval *= 1 + ilog2(num_online_cpus()); - } - #else - void __init sched_init_smp(void) -@@ -7035,6 +7285,16 @@ - int i, j, k; - int highest_cpu = 0; - -+ /* Generate the priority matrix */ -+ for (i = 0; i < PRIO_RANGE; i++) { -+ bitmap_fill(prio_matrix[i], PRIO_RANGE); -+ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); -+ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { -+ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), -+ prio_matrix[i]); -+ } -+ } -+ - for_each_possible_cpu(i) { - struct prio_array *array; - struct rq *rq; -@@ -7042,12 +7302,20 @@ - rq = cpu_rq(i); - spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); -+ rq->iso_ticks = 0; - rq->nr_running = 0; -+ rq->nr_idleprio = 0; -+ rq->prio_rotation = 0; - rq->active = rq->arrays; -+ rq->idleprio = rq->active; - rq->expired = rq->arrays + 1; -- rq->best_expired_prio = MAX_PRIO; -+ reset_prio_levels(rq); -+ rq->dyn_bitmap = rq->active->prio_bitmap; -+ rq->exp_bitmap = rq->expired->prio_bitmap; - - #ifdef CONFIG_SMP -+ rq->active->rq = rq; -+ rq->expired->rq = rq; - rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; -@@ -7060,17 +7328,16 @@ - atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { -+ - array = rq->arrays + j; -- for (k = 0; k < MAX_PRIO; k++) { -+ for (k = 0; k <= MAX_PRIO; k++) - INIT_LIST_HEAD(array->queue + k); -- __clear_bit(k, array->bitmap); -- } -- // delimiter for bitsearch -- __set_bit(MAX_PRIO, array->bitmap); -+ bitmap_zero(array->prio_bitmap, MAX_PRIO); -+ /* delimiter for bitsearch */ -+ __set_bit(MAX_PRIO, array->prio_bitmap); - } - highest_cpu = i; - } -- - set_load_weight(&init_task); - - #ifdef CONFIG_SMP -@@ -7125,25 +7392,25 @@ - #ifdef CONFIG_MAGIC_SYSRQ - void normalize_rt_tasks(void) - { -- struct prio_array *array; - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; -+ int queued; - - read_lock_irq(&tasklist_lock); - - 
do_each_thread(g, p) { -- if (!rt_task(p)) -+ if (!rt_task(p) && !iso_task(p)) - continue; - - spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); - -- array = p->array; -- if (array) -+ queued = task_queued(p); -+ if (queued) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); -- if (array) { -+ if (queued) { - __activate_task(p, task_rq(p)); - resched_task(rq->curr); - } -Index: linux-2.6.22-ck1/kernel/sysctl.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000 -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -70,6 +71,7 @@ - extern char core_pattern[]; - extern int pid_max; - extern int min_free_kbytes; -+extern int vm_tail_largefiles; - extern int printk_ratelimit_jiffies; - extern int printk_ratelimit_burst; - extern int pid_max_min, pid_max_max; -@@ -78,6 +80,10 @@ - extern int compat_log; - extern int maps_protect; - extern int sysctl_stat_interval; -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_iso_period; - - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ - static int maxolduid = 65535; -@@ -161,6 +167,14 @@ - #endif - - -+/* Constants for minimum and maximum testing. -+ We use these as one-element integer vectors. */ -+static int __read_mostly zero; -+static int __read_mostly one = 1; -+static int __read_mostly one_hundred = 100; -+static int __read_mostly five_thousand = 5000; -+ -+ - /* The default sysctl tables: */ - - static ctl_table root_table[] = { -@@ -501,6 +515,47 @@ - .mode = 0444, - .proc_handler = &proc_dointvec, - }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &one, -+ .extra2 = &five_thousand, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "iso_period", -+ .data = &sched_iso_period, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .strategy = &sysctl_intvec, -+ .extra1 = &one, -+ .extra2 = &one_hundred, -+ }, - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, -@@ -619,14 +674,16 @@ - { .ctl_name = 0 } - }; - --/* Constants for minimum and maximum testing in vm_table. -- We use these as one-element integer vectors. 
*/ --static int zero; --static int one_hundred = 100; -- -- - static ctl_table vm_table[] = { - { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "tail_largefiles", -+ .data = &vm_tail_largefiles, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { - .ctl_name = VM_OVERCOMMIT_MEMORY, - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, -@@ -705,16 +762,24 @@ - .proc_handler = &proc_dointvec, - }, - { -- .ctl_name = VM_SWAPPINESS, -- .procname = "swappiness", -- .data = &vm_swappiness, -- .maxlen = sizeof(vm_swappiness), -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "mapped", -+ .data = &vm_mapped, -+ .maxlen = sizeof(vm_mapped), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "hardmaplimit", -+ .data = &vm_hardmaplimit, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, - #ifdef CONFIG_HUGETLB_PAGE - { - .ctl_name = VM_HUGETLB_PAGES, -@@ -882,6 +947,32 @@ - .extra1 = &zero, - }, - #endif -+#ifdef CONFIG_SWAP_PREFETCH -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "swap_prefetch", -+ .data = &swap_prefetch, -+ .maxlen = sizeof(swap_prefetch), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "swap_prefetch_delay", -+ .data = &swap_prefetch_delay, -+ .maxlen = sizeof(swap_prefetch_delay), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+ { -+ .ctl_name = CTL_UNNUMBERED, -+ .procname = "swap_prefetch_sleep", -+ .data = &swap_prefetch_sleep, -+ .maxlen = sizeof(swap_prefetch_sleep), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec, -+ }, -+#endif - { .ctl_name = 0 } - }; - -Index: linux-2.6.22-ck1/Documentation/sched-design.txt -=================================================================== ---- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000 -@@ -1,11 +1,14 @@ -- Goals, Design and Implementation of the -- new ultra-scalable O(1) scheduler -+ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by -+ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by -+ Con Kolivas. - - -- This is an edited version of an email Ingo Molnar sent to -- lkml on 4 Jan 2002. It describes the goals, design, and -- implementation of Ingo's new ultra-scalable O(1) scheduler. -- Last Updated: 18 April 2002. -+ This was originally an edited version of an email Ingo Molnar sent to -+ lkml on 4 Jan 2002. It describes the goals, design, and implementation -+ of Ingo's ultra-scalable O(1) scheduler. It now contains a description -+ of the Staircase Deadline priority scheduler that was built on this -+ design. -+ Last Updated: Fri, 4 May 2007 - - - Goal -@@ -163,3 +166,222 @@ - code is smaller than the old one. - - Ingo -+ -+ -+Staircase Deadline cpu scheduler policy -+================================================ -+ -+Design summary -+============== -+ -+A novel design which incorporates a foreground-background descending priority -+system (the staircase) via a bandwidth allocation matrix according to nice -+level. -+ -+ -+Features -+======== -+ -+A starvation free, strict fairness O(1) scalable design with interactivity -+as good as the above restrictions can provide. 
There is no interactivity
-+estimator, no sleep/run measurements and only simple fixed accounting.
-+The design and its accounting are strict enough that task behaviour
-+can be modelled and maximum scheduling latencies can be predicted by
-+the virtual deadline mechanism that manages runqueues. The prime concern
-+in this design is to maintain fairness at all costs determined by nice level,
-+yet to maintain as good interactivity as can be allowed within the
-+constraints of strict fairness.
-+
-+
-+Design description
-+==================
-+
-+SD works off the principle of providing each task a quota of runtime that it is
-+allowed to run at a number of priority levels determined by its static priority
-+(i.e. its nice level). If the task uses up its quota it has its priority
-+decremented to the next level determined by a priority matrix. Once the
-+runtime quota of every priority level has been consumed, a task is queued on the
-+"expired" array. When no other tasks exist with quota, the expired array is
-+activated and fresh quotas are handed out. This is all done in O(1).
-+
-+Design details
-+==============
-+
-+Each task keeps a record of its own entitlement of cpu time. Most of the rest of
-+these details apply to non-realtime tasks as rt task management is
-+straightforward.
-+
-+Each runqueue keeps a record of what major epoch it is up to in the
-+rq->prio_rotation field which is incremented on each major epoch. It also
-+keeps a record of the current prio_level for each static priority task.
-+
-+Each task keeps a record of what major runqueue epoch it was last running
-+on in p->rotation. It also keeps a record of what priority levels it has
-+already been allocated quota from during this epoch in a bitmap p->bitmap.
-+
-+The only tunable that determines all other details is the RR_INTERVAL. This
-+is set to 8ms, and is scaled gently upwards with more cpus. This value is
-+tunable via a /proc interface.
-+
-+All tasks are initially given a quota based on RR_INTERVAL. This is equal to
-+RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and
-+progressively larger for nice values from -1 to -20. This is assigned to
-+p->quota and only changes with changes in nice level.
-+
-+As a task is first queued, it checks in recalc_task_prio to see if it has run at
-+this runqueue's current priority rotation. If it has not, it will have its
-+p->prio level set according to the first slot in a "priority matrix", will be
-+given a p->time_slice equal to the p->quota, and will have its allocation bitmap
-+bit set in p->bitmap for this prio level. It is then queued on the current active
-+priority array.
-+
-+If a task has already been running during this major epoch, and it has
-+p->time_slice left and the rq->prio_quota for the task's p->prio still
-+has quota, it will be placed back on the active array, but no more quota
-+will be added.
-+
-+If a task has been running during this major epoch, but does not have
-+p->time_slice left, it will find the next lowest priority in its bitmap that it
-+has not been allocated quota from. It then gets a full quota in
-+p->time_slice. It is then queued on the current active priority array at the
-+newly determined lower priority.
-+
-+If a task has been running during this major epoch, and does not have
-+any entitlement left in p->bitmap and no time_slice left, it will have its
-+bitmap cleared, and be queued at its best prio again, but on the expired
-+priority array.
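The per-task bookkeeping described above can be sketched in ordinary user-space C. This is only an illustrative model, not the patch's kernel code: struct sd_task, find_next_slot(), sd_account() and RR_QUOTA_NS are names invented here, and the allowed[] array stands in for one row of the priority matrix described in the Priority Matrix section below.

    #include <stdio.h>
    #include <string.h>

    #define PRIO_RANGE  40
    #define RR_QUOTA_NS 8000000LL       /* 8ms, the default RR_INTERVAL */

    struct sd_task {
        unsigned char used[PRIO_RANGE]; /* stands in for p->bitmap */
        int prio;                       /* current dynamic priority slot */
        long long time_slice;           /* stands in for p->time_slice, in ns */
        int expired;                    /* ended up on the expired array? */
    };

    /* First slot this nice level may use (allowed[slot] == 0) with quota left. */
    static int find_next_slot(const struct sd_task *t, const unsigned char *allowed)
    {
        for (int slot = 0; slot < PRIO_RANGE; slot++)
            if (!allowed[slot] && !t->used[slot])
                return slot;
        return -1;                      /* every permitted slot consumed this epoch */
    }

    /* Account ran_ns of cpu time, roughly what scheduler_tick/update_cpu_clock do. */
    static void sd_account(struct sd_task *t, const unsigned char *allowed, long long ran_ns)
    {
        t->time_slice -= ran_ns;
        if (t->time_slice > 0)
            return;                     /* keep running at the same slot */
        t->used[t->prio] = 1;           /* this slot's quota is spent */
        int next = find_next_slot(t, allowed);
        if (next < 0) {                 /* bitmap exhausted: fresh bitmap... */
            memset(t->used, 0, sizeof(t->used));
            next = find_next_slot(t, allowed);
            t->expired = 1;             /* ...but queued on the expired array */
        }
        t->prio = next;                 /* descend the staircase */
        t->time_slice = RR_QUOTA_NS;
    }

    int main(void)
    {
        unsigned char nice0[PRIO_RANGE];
        struct sd_task t = { .time_slice = RR_QUOTA_NS };

        for (int i = 0; i < PRIO_RANGE; i++)
            nice0[i] = (unsigned char)((i + 1) % 2);  /* the nice 0 row: 1010... */
        t.prio = find_next_slot(&t, nice0);           /* best slot for nice 0 */
        for (int quota = 1; quota <= 25; quota++) {
            sd_account(&t, nice0, RR_QUOTA_NS);
            printf("after quota %2d: slot %2d %s\n", quota, t.prio,
                   t.expired ? "(expired array)" : "(active array)");
        }
        return 0;
    }

Running it shows the task stepping through its permitted slots one quota at a time and, once the bitmap is exhausted, restarting at its best slot on the expired array.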
-+
-+When a task is queued, it has its relevant bit set in the array->prio_bitmap.
-+
-+p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on
-+schedule() and scheduler_tick. If p->time_slice is below zero then the
-+priority is readjusted via recalc_task_prio and the task rescheduled.
-+
-+
-+Priority Matrix
-+===============
-+
-+In order to minimise the latencies between tasks of different nice levels
-+running concurrently, the dynamic priority slots where different nice levels
-+are queued are dithered instead of being sequential. What this means is that
-+there are 40 priority slots where a task may run during one major rotation,
-+and the allocation of slots is dependent on nice level. In the
-+following table, a zero represents a slot where the task may run.
-+
-+PRIORITY:0..................20.................39
-+nice -20 0000000000000000000000000000000000000000
-+nice -10 1000100010001000100010001000100010010000
-+nice   0 1010101010101010101010101010101010101010
-+nice   5 1011010110110101101101011011010110110110
-+nice  10 1110111011101110111011101110111011101110
-+nice  15 1111111011111110111111101111111011111110
-+nice  19 1111111111111111111111111111111111111110
-+
-+As can be seen, a nice -20 task runs in every priority slot whereas a nice 19
-+task only runs one slot per major rotation. This dithered table allows for the
-+smallest possible maximum latencies between tasks of varying nice levels, thus
-+allowing vastly different nice levels to be used.
-+
-+SCHED_BATCH tasks are managed slightly differently, receiving only the top
-+slots from their priority bitmap, giving them the same cpu share as SCHED_NORMAL
-+but slightly higher latencies.
-+
-+
-+Modelling deadline behaviour
-+============================
-+
-+As the accounting in this design is hard and not modified by sleep average
-+calculations or interactivity modifiers, it is possible to accurately
-+predict the maximum latency that a task may experience under different
-+conditions. This is a virtual deadline mechanism enforced by mandatory
-+timeslice expiration and not outside bandwidth measurement.
-+
-+The maximum duration a task can run during one major epoch is determined by its
-+nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL
-+duration during each epoch. Nice 10 tasks can run at 9 priority levels for each
-+epoch, and so on. The table in the priority matrix above demonstrates how this
-+is enforced.
-+
-+Therefore the maximum duration a runqueue epoch can take is determined by
-+the number of tasks running and their nice levels. After that, the maximum
-+time a task can wait before it gets scheduled is determined by the position
-+of its first slot on the matrix.
-+
-+In the following examples, these are _worst case scenarios_ and would rarely
-+occur, but can be modelled nonetheless to determine the maximum possible
-+latency.
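The dithered table above is generated at boot by the prio_matrix loop added to sched_init() earlier in this patch. The stand-alone program below mirrors that loop with plain arrays instead of bitmap_fill()/__clear_bit(), so the row for any nice level can be reprinted; the slot counts it yields are what the worst-case examples that follow are built from.

    #include <stdio.h>

    #define PRIO_RANGE 40

    int main(void)
    {
        static int prio_matrix[PRIO_RANGE][PRIO_RANGE];

        for (int i = 0; i < PRIO_RANGE; i++) {
            for (int k = 0; k < PRIO_RANGE; k++)
                prio_matrix[i][k] = 1;                           /* bitmap_fill() */
            int j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
            for (int k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
                prio_matrix[i][PRIO_RANGE - 1 - (k / PRIO_RANGE)] = 0;  /* __clear_bit() */
        }

        /* One row per nice level; a 0 means the task may run in that slot. */
        for (int nice = -20; nice <= 19; nice++) {
            printf("nice %3d ", nice);
            for (int slot = 0; slot < PRIO_RANGE; slot++)
                printf("%d", prio_matrix[nice + 20][slot]);
            printf("\n");
        }
        return 0;
    }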
-+
-+So for example, if two nice 0 tasks are running, and one has just expired as
-+another is activated for the first time receiving a full quota for this
-+runqueue rotation, the first task will wait:
-+
-+nr_tasks * max_duration + nice_difference * rr_interval
-+1 * 19 * RR_INTERVAL + 0 = 152ms
-+
-+In the presence of a nice 10 task, a nice 0 task would wait a maximum of
-+1 * 10 * RR_INTERVAL + 0 = 80ms
-+
-+In the presence of a nice 0 task, a nice 10 task would wait a maximum of
-+1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms
-+
-+More useful than these values, though, are the average latencies, which are
-+a matter of determining the average distance between priority slots of
-+different nice values and multiplying it by the tasks' quota. For example,
-+in the presence of a nice -10 task, a nice 0 task will wait either one or
-+two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL,
-+this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or
-+20 and 40ms respectively (on uniprocessor at 1000HZ).
-+
-+
-+Achieving interactivity
-+=======================
-+
-+A requirement of this scheduler design was to achieve good interactivity
-+despite being a completely fair, deadline based design. The disadvantage of
-+designs that try to achieve interactivity is that they usually do so at
-+the expense of maintaining fairness. As cpu speeds increase, the requirement
-+for some sort of metered unfairness towards interactive tasks becomes a less
-+desirable phenomenon, but low latency and fairness remain mandatory for
-+good interactive performance.
-+
-+This design relies on the fact that interactive tasks, by their nature,
-+sleep often. Most fair scheduling designs end up penalising such tasks
-+indirectly, giving them less than their fair possible share because of the
-+sleep, and have to use a mechanism of bonusing their priority to offset
-+this based on the duration they sleep. This becomes increasingly inaccurate
-+as the number of running tasks rises and more tasks spend time waiting on
-+runqueues rather than sleeping, and it is impossible to tell whether the
-+task that's waiting on a runqueue only intends to run for a short period and
-+then sleep again after that runqueue wait. Furthermore, all such designs rely
-+on a period of time to pass to accumulate some form of statistic on the task
-+before deciding how much preference to give it. The shorter this period,
-+the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The
-+longer this period, the longer it takes for interactive tasks to get low
-+scheduling latencies and fair cpu.
-+
-+This design does not measure sleep time at all. Interactive tasks that sleep
-+often will wake up having consumed very little if any of their quota for
-+the current major priority rotation. The longer they have slept, the less
-+likely they are to even be on the current major priority rotation. Once
-+woken up, though, they get to use up their full quota for that epoch,
-+whether part of a quota remains or a fresh full quota is granted. Overall,
-+however, they can still only run as much cpu time for that epoch as any other
-+task of the same nice level. This means that two tasks behaving completely
-+differently, from fully cpu bound to waking/sleeping extremely frequently, will
-+still get the same quota of cpu, but the latter will be using its quota for that
-+epoch in bursts rather than continuously. This guarantees that interactive
-+tasks get the same amount of cpu as cpu bound ones.
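The worst-case figures from the modelling section reduce to a single expression. The snippet below is just a restatement of the formula used in those examples (nr_tasks * max_duration + nice_difference * rr_interval); worst_case_wait_ms() is a name invented for illustration, the slot counts are read off the matrix, and RR_INTERVAL is taken at its default of 8ms.

    #include <stdio.h>

    /* slots_per_epoch: slots the competing task may consume this epoch
     * (19 for nice 0, 10 for nice 10 in the examples above);
     * extra_slots: the nice_difference * rr_interval term. */
    static int worst_case_wait_ms(int nr_tasks, int slots_per_epoch,
                                  int extra_slots, int rr_interval_ms)
    {
        return (nr_tasks * slots_per_epoch + extra_slots) * rr_interval_ms;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               worst_case_wait_ms(1, 19, 0, 8),   /* two nice 0 tasks         -> 152 */
               worst_case_wait_ms(1, 10, 0, 8),   /* nice 0 vs a nice 10 task ->  80 */
               worst_case_wait_ms(1, 19, 1, 8));  /* nice 10 vs a nice 0 task -> 160 */
        return 0;
    }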
-+ -+The other requirement of interactive tasks is also to obtain low latencies -+for when they are scheduled. Unlike fully cpu bound tasks and the maximum -+latencies possible described in the modelling deadline behaviour section -+above, tasks that sleep will wake up with quota available usually at the -+current runqueue's priority_level or better. This means that the most latency -+they are likely to see is one RR_INTERVAL, and often they will preempt the -+current task if it is not of a sleeping nature. This then guarantees very -+low latency for interactive tasks, and the lowest latencies for the least -+cpu bound tasks. -+ -+ -+Fri, 4 May 2007 -+Con Kolivas -Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt -=================================================================== ---- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000 -@@ -25,6 +25,9 @@ - - domainname - - hostname - - hotplug -+- interactive -+- iso_cpu -+- iso_period - - java-appletviewer [ binfmt_java, obsolete ] - - java-interpreter [ binfmt_java, obsolete ] - - kstack_depth_to_print [ X86 only ] -@@ -43,6 +46,7 @@ - - printk - - real-root-dev ==> Documentation/initrd.txt - - reboot-cmd [ SPARC only ] -+- rr_interval - - rtsig-max - - rtsig-nr - - sem -@@ -164,6 +168,40 @@ - - ============================================================== - -+interactive: -+ -+The staircase-deadline cpu scheduler can be set in either purely -+forward-looking mode for absolutely rigid fairness and cpu distribution -+according to nice level, or it can allow a small per-process history -+to smooth out cpu usage perturbations common in interactive tasks by -+enabling this sysctl. While small fairness issues can arise with this -+enabled, overall fairness is usually still strongly maintained and -+starvation is never possible. Enabling this can significantly smooth -+out 3d graphics and games. -+ -+Default value is 1 (enabled). -+ -+============================================================== -+ -+iso_cpu: -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling iso_period -+seconds. -+ -+Set to 80 (percent) by default. -+ -+============================================================== -+ -+iso_period: -+ -+This sets the number of seconds over which SCHED_ISO cpu usage is averaged -+to see if it exceeds its allocated cpu bandwidth. -+ -+Set to 5 (seconds) by default. -+ -+============================================================== -+ - l2cr: (PPC only) - - This flag controls the L2 cache of G3 processor boards. If -@@ -288,6 +326,19 @@ - - ============================================================== - -+rr_interval: -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. This value is in milliseconds and the default value chosen -+depends on the number of cpus available at scheduler initialisation -+with a minimum of 8. -+ -+Valid values are from 1-5000. 
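For reference, the four scheduler tunables documented above (rr_interval, interactive, iso_cpu, iso_period) are exposed through the kernel sysctl table added earlier in this patch. The small user-space sketch below reads them back; the /proc/sys/kernel/ paths are an assumption based on those table entries and on their listing in the kernel sysctl documentation, so adjust if your layout differs.

    #include <stdio.h>
    #include <string.h>

    /* Read one scheduler sysctl into buf; returns 0 on success. */
    static int read_sysctl(const char *name, char *buf, size_t len)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (!fgets(buf, (int)len, f)) {
            fclose(f);
            return -1;
        }
        fclose(f);
        buf[strcspn(buf, "\n")] = '\0';
        return 0;
    }

    int main(void)
    {
        const char *names[] = { "rr_interval", "interactive", "iso_cpu", "iso_period" };
        char val[64];
        size_t i;

        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
            if (read_sysctl(names[i], val, sizeof(val)) == 0)
                printf("%-12s = %s\n", names[i], val);
            else
                printf("%-12s not available (scheduler patch not applied?)\n", names[i]);
        }
        return 0;
    }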
-+ -+============================================================== -+ - rtsig-max & rtsig-nr: - - The file rtsig-max can be used to tune the maximum number -Index: linux-2.6.22-ck1/fs/pipe.c -=================================================================== ---- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000 -@@ -41,12 +41,7 @@ - { - DEFINE_WAIT(wait); - -- /* -- * Pipes are system-local resources, so sleeping on them -- * is considered a noninteractive wait: -- */ -- prepare_to_wait(&pipe->wait, &wait, -- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); -+ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); - schedule(); -Index: linux-2.6.22-ck1/fs/proc/array.c -=================================================================== ---- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000 -@@ -165,7 +165,6 @@ - rcu_read_lock(); - buffer += sprintf(buffer, - "State:\t%s\n" -- "SleepAVG:\t%lu%%\n" - "Tgid:\t%d\n" - "Pid:\t%d\n" - "PPid:\t%d\n" -@@ -173,7 +172,6 @@ - "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n", - get_task_state(p), -- (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, - pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, -Index: linux-2.6.22-ck1/include/linux/init_task.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000 -@@ -125,13 +125,15 @@ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ - .normal_prio = MAX_PRIO-20, \ -+ .rotation = 0, \ - .policy = SCHED_NORMAL, \ - .cpus_allowed = CPU_MASK_ALL, \ - .mm = NULL, \ - .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .ioprio = 0, \ -- .time_slice = HZ, \ -+ .time_slice = 1000000000, \ -+ .quota = 1000000000, \ - .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ - .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ -@@ -158,6 +160,7 @@ - .signal = {{0}}}, \ - .blocked = {{0}}, \ - .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ -+ .mutexes_held = 0, \ - .journal_info = NULL, \ - .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ - .fs_excl = ATOMIC_INIT(0), \ -Index: linux-2.6.22-ck1/kernel/softirq.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000 -@@ -488,7 +488,7 @@ - - static int ksoftirqd(void * __bind_cpu) - { -- set_user_nice(current, 19); -+ set_user_nice(current, 15); - current->flags |= PF_NOFREEZE; - - set_current_state(TASK_INTERRUPTIBLE); -Index: linux-2.6.22-ck1/kernel/workqueue.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000 -@@ -285,8 +285,6 @@ - if (!cwq->wq->freezeable) - current->flags |= PF_NOFREEZE; - -- set_user_nice(current, -5); -- - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && -Index: 
linux-2.6.22-ck1/kernel/kthread.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000 -@@ -223,7 +223,6 @@ - - ignore_signals(tsk); - -- set_user_nice(tsk, -5); - set_cpus_allowed(tsk, CPU_MASK_ALL); - } - -Index: linux-2.6.22-ck1/kernel/fork.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000 -@@ -1063,6 +1063,7 @@ - p->io_context = NULL; - p->io_wait = NULL; - p->audit_context = NULL; -+ p->mutexes_held = 0; - cpuset_fork(p); - #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); -Index: linux-2.6.22-ck1/kernel/mutex.c -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000 -@@ -60,6 +60,16 @@ - static void fastcall noinline __sched - __mutex_lock_slowpath(atomic_t *lock_count); - -+static inline void inc_mutex_count(void) -+{ -+ current->mutexes_held++; -+} -+ -+static inline void dec_mutex_count(void) -+{ -+ current->mutexes_held--; -+} -+ - /*** - * mutex_lock - acquire the mutex - * @lock: the mutex to be acquired -@@ -89,6 +99,7 @@ - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); -+ inc_mutex_count(); - } - - EXPORT_SYMBOL(mutex_lock); -@@ -114,6 +125,7 @@ - * into 'unlocked' state: - */ - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); -+ dec_mutex_count(); - } - - EXPORT_SYMBOL(mutex_unlock); -@@ -283,9 +295,14 @@ - */ - int fastcall __sched mutex_lock_interruptible(struct mutex *lock) - { -+ int ret; -+ - might_sleep(); -- return __mutex_fastpath_lock_retval -+ ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); -+ if (likely(!ret)) -+ inc_mutex_count(); -+ return ret; - } - - EXPORT_SYMBOL(mutex_lock_interruptible); -@@ -340,8 +357,12 @@ - */ - int fastcall __sched mutex_trylock(struct mutex *lock) - { -- return __mutex_fastpath_trylock(&lock->count, -+ int ret = __mutex_fastpath_trylock(&lock->count, - __mutex_trylock_slowpath); -+ -+ if (likely(ret)) -+ inc_mutex_count(); -+ return ret; - } - - EXPORT_SYMBOL(mutex_trylock); -Index: linux-2.6.22-ck1/block/cfq-iosched.c -=================================================================== ---- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000 -@@ -1276,10 +1276,12 @@ - printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); - case IOPRIO_CLASS_NONE: - /* -- * no prio set, place us in the middle of the BE classes -+ * Select class and ioprio according to policy and nice - */ -+ cfqq->ioprio_class = task_policy_ioprio_class(tsk); - cfqq->ioprio = task_nice_ioprio(tsk); -- cfqq->ioprio_class = IOPRIO_CLASS_BE; -+ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) -+ cfq_clear_cfqq_idle_window(cfqq); - break; - case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(tsk); -Index: linux-2.6.22-ck1/include/linux/ioprio.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 
14:55:21.000000000 +1000 -@@ -22,7 +22,7 @@ - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. - */ --enum { -+enum ioprio_class { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, -@@ -51,8 +51,25 @@ - return IOPRIO_PRIO_DATA(task->ioprio); - } - -+static inline enum ioprio_class -+ task_policy_ioprio_class(struct task_struct *task) -+{ -+ if (rt_task(task)) -+ return IOPRIO_CLASS_RT; -+ if (idleprio_task(task)) -+ return IOPRIO_CLASS_IDLE; -+ return IOPRIO_CLASS_BE; -+} -+ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (rt_task(task)) -+ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / -+ (MAX_RT_PRIO + 1); -+ if (iso_task(task)) -+ return 0; -+ if (idleprio_task(task)) -+ return IOPRIO_BE_NR - 1; - return (task_nice(task) + 20) / 5; - } - -Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt -=================================================================== ---- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000 -@@ -22,6 +22,8 @@ - - dirty_background_ratio - - dirty_expire_centisecs - - dirty_writeback_centisecs -+- hardmaplimit -+- mapped - - max_map_count - - min_free_kbytes - - laptop_mode -@@ -31,12 +33,15 @@ - - min_unmapped_ratio - - min_slab_ratio - - panic_on_oom -+- swap_prefetch -+- swap_prefetch_delay -+- swap_prefetch_sleep - - ============================================================== - - dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, - dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, --block_dump, swap_token_timeout, drop-caches: -+block_dump, swap_token_timeout, drop-caches, tail_largefiles: - - See Documentation/filesystems/proc.txt - -@@ -86,6 +91,27 @@ - - ============================================================== - -+hardmaplimit: -+ -+This flag makes the vm adhere to the mapped value as closely as possible -+except in the most extreme vm stress where doing so would provoke an out -+of memory condition (see mapped below). -+ -+Enabled by default. -+ -+============================================================== -+ -+mapped: -+ -+This is the percentage ram that is filled with mapped pages (applications) -+before the vm will start reclaiming mapped pages by moving them to swap. -+It is altered by the relative stress of the vm at the time so is not -+strictly adhered to to prevent provoking out of memory kills. -+ -+Set to 66 by default. -+ -+============================================================== -+ - max_map_count: - - This file contains the maximum number of memory map areas a process -@@ -216,3 +242,37 @@ - The default value is 0. - 1 and 2 are for failover of clustering. Please select either - according to your policy of failover. -+ -+============================================================== -+ -+swap_prefetch -+ -+This enables or disables the swap prefetching feature. When the virtual -+memory subsystem has been extremely idle for at least swap_prefetch_sleep -+seconds it will start copying back pages from swap into the swapcache and keep -+a copy in swap. Valid values are 0 - 3. A value of 0 disables swap -+prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the -+presence of laptop_mode, and 3 enables it unconditionally, ignoring whether -+the system is idle or not. 
If set to 0, swap prefetch wil not even try to keep -+record of ram swapped out to have the most minimal impact on performance. -+ -+The default value is 1. -+ -+============================================================== -+ -+swap_prefetch_delay -+ -+This is the time in seconds that swap prefetching is delayed upon finding -+the system is not idle (ie the vm is busy or non-niced cpu load is present). -+ -+The default value is 1. -+ -+============================================================== -+ -+swap_prefetch_sleep -+ -+This is the time in seconds that the swap prefetch kernel thread is put to -+sleep for when the ram is found to be full and it is unable to prefetch -+further. -+ -+The default value is 5. -Index: linux-2.6.22-ck1/include/linux/swap.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000 -@@ -180,6 +180,7 @@ - /* linux/mm/swap.c */ - extern void FASTCALL(lru_cache_add(struct page *)); - extern void FASTCALL(lru_cache_add_active(struct page *)); -+extern void FASTCALL(lru_cache_add_tail(struct page *)); - extern void FASTCALL(activate_page(struct page *)); - extern void FASTCALL(mark_page_accessed(struct page *)); - extern void lru_add_drain(void); -@@ -188,9 +189,11 @@ - extern void swap_setup(void); - - /* linux/mm/vmscan.c */ --extern unsigned long try_to_free_pages(struct zone **, gfp_t); -+extern unsigned long try_to_free_pages(struct zone **, gfp_t, -+ struct task_struct *p); - extern unsigned long shrink_all_memory(unsigned long nr_pages); --extern int vm_swappiness; -+extern int vm_mapped; -+extern int vm_hardmaplimit; - extern int remove_mapping(struct address_space *mapping, struct page *page); - extern long vm_total_pages; - -@@ -237,6 +240,7 @@ - extern struct page * lookup_swap_cache(swp_entry_t); - extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, - unsigned long addr); -+extern int add_to_swap_cache(struct page *page, swp_entry_t entry); - /* linux/mm/swapfile.c */ - extern long total_swap_pages; - extern unsigned int nr_swapfiles; -Index: linux-2.6.22-ck1/init/Kconfig -=================================================================== ---- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/init/Kconfig 2007-07-10 14:55:22.000000000 +1000 -@@ -105,6 +105,28 @@ - used to provide more virtual memory than the actual RAM present - in your computer. If unsure say Y. - -+config SWAP_PREFETCH -+ bool "Support for prefetching swapped memory" -+ depends on SWAP -+ default y -+ ---help--- -+ This option will allow the kernel to prefetch swapped memory pages -+ when idle. The pages will be kept on both swap and in swap_cache -+ thus avoiding the need for further I/O if either ram or swap space -+ is required. -+ -+ What this will do on workstations is slowly bring back applications -+ that have swapped out after memory intensive workloads back into -+ physical ram if you have free ram at a later stage and the machine -+ is relatively idle. This means that when you come back to your -+ computer after leaving it idle for a while, applications will come -+ to life faster. Note that your swap usage will appear to increase -+ but these are cached pages, can be dropped freely by the vm, and it -+ should stabilise around 50% swap usage maximum. 
-+ -+ Workstations and multiuser workstation servers will most likely want -+ to say Y. -+ - config SYSVIPC - bool "System V IPC" - ---help--- -Index: linux-2.6.22-ck1/mm/Makefile -=================================================================== ---- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000 -@@ -17,6 +17,7 @@ - obj-y += bounce.o - endif - obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o -+obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o - obj-$(CONFIG_HUGETLBFS) += hugetlb.o - obj-$(CONFIG_NUMA) += mempolicy.o - obj-$(CONFIG_SPARSEMEM) += sparse.o -Index: linux-2.6.22-ck1/mm/swap.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000 -@@ -17,6 +17,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -176,6 +177,7 @@ - */ - static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; - static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; -+static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; - - void fastcall lru_cache_add(struct page *page) - { -@@ -197,6 +199,31 @@ - put_cpu_var(lru_add_active_pvecs); - } - -+static void __pagevec_lru_add_tail(struct pagevec *pvec) -+{ -+ int i; -+ struct zone *zone = NULL; -+ -+ for (i = 0; i < pagevec_count(pvec); i++) { -+ struct page *page = pvec->pages[i]; -+ struct zone *pagezone = page_zone(page); -+ -+ if (pagezone != zone) { -+ if (zone) -+ spin_unlock_irq(&zone->lru_lock); -+ zone = pagezone; -+ spin_lock_irq(&zone->lru_lock); -+ } -+ BUG_ON(PageLRU(page)); -+ SetPageLRU(page); -+ add_page_to_inactive_list_tail(zone, page); -+ } -+ if (zone) -+ spin_unlock_irq(&zone->lru_lock); -+ release_pages(pvec->pages, pvec->nr, pvec->cold); -+ pagevec_reinit(pvec); -+} -+ - static void __lru_add_drain(int cpu) - { - struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); -@@ -207,6 +234,9 @@ - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); -+ pvec = &per_cpu(lru_add_tail_pvecs, cpu); -+ if (pagevec_count(pvec)) -+ __pagevec_lru_add_tail(pvec); - } - - void lru_add_drain(void) -@@ -403,6 +433,20 @@ - } - - /* -+ * Function used uniquely to put pages back to the lru at the end of the -+ * inactive list to preserve the lru order. 
-+ */ -+void fastcall lru_cache_add_tail(struct page *page) -+{ -+ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); -+ -+ page_cache_get(page); -+ if (!pagevec_add(pvec, page)) -+ __pagevec_lru_add_tail(pvec); -+ put_cpu_var(lru_add_pvecs); -+} -+ -+/* - * Try to drop buffers from the pages in a pagevec - */ - void pagevec_strip(struct pagevec *pvec) -@@ -514,6 +558,9 @@ - * Right now other parts of the system means that we - * _really_ don't want to cluster much more - */ -+ -+ prepare_swap_prefetch(); -+ - #ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); - #endif -Index: linux-2.6.22-ck1/mm/swap_prefetch.c -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000 -@@ -0,0 +1,542 @@ -+/* -+ * linux/mm/swap_prefetch.c -+ * -+ * Copyright (C) 2005-2007 Con Kolivas -+ * -+ * Written by Con Kolivas -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * sysctls: -+ * swap_prefetch: 0. Disable swap prefetching -+ * 1. Prefetch only when idle and not with laptop_mode -+ * 2. Prefetch when idle and with laptop_mode -+ * 3. Prefetch at all times. -+ * swap_prefetch_delay: Number of seconds to delay prefetching when system -+ * is not idle. -+ * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when -+ * unable to prefetch. -+ */ -+int swap_prefetch __read_mostly = 1; -+int swap_prefetch_delay __read_mostly = 1; -+int swap_prefetch_sleep __read_mostly = 5; -+ -+#define PREFETCH_DELAY (HZ * swap_prefetch_delay) -+#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1) -+ -+struct swapped_root { -+ unsigned long busy; /* vm busy */ -+ spinlock_t lock; /* protects all data */ -+ struct list_head list; /* MRU list of swapped pages */ -+ struct radix_tree_root swap_tree; /* Lookup tree of pages */ -+ unsigned int count; /* Number of entries */ -+ unsigned int maxcount; /* Maximum entries allowed */ -+ struct kmem_cache *cache; /* Of struct swapped_entry */ -+}; -+ -+static struct swapped_root swapped = { -+ .lock = SPIN_LOCK_UNLOCKED, -+ .list = LIST_HEAD_INIT(swapped.list), -+ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), -+}; -+ -+static struct task_struct *kprefetchd_task; -+ -+/* -+ * We check to see no part of the vm is busy. If it is this will interrupt -+ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. -+ */ -+inline void delay_swap_prefetch(void) -+{ -+ if (!test_bit(0, &swapped.busy)) -+ __set_bit(0, &swapped.busy); -+} -+ -+/* -+ * If laptop_mode is enabled don't prefetch to avoid hard drives -+ * doing unnecessary spin-ups unless swap_prefetch is explicitly -+ * set to a higher value. -+ */ -+static inline int prefetch_enabled(void) -+{ -+ if (swap_prefetch <= laptop_mode) -+ return 0; -+ return 1; -+} -+ -+static int kprefetchd_awake; -+ -+/* -+ * Drop behind accounting which keeps a list of the most recently used swap -+ * entries. Entries are removed lazily by kprefetchd. 
-+ */ -+void add_to_swapped_list(struct page *page) -+{ -+ struct swapped_entry *entry; -+ unsigned long index, flags; -+ -+ if (!prefetch_enabled()) -+ goto out; -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ if (swapped.count >= swapped.maxcount) { -+ /* -+ * Once the number of entries exceeds maxcount we start -+ * removing the least recently used entries. -+ */ -+ entry = list_entry(swapped.list.next, -+ struct swapped_entry, swapped_list); -+ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); -+ list_del(&entry->swapped_list); -+ swapped.count--; -+ } else { -+ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); -+ if (unlikely(!entry)) -+ /* bad, can't allocate more mem */ -+ goto out_locked; -+ } -+ -+ index = page_private(page); -+ entry->swp_entry.val = index; -+ /* -+ * On numa we need to store the node id to ensure that we prefetch to -+ * the same node it came from. -+ */ -+ store_swap_entry_node(entry, page); -+ -+ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { -+ list_add(&entry->swapped_list, &swapped.list); -+ swapped.count++; -+ } else -+ kmem_cache_free(swapped.cache, entry); -+ -+out_locked: -+ spin_unlock_irqrestore(&swapped.lock, flags); -+out: -+ if (!kprefetchd_awake) -+ wake_up_process(kprefetchd_task); -+ return; -+} -+ -+/* -+ * Removes entries from the swapped_list. The radix tree allows us to quickly -+ * look up the entry from the index without having to iterate over the whole -+ * list. -+ */ -+static void remove_from_swapped_list(const unsigned long index) -+{ -+ struct swapped_entry *entry; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ entry = radix_tree_delete(&swapped.swap_tree, index); -+ if (likely(entry)) { -+ list_del(&entry->swapped_list); -+ swapped.count--; -+ kmem_cache_free(swapped.cache, entry); -+ } -+ spin_unlock_irqrestore(&swapped.lock, flags); -+} -+ -+enum trickle_return { -+ TRICKLE_SUCCESS, -+ TRICKLE_FAILED, -+ TRICKLE_DELAY, -+}; -+ -+struct node_stats { -+ /* Free ram after a cycle of prefetching */ -+ unsigned long last_free; -+ /* Free ram on this cycle of checking prefetch_suitable */ -+ unsigned long current_free; -+ /* The amount of free ram before we start prefetching */ -+ unsigned long highfree[MAX_NR_ZONES]; -+ /* The amount of free ram where we will stop prefetching */ -+ unsigned long lowfree[MAX_NR_ZONES]; -+ /* highfree or lowfree depending on whether we've hit a watermark */ -+ unsigned long *pointfree[MAX_NR_ZONES]; -+}; -+ -+/* -+ * prefetch_stats stores the free ram data of each node and this is used to -+ * determine if a node is suitable for prefetching into. -+ */ -+struct prefetch_stats { -+ /* Which nodes are currently suited to prefetching */ -+ nodemask_t prefetch_nodes; -+ /* Total pages we've prefetched on this wakeup of kprefetchd */ -+ unsigned long prefetched_pages; -+ struct node_stats node[MAX_NUMNODES]; -+}; -+ -+static struct prefetch_stats sp_stat; -+ -+/* -+ * This tries to read a swp_entry_t into swap cache for swap prefetching. -+ * If it returns TRICKLE_DELAY we should delay further prefetching. 
-+ */ -+static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, -+ const int node) -+{ -+ enum trickle_return ret = TRICKLE_FAILED; -+ unsigned long flags; -+ struct page *page; -+ -+ read_lock_irqsave(&swapper_space.tree_lock, flags); -+ /* Entry may already exist */ -+ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); -+ read_unlock_irqrestore(&swapper_space.tree_lock, flags); -+ if (page) -+ goto out; -+ -+ /* -+ * Get a new page to read from swap. We have already checked the -+ * watermarks so __alloc_pages will not call on reclaim. -+ */ -+ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); -+ if (unlikely(!page)) { -+ ret = TRICKLE_DELAY; -+ goto out; -+ } -+ -+ if (add_to_swap_cache(page, entry)) { -+ /* Failed to add to swap cache */ -+ goto out_release; -+ } -+ -+ /* Add them to the tail of the inactive list to preserve LRU order */ -+ lru_cache_add_tail(page); -+ if (unlikely(swap_readpage(NULL, page))) -+ goto out_release; -+ -+ sp_stat.prefetched_pages++; -+ sp_stat.node[node].last_free--; -+ -+ ret = TRICKLE_SUCCESS; -+out_release: -+ page_cache_release(page); -+out: -+ /* -+ * All entries are removed here lazily. This avoids the cost of -+ * remove_from_swapped_list during normal swapin. Thus there are -+ * usually many stale entries. -+ */ -+ remove_from_swapped_list(entry.val); -+ return ret; -+} -+ -+static void clear_last_prefetch_free(void) -+{ -+ int node; -+ -+ /* -+ * Reset the nodes suitable for prefetching to all nodes. We could -+ * update the data to take into account memory hotplug if desired.. -+ */ -+ sp_stat.prefetch_nodes = node_online_map; -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ ns->last_free = 0; -+ } -+} -+ -+static void clear_current_prefetch_free(void) -+{ -+ int node; -+ -+ sp_stat.prefetch_nodes = node_online_map; -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ ns->current_free = 0; -+ } -+} -+ -+/* -+ * This updates the high and low watermarks of amount of free ram in each -+ * node used to start and stop prefetching. We prefetch from pages_high * 4 -+ * down to pages_high * 3. -+ */ -+static void examine_free_limits(void) -+{ -+ struct zone *z; -+ -+ for_each_zone(z) { -+ struct node_stats *ns; -+ int idx; -+ -+ if (!populated_zone(z)) -+ continue; -+ -+ ns = &sp_stat.node[zone_to_nid(z)]; -+ idx = zone_idx(z); -+ ns->lowfree[idx] = z->pages_high * 3; -+ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; -+ -+ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { -+ /* -+ * We've gotten above the high watermark of free pages -+ * so we can start prefetching till we get to the low -+ * watermark. -+ */ -+ ns->pointfree[idx] = &ns->lowfree[idx]; -+ } -+ } -+} -+ -+/* -+ * We want to be absolutely certain it's ok to start prefetching. -+ */ -+static enum trickle_return prefetch_suitable(void) -+{ -+ enum trickle_return ret = TRICKLE_DELAY; -+ struct zone *z; -+ int node; -+ -+ /* -+ * If swap_prefetch is set to a high value we can ignore load -+ * and prefetch whenever we can. Otherwise we test for vm and -+ * cpu activity. -+ */ -+ if (swap_prefetch < 3) { -+ /* Purposefully racy, may return false positive */ -+ if (test_bit(0, &swapped.busy)) { -+ __clear_bit(0, &swapped.busy); -+ goto out; -+ } -+ -+ /* -+ * above_background_load is expensive so we only perform it -+ * every SWAP_CLUSTER_MAX prefetched_pages. 
-+ * We test to see if we're above_background_load as disk -+ * activity even at low priority can cause interrupt induced -+ * scheduling latencies. -+ */ -+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) && -+ above_background_load()) -+ goto out; -+ } -+ clear_current_prefetch_free(); -+ -+ /* -+ * Have some hysteresis between where page reclaiming and prefetching -+ * will occur to prevent ping-ponging between them. -+ */ -+ for_each_zone(z) { -+ struct node_stats *ns; -+ unsigned long free; -+ int idx; -+ -+ if (!populated_zone(z)) -+ continue; -+ -+ node = zone_to_nid(z); -+ ns = &sp_stat.node[node]; -+ idx = zone_idx(z); -+ -+ free = zone_page_state(z, NR_FREE_PAGES); -+ if (free < *ns->pointfree[idx]) { -+ /* -+ * Free pages have dropped below the low watermark so -+ * we won't start prefetching again till we hit the -+ * high watermark of free pages. -+ */ -+ ns->pointfree[idx] = &ns->highfree[idx]; -+ node_clear(node, sp_stat.prefetch_nodes); -+ continue; -+ } -+ ns->current_free += free; -+ } -+ -+ /* -+ * We iterate over each node testing to see if it is suitable for -+ * prefetching and clear the nodemask if it is not. -+ */ -+ for_each_node_mask(node, sp_stat.prefetch_nodes) { -+ struct node_stats *ns = &sp_stat.node[node]; -+ -+ /* -+ * We check to see that pages are not being allocated -+ * elsewhere at any significant rate implying any -+ * degree of memory pressure (eg during file reads) -+ */ -+ if (ns->last_free) { -+ if (ns->current_free + SWAP_CLUSTER_MAX < -+ ns->last_free) { -+ ns->last_free = ns->current_free; -+ node_clear(node, -+ sp_stat.prefetch_nodes); -+ continue; -+ } -+ } else -+ ns->last_free = ns->current_free; -+ -+ /* We shouldn't prefetch when we are doing writeback */ -+ if (node_page_state(node, NR_WRITEBACK)) -+ node_clear(node, sp_stat.prefetch_nodes); -+ } -+ -+ /* Nothing suitable, put kprefetchd back to sleep */ -+ if (nodes_empty(sp_stat.prefetch_nodes)) -+ return TRICKLE_FAILED; -+ -+ /* Survived all that? Hooray we can prefetch! */ -+ ret = TRICKLE_SUCCESS; -+out: -+ return ret; -+} -+ -+/* -+ * trickle_swap is the main function that initiates the swap prefetching. It -+ * first checks to see if the busy flag is set, and does not prefetch if it -+ * is, as the flag implied we are low on memory or swapping in currently. -+ * Otherwise it runs until prefetch_suitable fails which occurs when the -+ * vm is busy, we prefetch to the watermark, the list is empty or we have -+ * iterated over all entries once. -+ */ -+static enum trickle_return trickle_swap(void) -+{ -+ enum trickle_return suitable, ret = TRICKLE_DELAY; -+ struct swapped_entry *pos, *n; -+ unsigned long flags; -+ -+ if (!prefetch_enabled()) -+ return ret; -+ -+ examine_free_limits(); -+ suitable = prefetch_suitable(); -+ if (suitable != TRICKLE_SUCCESS) -+ return suitable; -+ if (list_empty(&swapped.list)) { -+ kprefetchd_awake = 0; -+ return TRICKLE_FAILED; -+ } -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { -+ swp_entry_t swp_entry; -+ int node; -+ -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ cond_resched(); -+ suitable = prefetch_suitable(); -+ if (suitable != TRICKLE_SUCCESS) { -+ ret = suitable; -+ goto out_unlocked; -+ } -+ -+ spin_lock_irqsave(&swapped.lock, flags); -+ if (unlikely(!pos)) -+ continue; -+ node = get_swap_entry_node(pos); -+ if (!node_isset(node, sp_stat.prefetch_nodes)) { -+ /* -+ * We found an entry that belongs to a node that is -+ * not suitable for prefetching so skip it. 
-+ */ -+ continue; -+ } -+ swp_entry = pos->swp_entry; -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ -+ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) -+ goto out_unlocked; -+ spin_lock_irqsave(&swapped.lock, flags); -+ } -+ spin_unlock_irqrestore(&swapped.lock, flags); -+ -+out_unlocked: -+ if (sp_stat.prefetched_pages) { -+ lru_add_drain(); -+ sp_stat.prefetched_pages = 0; -+ } -+ return ret; -+} -+ -+static int kprefetchd(void *__unused) -+{ -+ struct sched_param param = { .sched_priority = 0 }; -+ -+ sched_setscheduler(current, SCHED_BATCH, ¶m); -+ set_user_nice(current, 19); -+ /* Set ioprio to lowest if supported by i/o scheduler */ -+ sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); -+ -+ while (!kthread_should_stop()) { -+ try_to_freeze(); -+ -+ if (!kprefetchd_awake) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ schedule(); -+ kprefetchd_awake = 1; -+ } -+ -+ if (trickle_swap() == TRICKLE_FAILED) -+ schedule_timeout_interruptible(PREFETCH_SLEEP); -+ else -+ schedule_timeout_interruptible(PREFETCH_DELAY); -+ clear_last_prefetch_free(); -+ } -+ return 0; -+} -+ -+/* -+ * Create kmem cache for swapped entries -+ */ -+void __init prepare_swap_prefetch(void) -+{ -+ struct zone *zone; -+ -+ swapped.cache = kmem_cache_create("swapped_entry", -+ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); -+ -+ /* -+ * We set the limit to more entries than the physical ram. -+ * We remove entries lazily so we need some headroom. -+ */ -+ swapped.maxcount = nr_free_pagecache_pages() * 2; -+ -+ for_each_zone(zone) { -+ struct node_stats *ns; -+ int idx; -+ -+ if (!populated_zone(zone)) -+ continue; -+ -+ ns = &sp_stat.node[zone_to_nid(zone)]; -+ idx = zone_idx(zone); -+ ns->pointfree[idx] = &ns->highfree[idx]; -+ } -+} -+ -+static int __init kprefetchd_init(void) -+{ -+ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); -+ -+ return 0; -+} -+ -+static void __exit kprefetchd_exit(void) -+{ -+ kthread_stop(kprefetchd_task); -+} -+ -+module_init(kprefetchd_init); -+module_exit(kprefetchd_exit); -Index: linux-2.6.22-ck1/mm/swap_state.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000 -@@ -10,6 +10,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -95,7 +96,7 @@ - return error; - } - --static int add_to_swap_cache(struct page *page, swp_entry_t entry) -+int add_to_swap_cache(struct page *page, swp_entry_t entry) - { - int error; - -@@ -148,6 +149,9 @@ - swp_entry_t entry; - int err; - -+ /* Swap prefetching is delayed if we're swapping pages */ -+ delay_swap_prefetch(); -+ - BUG_ON(!PageLocked(page)); - - for (;;) { -@@ -320,6 +324,9 @@ - struct page *found_page, *new_page = NULL; - int err; - -+ /* Swap prefetching is delayed if we're already reading from swap */ -+ delay_swap_prefetch(); -+ - do { - /* - * First check the swap cache. Since this is normally -Index: linux-2.6.22-ck1/mm/vmscan.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000 -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -36,6 +37,7 @@ - #include - #include - #include -+#include - #include - - #include -@@ -63,7 +65,7 @@ - * whole list at once. 
*/ - int swap_cluster_max; - -- int swappiness; -+ int mapped; - - int all_unreclaimable; - }; -@@ -110,9 +112,10 @@ - #endif - - /* -- * From 0 .. 100. Higher means more swappy. -+ * From 0 .. 100. Lower means more swappy. - */ --int vm_swappiness = 60; -+int vm_mapped __read_mostly = 66; -+int vm_hardmaplimit __read_mostly = 1; - long vm_total_pages; /* The total number of pages which the VM controls */ - - static LIST_HEAD(shrinker_list); -@@ -803,10 +806,14 @@ - * The distress ratio is important - we don't want to start - * going oom. - * -- * A 100% value of vm_swappiness overrides this algorithm -- * altogether. -+ * This distress value is ignored if we apply a hardmaplimit except -+ * in extreme distress. -+ * -+ * A 0% value of vm_mapped overrides this algorithm altogether. - */ -- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; -+ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); -+ if (!vm_hardmaplimit || distress == 100) -+ swap_tendency += distress; - - /* - * Now use this metric to decide whether to start moving mapped -@@ -955,6 +962,41 @@ - } - - /* -+ * Helper functions to adjust nice level of kswapd, based on the priority of -+ * the task (p) that called it. If it is already higher priority we do not -+ * demote its nice level since it is still working on behalf of a higher -+ * priority task. With kernel threads we leave it at nice 0. -+ * -+ * We don't ever run kswapd real time, so if a real time task calls kswapd we -+ * set it to highest SCHED_NORMAL priority. -+ */ -+static int effective_sc_prio(struct task_struct *p) -+{ -+ if (likely(p->mm)) { -+ if (rt_task(p)) -+ return -20; -+ if (idleprio_task(p)) -+ return 19; -+ return task_nice(p); -+ } -+ return 0; -+} -+ -+static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, -+ int active) -+{ -+ long nice = effective_sc_prio(p); -+ -+ if (task_nice(kswapd) > nice || !active) -+ set_user_nice(kswapd, nice); -+} -+ -+static int sc_priority(struct task_struct *p) -+{ -+ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); -+} -+ -+/* - * This is the direct reclaim path, for page-allocating processes. We only - * try to reclaim pages from zones which will satisfy the caller's allocation - * request. -@@ -1011,7 +1053,8 @@ - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. 
- */ --unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) -+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, -+ struct task_struct *p) - { - int priority; - int ret = 0; -@@ -1019,15 +1062,20 @@ - unsigned long nr_reclaimed = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; -- int i; -+ int i, scan_priority = DEF_PRIORITY; - struct scan_control sc = { - .gfp_mask = gfp_mask, - .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - -+ if (p) -+ scan_priority = sc_priority(p); -+ -+ delay_swap_prefetch(); -+ - count_vm_event(ALLOCSTALL); - - for (i = 0; zones[i] != NULL; i++) { -@@ -1040,7 +1088,7 @@ - + zone_page_state(zone, NR_INACTIVE); - } - -- for (priority = DEF_PRIORITY; priority >= 0; priority--) { -+ for (priority = scan_priority; priority >= 0; priority--) { - sc.nr_scanned = 0; - if (!priority) - disable_swap_token(); -@@ -1070,7 +1118,7 @@ - } - - /* Take a nap, wait for some writeback to complete */ -- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) -+ if (sc.nr_scanned && priority < scan_priority - 2) - congestion_wait(WRITE, HZ/10); - } - /* top priority shrink_caches still had more to do? don't OOM, then */ -@@ -1120,9 +1168,9 @@ - */ - static unsigned long balance_pgdat(pg_data_t *pgdat, int order) - { -- int all_zones_ok; -+ int all_zones_ok = 0; - int priority; -- int i; -+ int i, scan_priority; - unsigned long total_scanned; - unsigned long nr_reclaimed; - struct reclaim_state *reclaim_state = current->reclaim_state; -@@ -1130,7 +1178,7 @@ - .gfp_mask = GFP_KERNEL, - .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - /* - * temp_priority is used to remember the scanning priority at which -@@ -1138,6 +1186,8 @@ - */ - int temp_priority[MAX_NR_ZONES]; - -+ scan_priority = sc_priority(pgdat->kswapd); -+ - loop_again: - total_scanned = 0; - nr_reclaimed = 0; -@@ -1145,9 +1195,9 @@ - count_vm_event(PAGEOUTRUN); - - for (i = 0; i < pgdat->nr_zones; i++) -- temp_priority[i] = DEF_PRIORITY; -+ temp_priority[i] = scan_priority; - -- for (priority = DEF_PRIORITY; priority >= 0; priority--) { -+ for (priority = scan_priority; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long lru_pages = 0; - -@@ -1163,15 +1213,22 @@ - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; -+ unsigned long watermark; - - if (!populated_zone(zone)) - continue; - -- if (zone->all_unreclaimable && priority != DEF_PRIORITY) -+ if (zone->all_unreclaimable && priority != scan_priority) - continue; - -- if (!zone_watermark_ok(zone, order, zone->pages_high, -- 0, 0)) { -+ /* -+ * The watermark is relaxed depending on the -+ * level of "priority" till it drops to -+ * pages_high. 
-+ */ -+ watermark = zone->pages_high + (zone->pages_high * -+ priority / scan_priority); -+ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { - end_zone = i; - break; - } -@@ -1198,14 +1255,18 @@ - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - int nr_slab; -+ unsigned long watermark; - - if (!populated_zone(zone)) - continue; - -- if (zone->all_unreclaimable && priority != DEF_PRIORITY) -+ if (zone->all_unreclaimable && priority != scan_priority) - continue; - -- if (!zone_watermark_ok(zone, order, zone->pages_high, -+ watermark = zone->pages_high + (zone->pages_high * -+ priority / scan_priority); -+ -+ if (!zone_watermark_ok(zone, order, watermark, - end_zone, 0)) - all_zones_ok = 0; - temp_priority[i] = priority; -@@ -1238,7 +1299,7 @@ - * OK, kswapd is getting into trouble. Take a nap, then take - * another pass across the zones. - */ -- if (total_scanned && priority < DEF_PRIORITY - 2) -+ if (total_scanned && priority < scan_priority - 2) - congestion_wait(WRITE, HZ/10); - - /* -@@ -1272,6 +1333,8 @@ - return nr_reclaimed; - } - -+#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ -+ - /* - * The background pageout daemon, started as a kernel thread - * from the init process. -@@ -1319,6 +1382,8 @@ - for ( ; ; ) { - unsigned long new_order; - -+ /* kswapd has been busy so delay watermark_timer */ -+ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - new_order = pgdat->kswapd_max_order; - pgdat->kswapd_max_order = 0; -@@ -1332,6 +1397,7 @@ - if (!freezing(current)) - schedule(); - -+ set_user_nice(tsk, 0); - order = pgdat->kswapd_max_order; - } - finish_wait(&pgdat->kswapd_wait, &wait); -@@ -1349,9 +1415,10 @@ - /* - * A zone is low on free memory, so wake its kswapd task to service it. 
- */ --void wakeup_kswapd(struct zone *zone, int order) -+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) - { - pg_data_t *pgdat; -+ int active; - - if (!populated_zone(zone)) - return; -@@ -1363,7 +1430,9 @@ - pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; -- if (!waitqueue_active(&pgdat->kswapd_wait)) -+ active = waitqueue_active(&pgdat->kswapd_wait); -+ set_kswapd_nice(pgdat->kswapd, p, active); -+ if (!active) - return; - wake_up_interruptible(&pgdat->kswapd_wait); - } -@@ -1382,6 +1451,8 @@ - struct zone *zone; - unsigned long nr_to_scan, ret = 0; - -+ delay_swap_prefetch(); -+ - for_each_zone(zone) { - - if (!populated_zone(zone)) -@@ -1441,7 +1512,7 @@ - .may_swap = 0, - .swap_cluster_max = nr_pages, - .may_writepage = 1, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - - current->reclaim_state = &reclaim_state; -@@ -1476,7 +1547,7 @@ - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; -- sc.swappiness = 100; -+ sc.mapped = 0; - } - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { -@@ -1540,20 +1611,57 @@ - } - - /* -+ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots -+ */ -+static void watermark_wakeup(unsigned long data) -+{ -+ pg_data_t *pgdat = (pg_data_t *)data; -+ struct timer_list *wt = &pgdat->watermark_timer; -+ int i; -+ -+ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) -+ goto out; -+ for (i = pgdat->nr_zones - 1; i >= 0; i--) { -+ struct zone *z = pgdat->node_zones + i; -+ -+ if (!populated_zone(z) || is_highmem(z)) { -+ /* We are better off leaving highmem full */ -+ continue; -+ } -+ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { -+ wake_up_interruptible(&pgdat->kswapd_wait); -+ goto out; -+ } -+ } -+out: -+ mod_timer(wt, jiffies + WT_EXPIRY); -+ return; -+} -+ -+/* - * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
- */ - int kswapd_run(int nid) - { - pg_data_t *pgdat = NODE_DATA(nid); -+ struct timer_list *wt; - int ret = 0; - - if (pgdat->kswapd) - return 0; - -+ wt = &pgdat->watermark_timer; -+ init_timer(wt); -+ wt->data = (unsigned long)pgdat; -+ wt->function = watermark_wakeup; -+ wt->expires = jiffies + WT_EXPIRY; -+ add_timer(wt); -+ - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ -+ del_timer(wt); - BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); - ret = -1; -@@ -1624,7 +1732,7 @@ - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, -- .swappiness = vm_swappiness, -+ .mapped = vm_mapped, - }; - unsigned long slab_reclaimable; - -Index: linux-2.6.22-ck1/include/linux/mm_inline.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000 -@@ -13,6 +13,13 @@ - } - - static inline void -+add_page_to_inactive_list_tail(struct zone *zone, struct page *page) -+{ -+ list_add_tail(&page->lru, &zone->inactive_list); -+ __inc_zone_state(zone, NR_INACTIVE); -+} -+ -+static inline void - del_page_from_active_list(struct zone *zone, struct page *page) - { - list_del(&page->lru); -Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000 -@@ -0,0 +1,53 @@ -+#ifndef SWAP_PREFETCH_H_INCLUDED -+#define SWAP_PREFETCH_H_INCLUDED -+ -+#ifdef CONFIG_SWAP_PREFETCH -+/* mm/swap_prefetch.c */ -+extern int swap_prefetch; -+extern int swap_prefetch_delay; -+extern int swap_prefetch_sleep; -+ -+struct swapped_entry { -+ swp_entry_t swp_entry; /* The actual swap entry */ -+ struct list_head swapped_list; /* Linked list of entries */ -+#if MAX_NUMNODES > 1 -+ int node; /* Node id */ -+#endif -+} __attribute__((packed)); -+ -+static inline void store_swap_entry_node(struct swapped_entry *entry, -+ struct page *page) -+{ -+#if MAX_NUMNODES > 1 -+ entry->node = page_to_nid(page); -+#endif -+} -+ -+static inline int get_swap_entry_node(struct swapped_entry *entry) -+{ -+#if MAX_NUMNODES > 1 -+ return entry->node; -+#else -+ return 0; -+#endif -+} -+ -+extern void add_to_swapped_list(struct page *page); -+extern void delay_swap_prefetch(void); -+extern void prepare_swap_prefetch(void); -+ -+#else /* CONFIG_SWAP_PREFETCH */ -+static inline void add_to_swapped_list(struct page *__unused) -+{ -+} -+ -+static inline void prepare_swap_prefetch(void) -+{ -+} -+ -+static inline void delay_swap_prefetch(void) -+{ -+} -+#endif /* CONFIG_SWAP_PREFETCH */ -+ -+#endif /* SWAP_PREFETCH_H_INCLUDED */ -Index: linux-2.6.22-ck1/mm/page_io.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000 -+++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000 -@@ -17,6 +17,7 @@ - #include - #include - #include -+#include - #include - - static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, -@@ -118,6 +119,7 @@ - ret = -ENOMEM; - goto out; - } -+ add_to_swapped_list(page); - if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); - count_vm_event(PSWPOUT); -Index: 
linux-2.6.22-ck1/include/linux/sysctl.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000 -@@ -190,7 +190,7 @@ - VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ - VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ - VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ -- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ -+ VM_UNUSED19=19, /* was: Tendency to steal mapped memory */ - VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ - VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ - VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ -Index: linux-2.6.22-ck1/include/linux/mmzone.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000 -@@ -13,6 +13,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -181,7 +182,7 @@ - - struct zone { - /* Fields commonly accessed by the page allocator */ -- unsigned long pages_min, pages_low, pages_high; -+ unsigned long pages_min, pages_low, pages_high, pages_lots; - /* - * We don't know if the memory that we're going to allocate will be freeable - * or/and it will be released eventually, so to avoid totally wasting several -@@ -452,6 +453,7 @@ - wait_queue_head_t kswapd_wait; - struct task_struct *kswapd; - int kswapd_max_order; -+ struct timer_list watermark_timer; - } pg_data_t; - - #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) -@@ -468,7 +470,7 @@ - void get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free); - void build_all_zonelists(void); --void wakeup_kswapd(struct zone *zone, int order); -+void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); - int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); - enum memmap_context { -Index: linux-2.6.22-ck1/mm/page_alloc.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000 -@@ -1250,7 +1250,7 @@ - goto nopage; - - for (z = zonelist->zones; *z; z++) -- wakeup_kswapd(*z, order); -+ wakeup_kswapd(*z, order, p); - - /* - * OK, we're below the kswapd watermark and have kicked background -@@ -1314,7 +1314,7 @@ - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - -- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); -+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); - - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; -@@ -1570,6 +1570,7 @@ - " min:%lukB" - " low:%lukB" - " high:%lukB" -+ " lots:%lukB" - " active:%lukB" - " inactive:%lukB" - " present:%lukB" -@@ -1581,6 +1582,7 @@ - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), -+ K(zone->pages_lots), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), - K(zone->present_pages), -@@ -3142,6 +3144,7 @@ - - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); -+ zone->pages_lots = zone->pages_min + tmp; - 
spin_unlock_irqrestore(&zone->lru_lock, flags); - } - -Index: linux-2.6.22-ck1/fs/buffer.c -=================================================================== ---- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000 -@@ -356,7 +356,7 @@ - for_each_online_pgdat(pgdat) { - zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; - if (*zones) -- try_to_free_pages(zones, GFP_NOFS); -+ try_to_free_pages(zones, GFP_NOFS, NULL); - } - } - -Index: linux-2.6.22-ck1/mm/filemap.c -=================================================================== ---- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000 -@@ -466,6 +466,16 @@ - return ret; - } - -+int add_to_page_cache_lru_tail(struct page *page, -+ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) -+{ -+ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); -+ -+ if (ret == 0) -+ lru_cache_add_tail(page); -+ return ret; -+} -+ - #ifdef CONFIG_NUMA - struct page *__page_cache_alloc(gfp_t gfp) - { -@@ -839,6 +849,34 @@ - ra->ra_pages /= 4; - } - -+/* -+ * Sysctl which determines whether we should read from large files to the -+ * tail of the inactive lru list. -+ */ -+int vm_tail_largefiles __read_mostly = 1; -+ -+static inline int nr_mapped(void) -+{ -+ return global_page_state(NR_FILE_MAPPED) + -+ global_page_state(NR_ANON_PAGES); -+} -+ -+/* -+ * This examines how large in pages a file size is and returns 1 if it is -+ * more than half the unmapped ram. Avoid doing read_page_state which is -+ * expensive unless we already know it is likely to be large enough. -+ */ -+static int large_isize(unsigned long nr_pages) -+{ -+ if (nr_pages * 6 > vm_total_pages) { -+ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); -+ -+ if (nr_pages * 2 > unmapped_ram) -+ return 1; -+ } -+ return 0; -+} -+ - /** - * do_generic_mapping_read - generic file read routine - * @mapping: address_space to be read -@@ -1051,8 +1089,19 @@ - goto out; - } - } -- error = add_to_page_cache_lru(cached_page, mapping, -- index, GFP_KERNEL); -+ -+ /* -+ * If we know the file is large we add the pages read to the -+ * end of the lru as we're unlikely to be able to cache the -+ * whole file in ram so make those pages the first to be -+ * dropped if not referenced soon. -+ */ -+ if (vm_tail_largefiles && large_isize(end_index)) -+ error = add_to_page_cache_lru_tail(cached_page, -+ mapping, index, GFP_KERNEL); -+ else -+ error = add_to_page_cache_lru(cached_page, mapping, -+ index, GFP_KERNEL); - if (error) { - if (error == -EEXIST) - goto find_page; -Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt -=================================================================== ---- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000 -@@ -1333,6 +1333,14 @@ - As this is a non-destructive operation and dirty objects are not freeable, the - user should run `sync' first. - -+tail_largefiles -+--------------- -+ -+When enabled reads from large files to the tail end of the inactive lru list. -+This means that any cache from reading large files is dropped very quickly, -+preventing loss of mapped ram and useful pagecache when large files are read. -+This does, however, make caching less effective when working with large files. 
-+ - - 2.5 /proc/sys/dev - Device specific parameters - ---------------------------------------------- -Index: linux-2.6.22-ck1/arch/i386/Kconfig -=================================================================== ---- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000 -@@ -550,7 +550,7 @@ - - choice - depends on EXPERIMENTAL -- prompt "Memory split" if EMBEDDED -+ prompt "Memory split" - default VMSPLIT_3G - help - Select the desired split between kernel and user memory. -@@ -569,17 +569,17 @@ - option alone! - - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !HIGHMEM -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !HIGHMEM -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -Index: linux-2.6.22-ck1/kernel/Kconfig.hz -=================================================================== ---- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000 -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_1000 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -13,8 +13,7 @@ - contention and cacheline bounces as a result of timer interrupts. - Note that the timer interrupt occurs on each processor in an SMP - environment leading to NR_CPUS * HZ number of timer interrupts -- per second. -- -+ per second.Laptops may also show improved battery life. - - config HZ_100 - bool "100 HZ" -@@ -23,13 +22,14 @@ - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEFAULT - bool "250 HZ" - help -- 250 Hz is a good compromise choice allowing server performance -- while also showing good interactive responsiveness even -- on SMP and NUMA systems. If you are going to be using NTSC video -- or multimedia, selected 300Hz instead. -+ 250 HZ is a lousy compromise choice allowing server interactivity -+ while also showing desktop throughput and no extra power saving on -+ laptops. Good for when you can't make up your mind. -+ -+ Recommend 100 or 1000 instead. - - config HZ_300 - bool "300 HZ" -@@ -45,12 +45,76 @@ - 1000 Hz is the preferred choice for desktop systems and other - systems requiring fast interactive responses to events. - -+ config HZ_1500 -+ bool "1500 HZ" -+ help -+ 1500 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_2000 -+ bool "2000 HZ" -+ help -+ 2000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_3000 -+ bool "3000 HZ" -+ help -+ 3000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. 
-+ -+ config HZ_4000 -+ bool "4000 HZ" -+ help -+ 4000 Hz is an insane value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_5000 -+ bool "5000 HZ" -+ help -+ 5000 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_7500 -+ bool "7500 HZ" -+ help -+ 7500 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ config HZ_10000 -+ bool "10000 HZ" -+ help -+ 10000 Hz is an obscene value to use to run broken software that is Hz -+ limited. -+ -+ Being over 1000, driver breakage is likely. -+ -+ - endchoice - - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -+ default 250 if HZ_250_NODEFAULT - default 300 if HZ_300 - default 1000 if HZ_1000 -+ default 1500 if HZ_1500 -+ default 2000 if HZ_2000 -+ default 3000 if HZ_3000 -+ default 4000 if HZ_4000 -+ default 5000 if HZ_5000 -+ default 7500 if HZ_7500 -+ default 10000 if HZ_10000 - -Index: linux-2.6.22-ck1/arch/i386/defconfig -=================================================================== ---- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000 -@@ -226,10 +226,10 @@ - # CONFIG_IRQBALANCE is not set - CONFIG_SECCOMP=y - # CONFIG_HZ_100 is not set --CONFIG_HZ_250=y -+# CONFIG_HZ_250 is not set - # CONFIG_HZ_300 is not set --# CONFIG_HZ_1000 is not set --CONFIG_HZ=250 -+CONFIG_HZ_1000=y -+CONFIG_HZ=1000 - # CONFIG_KEXEC is not set - # CONFIG_CRASH_DUMP is not set - CONFIG_PHYSICAL_START=0x100000 -Index: linux-2.6.22-ck1/arch/x86_64/defconfig -=================================================================== ---- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000 -@@ -185,10 +185,10 @@ - CONFIG_SECCOMP=y - # CONFIG_CC_STACKPROTECTOR is not set - # CONFIG_HZ_100 is not set --CONFIG_HZ_250=y -+# CONFIG_HZ_250 is not set - # CONFIG_HZ_300 is not set --# CONFIG_HZ_1000 is not set --CONFIG_HZ=250 -+CONFIG_HZ_1000=y -+CONFIG_HZ=1000 - CONFIG_K8_NB=y - CONFIG_GENERIC_HARDIRQS=y - CONFIG_GENERIC_IRQ_PROBE=y -Index: linux-2.6.22-ck1/include/linux/jiffies.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000 -@@ -29,6 +29,12 @@ - # define SHIFT_HZ 9 - #elif HZ >= 768 && HZ < 1536 - # define SHIFT_HZ 10 -+#elif HZ >= 1536 && HZ < 3072 -+# define SHIFT_HZ 11 -+#elif HZ >= 3072 && HZ < 6144 -+# define SHIFT_HZ 12 -+#elif HZ >= 6144 && HZ < 12288 -+# define SHIFT_HZ 13 - #else - # error You lose. - #endif -Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000 -@@ -38,8 +38,8 @@ - * If time > 4sec, it is "slow" path, no recycling is required, - * so that we select tick to get range about 4 seconds. 
- */ --#if HZ <= 16 || HZ > 4096 --# error Unsupported: HZ <= 16 or HZ > 4096 -+#if HZ <= 16 || HZ > 16384 -+# error Unsupported: HZ <= 16 or HZ > 16384 - #elif HZ <= 32 - # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #elif HZ <= 64 -@@ -54,8 +54,12 @@ - # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #elif HZ <= 2048 - # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) --#else -+#elif HZ <= 4096 - # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -+#elif HZ <= 8192 -+# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -+#else -+# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) - #endif - - /* TIME_WAIT reaping mechanism. */ -Index: linux-2.6.22-ck1/init/calibrate.c -=================================================================== ---- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000 -@@ -122,12 +122,12 @@ - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), -- (loops_per_jiffy/(5000/HZ)) % 100); -+ (loops_per_jiffy * 10/(50000/HZ)) % 100); - } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. "); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), -- (loops_per_jiffy/(5000/HZ)) % 100, -+ (loops_per_jiffy * 10/(50000/HZ)) % 100, - loops_per_jiffy); - } else { - loops_per_jiffy = (1<<12); -@@ -166,7 +166,7 @@ - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), -- (loops_per_jiffy/(5000/HZ)) % 100, -+ (loops_per_jiffy * 10/(50000/HZ)) % 100, - loops_per_jiffy); - } - -Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c -=================================================================== ---- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000 -@@ -157,7 +157,7 @@ - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), -- (c->loops_per_jiffy/(5000/HZ)) % 100); -+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); - seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); - - return 0; -Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c -=================================================================== ---- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000 -@@ -1094,7 +1094,7 @@ - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), -- (bogosum/(5000/HZ))%100); -+ (bogosum * 10/(50000/HZ))%100); - - Dprintk("Before bogocount - setting activated=1.\n"); - -Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h -=================================================================== ---- linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000 -@@ -35,8 +35,8 @@ - - }; - --/* thread usage wraps very million seconds (approx one fortnight) */ --#define NFSD_USAGE_WRAP (HZ*1000000) -+/* thread usage wraps every one hundred thousand seconds (approx one day) */ -+#define NFSD_USAGE_WRAP (HZ*100000) - - #ifdef __KERNEL__ - -Index: 
linux-2.6.22-ck1/arch/x86_64/kernel/setup.c -=================================================================== ---- linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000 -@@ -1047,7 +1047,7 @@ - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), -- (c->loops_per_jiffy/(5000/HZ)) % 100); -+ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); -Index: linux-2.6.22-ck1/Makefile -=================================================================== ---- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000 -+++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000 -@@ -1,8 +1,9 @@ - VERSION = 2 - PATCHLEVEL = 6 - SUBLEVEL = 22 --EXTRAVERSION = --NAME = Holy Dancing Manatees, Batman! -+EXTRAVERSION = -ck1 -+NAME = So long, and thanks for all the fish -+JANAME = さようなら、いままで魚をありがとう - - # *DOCUMENTATION* - # To see a list of typical targets execute "make help" diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix index a235ccd5fb93..1fedeccedb8b 100644 --- a/pkgs/top-level/all-packages.nix +++ b/pkgs/top-level/all-packages.nix @@ -6224,64 +6224,6 @@ let [(getConfig ["kernel" "addConfig"] "")]; }; - kernel_2_6_21_ck = import ../os-specific/linux/kernel/linux-2.6.21_ck.nix { - inherit fetchurl stdenv perl mktemp module_init_tools; - kernelPatches = [ - { name = "ext3cow"; - patch = ../os-specific/linux/kernel/linux-2.6.20.3-ext3cow.patch; - extraConfig = - "CONFIG_EXT3COW_FS=m\n" + - "CONFIG_EXT3COW_FS_XATTR=y\n" + - "CONFIG_EXT3COW_FS_POSIX_ACL=y\n" + - "CONFIG_EXT3COW_FS_SECURITY=y\n"; - } - { name = "Con Kolivas Patch"; - patch = ../os-specific/linux/kernel/patch-2.6.21-ck1; - } - { name = "paravirt-nvidia"; - patch = ../os-specific/linux/kernel/2.6.20-paravirt-nvidia.patch; - } - { name = "skas-2.6.20-v9-pre9"; - patch = fetchurl { - url = http://www.user-mode-linux.org/~blaisorblade/patches/skas3-2.6/skas-2.6.20-v9-pre9/skas-2.6.20-v9-pre9.patch.bz2; - md5 = "02e619e5b3aaf0f9768f03ac42753e74"; - }; - extraConfig = - "CONFIG_PROC_MM=y\n" + - "# CONFIG_PROC_MM_DUMPABLE is not set\n"; - } - { name = "fbsplash-0.9.2-r5-2.6.21"; - patch = fetchurl { - url = http://dev.gentoo.org/~dsd/genpatches/trunk/2.6.21/4200_fbsplash-0.9.2-r5.patch; - sha256 = "00s8074fzsly2zpir885zqkvq267qyzg6vhsn7n1z2v1z78avxd8"; - }; - extraConfig = "CONFIG_FB_SPLASH=y"; - } - ]; - }; - - kernel_2_6_25 = import ../os-specific/linux/kernel/linux-2.6.25.nix { - inherit fetchurl stdenv perl mktemp module_init_tools; - kernelPatches = [ - { name = "fbcondecor-0.9.4-2.6.25-rc6"; - patch = fetchurl { - url = http://dev.gentoo.org/~spock/projects/fbcondecor/archive/fbcondecor-0.9.4-2.6.25-rc6.patch; - sha256 = "1wm94n7f0qyb8xvafip15r158z5pzw7zb7q8hrgddb092c6ibmq8"; - }; - extraConfig = "CONFIG_FB_CON_DECOR=y"; - features = { fbConDecor = true; }; - } - { name = "sec_perm-2.6.24"; - patch = ../os-specific/linux/kernel/sec_perm-2.6.24.patch; - features = { secPermPatch = true; }; - } - ]; - extraConfig = - lib.optional (getConfig ["kernel" "timer_stats"] false) "CONFIG_TIMER_STATS=y" ++ - lib.optional (getConfig ["kernel" "no_irqbalance"] false) "# CONFIG_IRQBALANCE is not set" ++ - [(getConfig ["kernel" "addConfig"] "")]; - }; - kernel_2_6_23 = import ../os-specific/linux/kernel/linux-2.6.23.nix { inherit fetchurl stdenv perl mktemp module_init_tools; kernelPatches = 
[ @@ -6326,6 +6268,28 @@ let [(getConfig ["kernel" "addConfig"] "")]; }; + kernel_2_6_25 = import ../os-specific/linux/kernel/linux-2.6.25.nix { + inherit fetchurl stdenv perl mktemp module_init_tools; + kernelPatches = [ + { name = "fbcondecor-0.9.4-2.6.25-rc6"; + patch = fetchurl { + url = http://dev.gentoo.org/~spock/projects/fbcondecor/archive/fbcondecor-0.9.4-2.6.25-rc6.patch; + sha256 = "1wm94n7f0qyb8xvafip15r158z5pzw7zb7q8hrgddb092c6ibmq8"; + }; + extraConfig = "CONFIG_FB_CON_DECOR=y"; + features = { fbConDecor = true; }; + } + { name = "sec_perm-2.6.24"; + patch = ../os-specific/linux/kernel/sec_perm-2.6.24.patch; + features = { secPermPatch = true; }; + } + ]; + extraConfig = + lib.optional (getConfig ["kernel" "timer_stats"] false) "CONFIG_TIMER_STATS=y" ++ + lib.optional (getConfig ["kernel" "no_irqbalance"] false) "# CONFIG_IRQBALANCE is not set" ++ + [(getConfig ["kernel" "addConfig"] "")]; + }; + kernel_2_6_26 = import ../os-specific/linux/kernel/linux-2.6.26.nix { inherit fetchurl stdenv perl mktemp module_init_tools; kernelPatches = [ @@ -6508,9 +6472,8 @@ let # The current default kernel / kernel modules. kernelPackages = kernelPackages_2_6_25; - #kernel = kernelPackages.kernel; - customKernel = composedArgsAndFun (lib.sumTwoArgs (import ../os-specific/linux/kernel/linux.nix) { + customKernel = composedArgsAndFun (lib.sumTwoArgs (import ../os-specific/linux/kernel/generic.nix) { inherit fetchurl stdenv perl mktemp module_init_tools lib; });