[ Upstream commit 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b ] There is a race between adding and removing elements to the tree mod log list and rbtree that can lead to use-after-free problems. Consider the following example that explains how/why the problems happens: 1) Task A has mod log element with sequence number 200. It currently is the only element in the mod log list; 2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to access the tree mod log. When it enters the function, it initializes 'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock' before checking if there are other elements in the mod seq list. Since the list it empty, 'min_seq' remains set to (u64)-1. Then it unlocks the lock 'tree_mod_seq_lock'; 3) Before task A acquires the lock 'tree_mod_log_lock', task B adds itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a sequence number of 201; 4) Some other task, name it task C, modifies a btree and because there elements in the mod seq list, it adds a tree mod elem to the tree mod log rbtree. That node added to the mod log rbtree is assigned a sequence number of 202; 5) Task B, which is doing fiemap and resolving indirect back references, calls btrfs get_old_root(), with 'time_seq' == 201, which in turn calls tree_mod_log_search() - the search returns the mod log node from the rbtree with sequence number 202, created by task C; 6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating the mod log rbtree and finds the node with sequence number 202. Since 202 is less than the previously computed 'min_seq', (u64)-1, it removes the node and frees it; 7) Task B still has a pointer to the node with sequence number 202, and it dereferences the pointer itself and through the call to __tree_mod_log_rewind(), resulting in a use-after-free problem. This issue can be triggered sporadically with the test case generic/561 from fstests, and it happens more frequently with a higher number of duperemove processes. When it happens to me, it either freezes the VM or it produces a trace like the following before crashing: [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1 [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 [ 1245.321287] RIP: 0010:rb_next+0x16/0x50 [ 1245.321307] Code: .... [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202 [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80 [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000 [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038 [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8 [ 1245.321539] FS: 00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000 [ 1245.321591] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0 [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1245.321706] Call Trace: [ 1245.321798] __tree_mod_log_rewind+0xbf/0x280 [btrfs] [ 1245.321841] btrfs_search_old_slot+0x105/0xd00 [btrfs] [ 1245.321877] resolve_indirect_refs+0x1eb/0xc60 [btrfs] [ 1245.321912] find_parent_nodes+0x3dc/0x11b0 [btrfs] [ 1245.321947] btrfs_check_shared+0x115/0x1c0 [btrfs] [ 1245.321980] ? extent_fiemap+0x59d/0x6d0 [btrfs] [ 1245.322029] extent_fiemap+0x59d/0x6d0 [btrfs] [ 1245.322066] do_vfs_ioctl+0x45a/0x750 [ 1245.322081] ksys_ioctl+0x70/0x80 [ 1245.322092] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 1245.322113] __x64_sys_ioctl+0x16/0x20 [ 1245.322126] do_syscall_64+0x5c/0x280 [ 1245.322139] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1245.322155] RIP: 0033:0x7fdee3942dd7 [ 1245.322177] Code: .... [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7 [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004 [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44 [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48 [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50 [ 1245.322423] Modules linked in: .... [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]--- Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum sequence number and iterates the rbtree while holding the lock 'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock' lock, since it is now redundant. Fixes: bd989ba359f2ac ("Btrfs: add tree modification log functions") Fixes: 097b8a7c9e48e2 ("Btrfs: join tree mod log code with the code holding back delayed refs") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Sasha Levin <sashal@kernel.org>
171 lines
4.4 KiB
C
171 lines
4.4 KiB
C
/*
|
|
* Copyright (C) 2013 Fusion IO. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License v2 as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 021110-1307, USA.
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/magic.h>
|
|
#include "btrfs-tests.h"
|
|
#include "../ctree.h"
|
|
#include "../volumes.h"
|
|
#include "../disk-io.h"
|
|
#include "../qgroup.h"
|
|
|
|
static struct vfsmount *test_mnt = NULL;
|
|
|
|
static const struct super_operations btrfs_test_super_ops = {
|
|
.alloc_inode = btrfs_alloc_inode,
|
|
.destroy_inode = btrfs_test_destroy_inode,
|
|
};
|
|
|
|
static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
|
|
int flags, const char *dev_name,
|
|
void *data)
|
|
{
|
|
return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
|
|
NULL, BTRFS_TEST_MAGIC);
|
|
}
|
|
|
|
static struct file_system_type test_type = {
|
|
.name = "btrfs_test_fs",
|
|
.mount = btrfs_test_mount,
|
|
.kill_sb = kill_anon_super,
|
|
};
|
|
|
|
struct inode *btrfs_new_test_inode(void)
|
|
{
|
|
return new_inode(test_mnt->mnt_sb);
|
|
}
|
|
|
|
int btrfs_init_test_fs(void)
|
|
{
|
|
int ret;
|
|
|
|
ret = register_filesystem(&test_type);
|
|
if (ret) {
|
|
printk(KERN_ERR "btrfs: cannot register test file system\n");
|
|
return ret;
|
|
}
|
|
|
|
test_mnt = kern_mount(&test_type);
|
|
if (IS_ERR(test_mnt)) {
|
|
printk(KERN_ERR "btrfs: cannot mount test file system\n");
|
|
unregister_filesystem(&test_type);
|
|
return ret;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void btrfs_destroy_test_fs(void)
|
|
{
|
|
kern_unmount(test_mnt);
|
|
unregister_filesystem(&test_type);
|
|
}
|
|
|
|
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
|
|
{
|
|
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
|
|
GFP_NOFS);
|
|
|
|
if (!fs_info)
|
|
return fs_info;
|
|
fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
|
|
GFP_NOFS);
|
|
if (!fs_info->fs_devices) {
|
|
kfree(fs_info);
|
|
return NULL;
|
|
}
|
|
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
|
|
GFP_NOFS);
|
|
if (!fs_info->super_copy) {
|
|
kfree(fs_info->fs_devices);
|
|
kfree(fs_info);
|
|
return NULL;
|
|
}
|
|
|
|
if (init_srcu_struct(&fs_info->subvol_srcu)) {
|
|
kfree(fs_info->fs_devices);
|
|
kfree(fs_info->super_copy);
|
|
kfree(fs_info);
|
|
return NULL;
|
|
}
|
|
|
|
spin_lock_init(&fs_info->buffer_lock);
|
|
spin_lock_init(&fs_info->qgroup_lock);
|
|
spin_lock_init(&fs_info->qgroup_op_lock);
|
|
spin_lock_init(&fs_info->super_lock);
|
|
spin_lock_init(&fs_info->fs_roots_radix_lock);
|
|
mutex_init(&fs_info->qgroup_ioctl_lock);
|
|
mutex_init(&fs_info->qgroup_rescan_lock);
|
|
rwlock_init(&fs_info->tree_mod_log_lock);
|
|
fs_info->running_transaction = NULL;
|
|
fs_info->qgroup_tree = RB_ROOT;
|
|
fs_info->qgroup_ulist = NULL;
|
|
atomic64_set(&fs_info->tree_mod_seq, 0);
|
|
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
|
|
INIT_LIST_HEAD(&fs_info->dead_roots);
|
|
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
|
|
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
|
|
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
|
|
return fs_info;
|
|
}
|
|
|
|
static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
|
|
{
|
|
struct radix_tree_iter iter;
|
|
void **slot;
|
|
|
|
spin_lock(&fs_info->buffer_lock);
|
|
restart:
|
|
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
|
|
struct extent_buffer *eb;
|
|
|
|
eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
|
|
if (!eb)
|
|
continue;
|
|
/* Shouldn't happen but that kind of thinking creates CVE's */
|
|
if (radix_tree_exception(eb)) {
|
|
if (radix_tree_deref_retry(eb))
|
|
goto restart;
|
|
continue;
|
|
}
|
|
spin_unlock(&fs_info->buffer_lock);
|
|
free_extent_buffer_stale(eb);
|
|
spin_lock(&fs_info->buffer_lock);
|
|
}
|
|
spin_unlock(&fs_info->buffer_lock);
|
|
|
|
btrfs_free_qgroup_config(fs_info);
|
|
btrfs_free_fs_roots(fs_info);
|
|
cleanup_srcu_struct(&fs_info->subvol_srcu);
|
|
kfree(fs_info->super_copy);
|
|
kfree(fs_info->fs_devices);
|
|
kfree(fs_info);
|
|
}
|
|
|
|
void btrfs_free_dummy_root(struct btrfs_root *root)
|
|
{
|
|
if (!root)
|
|
return;
|
|
if (root->node)
|
|
free_extent_buffer(root->node);
|
|
if (root->fs_info)
|
|
btrfs_free_dummy_fs_info(root->fs_info);
|
|
kfree(root);
|
|
}
|
|
|