author    Sage Weil <sage@newdream.net>  2009-12-01 14:59:57 -0800
committer Sage Weil <sage@newdream.net>  2009-12-01 16:19:19 -0800
commit    c280aace5325a5af7f05a54ad104fda0a84264ad (patch)
tree      5272750b7b6f8c2f529294972d5ba0a0f496f1d2
parent    8bc7d2262f6f254acbe05a9ed0a07de782bd33c6 (diff)
download  ceph-c280aace5325a5af7f05a54ad104fda0a84264ad.tar.gz
todo
the lockdep was probably due to the leak of request_mutex?
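For context on the hunch above: a "leak" of osdc->request_mutex would be an error path that returns while the mutex is still held, so every later mutex_lock() on it blocks forever, which is consistent with both the hung-task and lockdep reports deleted below. A minimal, hypothetical C sketch of that bug pattern follows; struct osdc_sketch, start_request_buggy/_fixed, and the map_missing flag are invented for illustration and are not the actual ceph code.

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/types.h>

/* Invented stand-in for struct ceph_osd_client; illustration only. */
struct osdc_sketch {
	struct mutex request_mutex;
};

/* Buggy: the early error return leaves request_mutex held ("leaked"). */
static int start_request_buggy(struct osdc_sketch *osdc, bool map_missing)
{
	mutex_lock(&osdc->request_mutex);
	if (map_missing)
		return -EAGAIN;	/* BUG: request_mutex is never unlocked */
	/* ... register and send the request ... */
	mutex_unlock(&osdc->request_mutex);
	return 0;
}

/* Fixed: a single unlock covers every exit path. */
static int start_request_fixed(struct osdc_sketch *osdc, bool map_missing)
{
	int err = 0;

	mutex_lock(&osdc->request_mutex);
	if (map_missing)
		err = -EAGAIN;
	/* ... otherwise register and send the request ... */
	mutex_unlock(&osdc->request_mutex);
	return err;
}

Once one caller leaks the mutex this way, the next timeout or writeback worker blocks in mutex_lock() forever, which is exactly the state the hung-task dumps below capture (handle_timeout and cp both stuck in __mutex_lock_slowpath).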
-rw-r--r--  src/TODO  176
1 file changed, 3 insertions(+), 173 deletions(-)
diff --git a/src/TODO b/src/TODO
index 0485a6054cb..7a4ed673a02 100644
--- a/src/TODO
+++ b/src/TODO
@@ -59,6 +59,8 @@ v0.18
 - auth: 'none' security framework
 - mon: "safely" bail on write errors (e.g. ENOSPC)
 - mds: fix replay/reconnect race (caused (fast) client reconnect to fail)
+- mds: misc journal replay, session fixes
+
 - mount btrfs by UUID?
@@ -82,180 +84,7 @@ bugs
 - kclient: after reconnect,
     cp: writing `/c/ceph2.2/bin/gs-gpl': Bad file descriptor
   - need to somehow wake up unreconnected caps? hrm!!
-- kclient: ~300 (306, 311) second delay before able to reconnect to restarted monitor???
 - kclient: socket creation
-- kclient: leak of osdc->request_mutex?
-[ 80 3585.080000] events/0 D 00000000421fb690 0 5 2 0x00000000
-[ 80 3585.080000] 60356f18 7082df00 7084dcd0 7084c000 7084dcf0 60013d64 7084dcf0 60028eaa
-[ 80 3585.080000] 7084c000 7019b180 7084dd40 60266517 7084dd20 6004be47 7084c000 7084c000
-[ 80 3585.080000] 70204730 00000001 7082df00 70204750 7084ddb0 60266d82 686373c0 70204750
-[ 80 3585.080000] Call Trace:
-[ 80 3585.080000] 7084dcc8: [<60013d64>] _switch_to+0x5e/0xae
-[ 80 3585.080000] 7084dcd8: [<60028eaa>] deactivate_task+0x28/0x30
-[ 80 3585.080000] 7084dcf8: [<60266517>] schedule+0x23a/0x280
-[ 80 3585.080000] 7084dd08: [<6004be47>] debug_mutex_free_waiter+0x4d/0x51
-[ 80 3585.080000] 7084dd48: [<60266d82>] __mutex_lock_slowpath+0x129/0x21d
-[ 80 3585.080000] 7084dda8: [<60198e18>] handle_timeout+0x0/0x2b4
-[ 80 3585.080000] 7084ddb8: [<602673ee>] mutex_lock+0x25/0x3a
-[ 80 3585.080000] 7084ddc8: [<60197461>] ceph_monc_request_next_osdmap+0x64/0x96
-[ 80 3585.080000] 7084dde8: [<60198ebf>] handle_timeout+0xa7/0x2b4
-[ 80 3585.080000] 7084de48: [<60198e18>] handle_timeout+0x0/0x2b4
-[ 80 3585.080000] 7084de58: [<600408df>] worker_thread+0xff/0x18f
-[ 80 3585.080000] 7084de80: [<60043d14>] autoremove_wake_function+0x0/0x38
-[ 80 3585.080000] 7084dec0: [<600407e0>] worker_thread+0x0/0x18f
-[ 80 3585.080000] 7084ded8: [<60043a6b>] kthread+0x91/0x99
-[ 80 3585.080000] 7084df48: [<60021c09>] run_kernel_thread+0x41/0x4a
-[ 80 3585.080000] 7084df58: [<600439da>] kthread+0x0/0x99
-[ 80 3585.080000] 7084df98: [<60021bf0>] run_kernel_thread+0x28/0x4a
-[ 80 3585.080000] 7084dfc8: [<60013cdc>] new_thread_handler+0x72/0x9c
-[ 80 3585.080000]
-[ 80 3585.080000] INFO: task cp:1267 blocked for more than 120 seconds.
-[ 80 3585.080000] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 80 3585.080000] cp D 00000000421fb690 0 1267 1235 0x00000000
-[ 80 3585.080000] 60356f18 70868000 68637340 68636000 68637360 60013d64 68637360 60028eaa
-[ 80 3585.080000] 68636000 7019b180 686373b0 60266517 5a00000000024050 68636000 68636000 68636000
-[ 80 3585.080000] 70204730 00000001 70868000 70204750 68637420 60266d82 70204750 7084dd50
-[ 80 3585.080000] Call Trace:
-[ 80 3585.080000] 68637338: [<60013d64>] _switch_to+0x5e/0xae
-[ 80 3585.080000] 68637348: [<60028eaa>] deactivate_task+0x28/0x30
-[ 80 3585.080000] 68637368: [<60266517>] schedule+0x23a/0x280
-[ 80 3585.080000] 686373b8: [<60266d82>] __mutex_lock_slowpath+0x129/0x21d
-[ 80 3585.080000] 68637428: [<602673ee>] mutex_lock+0x25/0x3a
-[ 80 3585.080000] 68637438: [<602680c0>] _spin_unlock_irqrestore+0x18/0x1c
-[ 80 3585.080000] 68637458: [<60199f3e>] ceph_osdc_start_request+0x53/0x297
-[ 80 3585.080000] 68637498: [<6017f624>] ceph_writepages_start+0xabd/0x1072
-[ 80 3585.080000] 68637568: [<60028133>] arch_prctl+0xee/0x157
-[ 80 3585.080000] 68637598: [<602680a6>] _spin_unlock_irq+0xe/0x10
-[ 80 3585.080000] 686375a8: [<6002c824>] finish_task_switch+0x42/0x88
-[ 80 3585.080000] 68637638: [<6005f6e9>] do_writepages+0x1f/0x28
-[ 80 3585.080000] 68637648: [<6009ac38>] writeback_single_inode+0xe6/0x23f
-[ 80 3585.080000] 68637688: [<6009b7a4>] writeback_inodes_wb+0x359/0x3e9
-[ 80 3585.080000] 686376a8: [<6005f9e3>] get_dirty_limits+0x1e7/0x219
-[ 80 3585.080000] 68637728: [<6009b9ec>] writeback_inodes_wbc+0x19/0x1b
-[ 80 3585.080000] 68637738: [<6005fbc7>] balance_dirty_pages_ratelimited_nr+0x133/0x256
-[ 80 3585.080000] 686377f8: [<6005a4f9>] generic_file_buffered_write+0x22f/0x2b7
-[ 80 3585.080000] 686378d8: [<6005aa60>] __generic_file_aio_write+0x38d/0x3cd
-[ 80 3585.080000] 68637998: [<6005ab01>] generic_file_aio_write+0x61/0xa9
-[ 80 3585.080000] 686379d8: [<6017bc67>] ceph_aio_write+0x679/0x998
-[ 80 3585.080000] 68637a38: [<6005b08d>] generic_file_aio_read+0x544/0x5ec
-[ 80 3585.080000] 68637ae0: [<60016abf>] copy_chunk_to_user+0x0/0x22
-[ 80 3585.080000] 68637b18: [<600803cc>] do_sync_write+0xf4/0x139
-[ 80 3585.080000] 68637b88: [<60043d14>] autoremove_wake_function+0x0/0x38
-[ 80 3585.080000] 68637c58: [<60080e28>] vfs_write+0xb8/0x181
-[ 80 3585.080000] 68637c98: [<60080fb5>] sys_write+0x47/0x6f
-[ 80 3585.080000] 68637cd8: [<60016549>] handle_syscall+0x59/0x70
-[ 80 3585.080000] 68637cf8: [<60025863>] userspace+0x3c0/0x465
-[ 80 3585.080000] 68637fc8: [<60013c63>] fork_handler+0x62/0x69
-
-- kclient lockdep warning:
-[3272340.634840] =======================================================
-[3272340.636809] [ INFO: possible circular locking dependency detected ]
-[3272340.636809] 2.6.32-rc2 #1
-[3272340.636809] -------------------------------------------------------
-[3272340.636809] fstest/22237 is trying to acquire lock:
-[3272340.636809] (&osdc->request_mutex){+.+...}, at: [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809]
-[3272340.636809] but task is already holding lock:
-[3272340.636809] (&mm->mmap_sem){++++++}, at: [<ffffffff81029923>] do_page_fault+0x10a/0x27e
-[3272340.636809]
-[3272340.636809] which lock already depends on the new lock.
-[3272340.636809]
-[3272340.636809]
-[3272340.636809] the existing dependency chain (in reverse order) is:
-[3272340.636809]
-[3272340.636809] -> #3 (&mm->mmap_sem){++++++}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146ba8a>] down_read+0x48/0x7c
-[3272340.636809] [<ffffffff8102995a>] do_page_fault+0x141/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
-[3272340.636809] [<ffffffff813d1b3d>] skb_copy_datagram_iovec+0xaa/0x264
-[3272340.636809] [<ffffffff814089e2>] tcp_rcv_established+0x240/0x941
-[3272340.636809] [<ffffffff8140f9e1>] tcp_v4_do_rcv+0x31/0x1d7
-[3272340.636809] [<ffffffff813ff397>] tcp_prequeue_process+0x9c/0xb4
-[3272340.636809] [<ffffffff8140063e>] tcp_recvmsg+0x495/0x90b
-[3272340.636809] [<ffffffff813c9b90>] sock_common_recvmsg+0x32/0x47
-[3272340.636809] [<ffffffff813c8014>] sock_recvmsg+0x10e/0x133
-[3272340.636809] [<ffffffff813c92fb>] sys_recvfrom+0xa3/0xf8
-[3272340.636809] [<ffffffff8100baab>] system_call_fastpath+0x16/0x1b
-[3272340.636809]
-[3272340.636809] -> #2 (sk_lock-AF_INET){+.+.+.}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff813ca8be>] lock_sock_nested+0xea/0xfe
-[3272340.636809] [<ffffffff8141ca81>] inet_stream_connect+0x2b/0x259
-[3272340.636809] [<ffffffffa00a7c5a>] con_work+0x369/0x1607 [ceph]
-[3272340.636809] [<ffffffff8105b1ee>] worker_thread+0x283/0x398
-[3272340.636809] [<ffffffff8105f114>] kthread+0x7d/0x85
-[3272340.636809] [<ffffffff8100cb1a>] child_rip+0xa/0x20
-[3272340.636809]
-[3272340.636809] -> #1 (&con->out_mutex){+.+.+.}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00a901e>] ceph_con_send+0xa3/0x235 [ceph]
-[3272340.636809] [<ffffffffa00b33c0>] __send_request+0x113/0x1e9 [ceph]
-[3272340.636809] [<ffffffffa00b4863>] ceph_osdc_start_request+0x1aa/0x2bd [ceph]
-[3272340.636809] [<ffffffffa0098873>] ceph_writepages_start+0xacd/0x10cd [ceph]
-[3272340.636809] [<ffffffff810ab799>] do_writepages+0x1f/0x28
-[3272340.636809] [<ffffffff810f5c22>] writeback_single_inode+0xea/0x24c
-[3272340.636809] [<ffffffff810f68b8>] writeback_inodes_wb+0x363/0x3fb
-[3272340.636809] [<ffffffff810f6a83>] wb_writeback+0x133/0x1b3
-[3272340.636809] [<ffffffff810f6d76>] wb_do_writeback+0x1c3/0x1d9
-[3272340.636809] [<ffffffff810f6dc6>] bdi_writeback_task+0x3a/0xa9
-[3272340.636809] [<ffffffff810bacab>] bdi_start_fn+0x71/0xce
-[3272340.636809] [<ffffffff8105f114>] kthread+0x7d/0x85
-[3272340.636809] [<ffffffff8100cb1a>] child_rip+0xa/0x20
-[3272340.636809]
-[3272340.636809] -> #0 (&osdc->request_mutex){+.+...}:
-[3272340.636809] [<ffffffff8106fea3>] __lock_acquire+0x11b1/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b5598>] ceph_osdc_readpages+0x123/0x217 [ceph]
-[3272340.636809] [<ffffffffa0099006>] ceph_readpages+0x193/0x459 [ceph]
-[3272340.636809] [<ffffffff810ac66b>] __do_page_cache_readahead+0x184/0x1fa
-[3272340.636809] [<ffffffff810ac6fd>] ra_submit+0x1c/0x20
-[3272340.636809] [<ffffffff810aca62>] ondemand_readahead+0x284/0x297
-[3272340.636809] [<ffffffff810acaed>] page_cache_async_readahead+0x78/0x84
-[3272340.636809] [<ffffffff810a4821>] filemap_fault+0xbc/0x37b
-[3272340.636809] [<ffffffff810bc27a>] __do_fault+0x54/0x454
-[3272340.636809] [<ffffffff810be519>] handle_mm_fault+0x392/0x72b
-[3272340.636809] [<ffffffff81029a7d>] do_page_fault+0x264/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
-[3272340.636809]
-[3272340.636809] other info that might help us debug this:
-[3272340.636809]
-[3272340.636809] 1 lock held by fstest/22237:
-[3272340.636809] #0: (&mm->mmap_sem){++++++}, at: [<ffffffff81029923>] do_page_fault+0x10a/0x27e
-[3272340.636809]
-[3272340.636809] stack backtrace:
-[3272340.636809] Pid: 22237, comm: fstest Not tainted 2.6.32-rc2 #1
-[3272340.636809] Call Trace:
-[3272340.636809] [<ffffffff8106e796>] print_circular_bug+0xb3/0xc2
-[3272340.636809] [<ffffffff8106fea3>] __lock_acquire+0x11b1/0x17e3
-[3272340.636809] [<ffffffff8125c1e4>] ? sprintf+0x68/0x6a
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b5598>] ceph_osdc_readpages+0x123/0x217 [ceph]
-[3272340.636809] [<ffffffffa0099006>] ceph_readpages+0x193/0x459 [ceph]
-[3272340.636809] [<ffffffff810ac66b>] __do_page_cache_readahead+0x184/0x1fa
-[3272340.636809] [<ffffffff810ac570>] ? __do_page_cache_readahead+0x89/0x1fa
-[3272340.636809] [<ffffffff810ac6fd>] ra_submit+0x1c/0x20
-[3272340.636809] [<ffffffff810aca62>] ondemand_readahead+0x284/0x297
-[3272340.636809] [<ffffffff810acaed>] page_cache_async_readahead+0x78/0x84
-[3272340.636809] [<ffffffff810a3f39>] ? find_get_page+0x0/0x11d
-[3272340.636809] [<ffffffff810a4821>] filemap_fault+0xbc/0x37b
-[3272340.636809] [<ffffffff810bc27a>] __do_fault+0x54/0x454
-[3272340.636809] [<ffffffff8106a3b9>] ? get_lock_stats+0x19/0x4c
-[3272340.636809] [<ffffffff8106a964>] ? put_lock_stats+0xe/0x27
-[3272340.636809] [<ffffffff810be519>] handle_mm_fault+0x392/0x72b
-[3272340.636809] [<ffffffff81029a7d>] do_page_fault+0x264/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
@@ -357,6 +186,7 @@ uclient
 - hadoop: clean up assert usage

 mds
+- don't sync log on every clientreplay request?
 - pass issued, wanted into eval(lock) when eval() already has it? (and otherwise optimize eval paths..)
 - add an up:shadow mode?
 - tail the mds log as it is written
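A note on the lockdep report deleted above: its dependency chain reduces to a classic ABBA inversion between two code paths. The send/writeback path orders request_mutex before mmap_sem (request_mutex -> con->out_mutex -> sk_lock-AF_INET -> mmap_sem, the last step via skb_copy_datagram_iovec faulting in user pages), while the page-fault/readahead path orders mmap_sem before request_mutex (do_page_fault -> ceph_readpages -> ceph_osdc_start_request). The userspace sketch below reduces that cycle to two pthread mutexes; all names are invented stand-ins for the kernel locks (mmap_sem is really an rwsem), and with this interleaving the program deadlocks, which is the outcome lockdep exists to flag before it happens.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t request_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mmap_sem      = PTHREAD_MUTEX_INITIALIZER;

/* Models writeback/send: request_mutex -> ... -> mmap_sem. */
static void *send_path(void *arg)
{
	pthread_mutex_lock(&request_mutex);	/* ceph_osdc_start_request */
	sleep(1);				/* let the fault path take mmap_sem */
	pthread_mutex_lock(&mmap_sem);		/* fault while copying to user pages */
	pthread_mutex_unlock(&mmap_sem);
	pthread_mutex_unlock(&request_mutex);
	return NULL;
}

/* Models page fault/readahead: mmap_sem -> request_mutex. */
static void *fault_path(void *arg)
{
	pthread_mutex_lock(&mmap_sem);		/* do_page_fault */
	sleep(1);
	pthread_mutex_lock(&request_mutex);	/* ceph_readpages -> start_request */
	pthread_mutex_unlock(&request_mutex);
	pthread_mutex_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, send_path, NULL);
	pthread_create(&b, NULL, fault_path, NULL);
	puts("both paths started; with this lock ordering they deadlock");
	pthread_join(a, NULL);	/* never returns */
	pthread_join(b, NULL);
	return 0;
}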