author     Sage Weil <sage@newdream.net>    2009-12-01 14:59:57 -0800
committer  Sage Weil <sage@newdream.net>    2009-12-01 16:19:19 -0800
commit     c280aace5325a5af7f05a54ad104fda0a84264ad (patch)
tree       5272750b7b6f8c2f529294972d5ba0a0f496f1d2
parent     8bc7d2262f6f254acbe05a9ed0a07de782bd33c6 (diff)
download   ceph-c280aace5325a5af7f05a54ad104fda0a84264ad.tar.gz
todo
the lockdep warning was probably due to the leak of osdc->request_mutex?
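For context on that hypothesis: a "leaked" request_mutex would mean some path locks it and never unlocks it, after which every later locker parks in __mutex_lock_slowpath forever. That is exactly the shape of the two hung-task traces deleted below (events/0 stuck under ceph_monc_request_next_osdmap, cp stuck under ceph_osdc_start_request), and the kernel reports it as "task ... blocked for more than 120 seconds". A minimal userspace sketch of that bug pattern, with hypothetical names rather than the actual kclient code:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t request_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical sketch of the suspected bug shape: an error path
 * that returns with the mutex still held ("leaking" the lock). */
static int start_request(int fail)
{
	pthread_mutex_lock(&request_mutex);
	if (fail)
		return -EIO;	/* BUG: early return, no unlock */
	/* ... submit the request ... */
	pthread_mutex_unlock(&request_mutex);
	return 0;
}

int main(void)
{
	start_request(1);	/* error path leaks request_mutex */

	/* Every later mutex_lock() would now block forever, like the
	 * hung tasks below; trylock lets this demo report the stuck
	 * lock instead of hanging. */
	if (pthread_mutex_trylock(&request_mutex) != 0)
		puts("request_mutex still held: leaked on an error path");
	return 0;
}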
-rw-r--r--  src/TODO  176
1 file changed, 3 insertions, 173 deletions
@@ -59,6 +59,8 @@ v0.18
 - auth: 'none' security framework
 - mon: "safely" bail on write errors (e.g. ENOSPC)
 - mds: fix replay/reconnect race (caused (fast) client reconnect to fail)
+- mds: misc journal replay, session fixes
+
 - mount btrfs by UUID?
@@ -82,180 +84,7 @@ bugs
 - kclient: after reconnect, cp: writing `/c/ceph2.2/bin/gs-gpl': Bad file descriptor
 - need to somehow wake up unreconnected caps? hrm!!
-- kclient: ~300 (306, 311) second delay before able to reconnect to restarted monitor???
 - kclient: socket creation
-- kclient: leak of osdc->request_mutex?
-[ 80 3585.080000] events/0 D 00000000421fb690 0 5 2 0x00000000
-[ 80 3585.080000] 60356f18 7082df00 7084dcd0 7084c000 7084dcf0 60013d64 7084dcf0 60028eaa
-[ 80 3585.080000] 7084c000 7019b180 7084dd40 60266517 7084dd20 6004be47 7084c000 7084c000
-[ 80 3585.080000] 70204730 00000001 7082df00 70204750 7084ddb0 60266d82 686373c0 70204750
-[ 80 3585.080000] Call Trace:
-[ 80 3585.080000] 7084dcc8: [<60013d64>] _switch_to+0x5e/0xae
-[ 80 3585.080000] 7084dcd8: [<60028eaa>] deactivate_task+0x28/0x30
-[ 80 3585.080000] 7084dcf8: [<60266517>] schedule+0x23a/0x280
-[ 80 3585.080000] 7084dd08: [<6004be47>] debug_mutex_free_waiter+0x4d/0x51
-[ 80 3585.080000] 7084dd48: [<60266d82>] __mutex_lock_slowpath+0x129/0x21d
-[ 80 3585.080000] 7084dda8: [<60198e18>] handle_timeout+0x0/0x2b4
-[ 80 3585.080000] 7084ddb8: [<602673ee>] mutex_lock+0x25/0x3a
-[ 80 3585.080000] 7084ddc8: [<60197461>] ceph_monc_request_next_osdmap+0x64/0x96
-[ 80 3585.080000] 7084dde8: [<60198ebf>] handle_timeout+0xa7/0x2b4
-[ 80 3585.080000] 7084de48: [<60198e18>] handle_timeout+0x0/0x2b4
-[ 80 3585.080000] 7084de58: [<600408df>] worker_thread+0xff/0x18f
-[ 80 3585.080000] 7084de80: [<60043d14>] autoremove_wake_function+0x0/0x38
-[ 80 3585.080000] 7084dec0: [<600407e0>] worker_thread+0x0/0x18f
-[ 80 3585.080000] 7084ded8: [<60043a6b>] kthread+0x91/0x99
-[ 80 3585.080000] 7084df48: [<60021c09>] run_kernel_thread+0x41/0x4a
-[ 80 3585.080000] 7084df58: [<600439da>] kthread+0x0/0x99
-[ 80 3585.080000] 7084df98: [<60021bf0>] run_kernel_thread+0x28/0x4a
-[ 80 3585.080000] 7084dfc8: [<60013cdc>] new_thread_handler+0x72/0x9c
-[ 80 3585.080000]
-[ 80 3585.080000] INFO: task cp:1267 blocked for more than 120 seconds.
-[ 80 3585.080000] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 80 3585.080000] cp D 00000000421fb690 0 1267 1235 0x00000000
-[ 80 3585.080000] 60356f18 70868000 68637340 68636000 68637360 60013d64 68637360 60028eaa
-[ 80 3585.080000] 68636000 7019b180 686373b0 60266517 5a00000000024050 68636000 68636000 68636000
-[ 80 3585.080000] 70204730 00000001 70868000 70204750 68637420 60266d82 70204750 7084dd50
-[ 80 3585.080000] Call Trace:
-[ 80 3585.080000] 68637338: [<60013d64>] _switch_to+0x5e/0xae
-[ 80 3585.080000] 68637348: [<60028eaa>] deactivate_task+0x28/0x30
-[ 80 3585.080000] 68637368: [<60266517>] schedule+0x23a/0x280
-[ 80 3585.080000] 686373b8: [<60266d82>] __mutex_lock_slowpath+0x129/0x21d
-[ 80 3585.080000] 68637428: [<602673ee>] mutex_lock+0x25/0x3a
-[ 80 3585.080000] 68637438: [<602680c0>] _spin_unlock_irqrestore+0x18/0x1c
-[ 80 3585.080000] 68637458: [<60199f3e>] ceph_osdc_start_request+0x53/0x297
-[ 80 3585.080000] 68637498: [<6017f624>] ceph_writepages_start+0xabd/0x1072
-[ 80 3585.080000] 68637568: [<60028133>] arch_prctl+0xee/0x157
-[ 80 3585.080000] 68637598: [<602680a6>] _spin_unlock_irq+0xe/0x10
-[ 80 3585.080000] 686375a8: [<6002c824>] finish_task_switch+0x42/0x88
-[ 80 3585.080000] 68637638: [<6005f6e9>] do_writepages+0x1f/0x28
-[ 80 3585.080000] 68637648: [<6009ac38>] writeback_single_inode+0xe6/0x23f
-[ 80 3585.080000] 68637688: [<6009b7a4>] writeback_inodes_wb+0x359/0x3e9
-[ 80 3585.080000] 686376a8: [<6005f9e3>] get_dirty_limits+0x1e7/0x219
-[ 80 3585.080000] 68637728: [<6009b9ec>] writeback_inodes_wbc+0x19/0x1b
-[ 80 3585.080000] 68637738: [<6005fbc7>] balance_dirty_pages_ratelimited_nr+0x133/0x256
-[ 80 3585.080000] 686377f8: [<6005a4f9>] generic_file_buffered_write+0x22f/0x2b7
-[ 80 3585.080000] 686378d8: [<6005aa60>] __generic_file_aio_write+0x38d/0x3cd
-[ 80 3585.080000] 68637998: [<6005ab01>] generic_file_aio_write+0x61/0xa9
-[ 80 3585.080000] 686379d8: [<6017bc67>] ceph_aio_write+0x679/0x998
-[ 80 3585.080000] 68637a38: [<6005b08d>] generic_file_aio_read+0x544/0x5ec
-[ 80 3585.080000] 68637ae0: [<60016abf>] copy_chunk_to_user+0x0/0x22
-[ 80 3585.080000] 68637b18: [<600803cc>] do_sync_write+0xf4/0x139
-[ 80 3585.080000] 68637b88: [<60043d14>] autoremove_wake_function+0x0/0x38
-[ 80 3585.080000] 68637c58: [<60080e28>] vfs_write+0xb8/0x181
-[ 80 3585.080000] 68637c98: [<60080fb5>] sys_write+0x47/0x6f
-[ 80 3585.080000] 68637cd8: [<60016549>] handle_syscall+0x59/0x70
-[ 80 3585.080000] 68637cf8: [<60025863>] userspace+0x3c0/0x465
-[ 80 3585.080000] 68637fc8: [<60013c63>] fork_handler+0x62/0x69
-
-- kclient lockdep warning:
-[3272340.634840] =======================================================
-[3272340.636809] [ INFO: possible circular locking dependency detected ]
-[3272340.636809] 2.6.32-rc2 #1
-[3272340.636809] -------------------------------------------------------
-[3272340.636809] fstest/22237 is trying to acquire lock:
-[3272340.636809] (&osdc->request_mutex){+.+...}, at: [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809]
-[3272340.636809] but task is already holding lock:
-[3272340.636809] (&mm->mmap_sem){++++++}, at: [<ffffffff81029923>] do_page_fault+0x10a/0x27e
-[3272340.636809]
-[3272340.636809] which lock already depends on the new lock.
-[3272340.636809]
-[3272340.636809]
-[3272340.636809] the existing dependency chain (in reverse order) is:
-[3272340.636809]
-[3272340.636809] -> #3 (&mm->mmap_sem){++++++}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146ba8a>] down_read+0x48/0x7c
-[3272340.636809] [<ffffffff8102995a>] do_page_fault+0x141/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
-[3272340.636809] [<ffffffff813d1b3d>] skb_copy_datagram_iovec+0xaa/0x264
-[3272340.636809] [<ffffffff814089e2>] tcp_rcv_established+0x240/0x941
-[3272340.636809] [<ffffffff8140f9e1>] tcp_v4_do_rcv+0x31/0x1d7
-[3272340.636809] [<ffffffff813ff397>] tcp_prequeue_process+0x9c/0xb4
-[3272340.636809] [<ffffffff8140063e>] tcp_recvmsg+0x495/0x90b
-[3272340.636809] [<ffffffff813c9b90>] sock_common_recvmsg+0x32/0x47
-[3272340.636809] [<ffffffff813c8014>] sock_recvmsg+0x10e/0x133
-[3272340.636809] [<ffffffff813c92fb>] sys_recvfrom+0xa3/0xf8
-[3272340.636809] [<ffffffff8100baab>] system_call_fastpath+0x16/0x1b
-[3272340.636809]
-[3272340.636809] -> #2 (sk_lock-AF_INET){+.+.+.}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff813ca8be>] lock_sock_nested+0xea/0xfe
-[3272340.636809] [<ffffffff8141ca81>] inet_stream_connect+0x2b/0x259
-[3272340.636809] [<ffffffffa00a7c5a>] con_work+0x369/0x1607 [ceph]
-[3272340.636809] [<ffffffff8105b1ee>] worker_thread+0x283/0x398
-[3272340.636809] [<ffffffff8105f114>] kthread+0x7d/0x85
-[3272340.636809] [<ffffffff8100cb1a>] child_rip+0xa/0x20
-[3272340.636809]
-[3272340.636809] -> #1 (&con->out_mutex){+.+.+.}:
-[3272340.636809] [<ffffffff8107017e>] __lock_acquire+0x148c/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00a901e>] ceph_con_send+0xa3/0x235 [ceph]
-[3272340.636809] [<ffffffffa00b33c0>] __send_request+0x113/0x1e9 [ceph]
-[3272340.636809] [<ffffffffa00b4863>] ceph_osdc_start_request+0x1aa/0x2bd [ceph]
-[3272340.636809] [<ffffffffa0098873>] ceph_writepages_start+0xacd/0x10cd [ceph]
-[3272340.636809] [<ffffffff810ab799>] do_writepages+0x1f/0x28
-[3272340.636809] [<ffffffff810f5c22>] writeback_single_inode+0xea/0x24c
-[3272340.636809] [<ffffffff810f68b8>] writeback_inodes_wb+0x363/0x3fb
-[3272340.636809] [<ffffffff810f6a83>] wb_writeback+0x133/0x1b3
-[3272340.636809] [<ffffffff810f6d76>] wb_do_writeback+0x1c3/0x1d9
-[3272340.636809] [<ffffffff810f6dc6>] bdi_writeback_task+0x3a/0xa9
-[3272340.636809] [<ffffffff810bacab>] bdi_start_fn+0x71/0xce
-[3272340.636809] [<ffffffff8105f114>] kthread+0x7d/0x85
-[3272340.636809] [<ffffffff8100cb1a>] child_rip+0xa/0x20
-[3272340.636809]
-[3272340.636809] -> #0 (&osdc->request_mutex){+.+...}:
-[3272340.636809] [<ffffffff8106fea3>] __lock_acquire+0x11b1/0x17e3
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b5598>] ceph_osdc_readpages+0x123/0x217 [ceph]
-[3272340.636809] [<ffffffffa0099006>] ceph_readpages+0x193/0x459 [ceph]
-[3272340.636809] [<ffffffff810ac66b>] __do_page_cache_readahead+0x184/0x1fa
-[3272340.636809] [<ffffffff810ac6fd>] ra_submit+0x1c/0x20
-[3272340.636809] [<ffffffff810aca62>] ondemand_readahead+0x284/0x297
-[3272340.636809] [<ffffffff810acaed>] page_cache_async_readahead+0x78/0x84
-[3272340.636809] [<ffffffff810a4821>] filemap_fault+0xbc/0x37b
-[3272340.636809] [<ffffffff810bc27a>] __do_fault+0x54/0x454
-[3272340.636809] [<ffffffff810be519>] handle_mm_fault+0x392/0x72b
-[3272340.636809] [<ffffffff81029a7d>] do_page_fault+0x264/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
-[3272340.636809]
-[3272340.636809] other info that might help us debug this:
-[3272340.636809]
-[3272340.636809] 1 lock held by fstest/22237:
-[3272340.636809] #0: (&mm->mmap_sem){++++++}, at: [<ffffffff81029923>] do_page_fault+0x10a/0x27e
-[3272340.636809]
-[3272340.636809] stack backtrace:
-[3272340.636809] Pid: 22237, comm: fstest Not tainted 2.6.32-rc2 #1
-[3272340.636809] Call Trace:
-[3272340.636809] [<ffffffff8106e796>] print_circular_bug+0xb3/0xc2
-[3272340.636809] [<ffffffff8106fea3>] __lock_acquire+0x11b1/0x17e3
-[3272340.636809] [<ffffffff8125c1e4>] ? sprintf+0x68/0x6a
-[3272340.636809] [<ffffffff810705c5>] lock_acquire+0xf0/0x10d
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffff8146b4f8>] mutex_lock_nested+0x6c/0x32a
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b4711>] ? ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b4711>] ceph_osdc_start_request+0x58/0x2bd [ceph]
-[3272340.636809] [<ffffffffa00b5598>] ceph_osdc_readpages+0x123/0x217 [ceph]
-[3272340.636809] [<ffffffffa0099006>] ceph_readpages+0x193/0x459 [ceph]
-[3272340.636809] [<ffffffff810ac66b>] __do_page_cache_readahead+0x184/0x1fa
-[3272340.636809] [<ffffffff810ac570>] ? __do_page_cache_readahead+0x89/0x1fa
-[3272340.636809] [<ffffffff810ac6fd>] ra_submit+0x1c/0x20
-[3272340.636809] [<ffffffff810aca62>] ondemand_readahead+0x284/0x297
-[3272340.636809] [<ffffffff810acaed>] page_cache_async_readahead+0x78/0x84
-[3272340.636809] [<ffffffff810a3f39>] ? find_get_page+0x0/0x11d
-[3272340.636809] [<ffffffff810a4821>] filemap_fault+0xbc/0x37b
-[3272340.636809] [<ffffffff810bc27a>] __do_fault+0x54/0x454
-[3272340.636809] [<ffffffff8106a3b9>] ? get_lock_stats+0x19/0x4c
-[3272340.636809] [<ffffffff8106a964>] ? put_lock_stats+0xe/0x27
-[3272340.636809] [<ffffffff810be519>] handle_mm_fault+0x392/0x72b
-[3272340.636809] [<ffffffff81029a7d>] do_page_fault+0x264/0x27e
-[3272340.636809] [<ffffffff8146d43f>] page_fault+0x1f/0x30
@@ -357,6 +186,7 @@ uclient
 - hadoop: clean up assert usage
 
 mds
+- don't sync log on every clientreplay request?
 - pass issued, wanted into eval(lock) when eval() already has it? (and otherwise optimize eval paths..)
 - add an up:shadow mode?
 - tail the mds log as it is written
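The lockdep report removed above encodes a cycle over four locks: the faulting task holds mm->mmap_sem and then wants osdc->request_mutex (chain #0), while the writeback path takes osdc->request_mutex, then con->out_mutex, then the socket lock, and TCP receive can fault on user pages, i.e. take mmap_sem (chains #1 through #3). Collapsed to two locks this is the classic AB-BA inversion. A hedged userspace sketch of the pattern lockdep is complaining about, with stand-in names rather than the ceph code:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the two ends of the cycle in the report above:
 * "mmap_sem" for the page-fault path, "request_mutex" for the
 * OSD client.  Hypothetical demo, not the kernel code. */
static pthread_mutex_t mmap_sem = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t request_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Like chain #0: page fault -> readahead -> start request.
 * Order: mmap_sem, then request_mutex. */
static void fault_path(void)
{
	pthread_mutex_lock(&mmap_sem);
	pthread_mutex_lock(&request_mutex);
	puts("fault path: mmap_sem -> request_mutex");
	pthread_mutex_unlock(&request_mutex);
	pthread_mutex_unlock(&mmap_sem);
}

/* Like chains #1-#3: writeback holds request_mutex while socket
 * I/O can fault on user pages and take mmap_sem.
 * Order: request_mutex, then mmap_sem: the inversion. */
static void writeback_path(void)
{
	pthread_mutex_lock(&request_mutex);
	pthread_mutex_lock(&mmap_sem);
	puts("writeback path: request_mutex -> mmap_sem");
	pthread_mutex_unlock(&mmap_sem);
	pthread_mutex_unlock(&request_mutex);
}

int main(void)
{
	/* Run sequentially so the demo terminates.  If two threads ran
	 * these concurrently, each could win its first lock and block
	 * on the other's: the deadlock lockdep is warning about. */
	fault_path();
	writeback_path();
	return 0;
}

lockdep records the order in which each lock pair is taken and warns as soon as it observes both orders, so the warning fires even if the two paths never actually race; the usual fix is to restructure one path so it does not hold its first lock while acquiring the second.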