summaryrefslogtreecommitdiff
path: root/rts/Stats.c
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2012-07-04 10:34:48 +0100
committerSimon Marlow <marlowsd@gmail.com>2012-07-04 10:34:48 +0100
commit99fd2469fba1a38b2a65b4694f337d92e559df01 (patch)
tree20491590ccb07223afd9d1f6a6546213b0f43577 /rts/Stats.c
parentd260d919eef22654b1af61334feed0545f64cea5 (diff)
parent0d19922acd724991b7b97871b1404f3db5058b49 (diff)
downloadhaskell-99fd2469fba1a38b2a65b4694f337d92e559df01.tar.gz
Merge remote-tracking branch 'origin/master' into newcg
* origin/master: (756 commits) don't crash if argv[0] == NULL (#7037) -package P was loading all versions of P in GHCi (#7030) Add a Note, copying text from #2437 improve the --help docs a bit (#7008) Copy Data.HashTable's hashString into our Util module Build fix Build fixes Parse error: suggest brackets and indentation. Don't build the ghc DLL on Windows; works around trac #5987 On Windows, detect if DLLs have too many symbols; trac #5987 Add some more Integer rules; fixes #6111 Fix PA dfun construction with silent superclass args Add silent superclass parameters to the vectoriser Add silent superclass parameters (again) Mention Generic1 in the user's guide Make the GHC API a little more powerful. tweak llvm version warning message New version of the patch for #5461. Fix Word64ToInteger conversion rule. Implemented feature request on reconfigurable pretty-printing in GHCi (#5461) ... Conflicts: compiler/basicTypes/UniqSupply.lhs compiler/cmm/CmmBuildInfoTables.hs compiler/cmm/CmmLint.hs compiler/cmm/CmmOpt.hs compiler/cmm/CmmPipeline.hs compiler/cmm/CmmStackLayout.hs compiler/cmm/MkGraph.hs compiler/cmm/OldPprCmm.hs compiler/codeGen/CodeGen.lhs compiler/codeGen/StgCmm.hs compiler/codeGen/StgCmmBind.hs compiler/codeGen/StgCmmLayout.hs compiler/codeGen/StgCmmUtils.hs compiler/main/CodeOutput.lhs compiler/main/HscMain.hs compiler/nativeGen/AsmCodeGen.lhs compiler/simplStg/SimplStg.lhs
Diffstat (limited to 'rts/Stats.c')
-rw-r--r--rts/Stats.c163
1 files changed, 115 insertions, 48 deletions
diff --git a/rts/Stats.c b/rts/Stats.c
index 83c43f0bdd..b12cb769f7 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -40,7 +40,7 @@ static StgWord64 GC_tot_alloc = 0;
static StgWord64 GC_tot_copied = 0;
static StgWord64 GC_par_max_copied = 0;
-static StgWord64 GC_par_avg_copied = 0;
+static StgWord64 GC_par_tot_copied = 0;
#ifdef PROFILING
static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
@@ -140,7 +140,7 @@ initStats0(void)
GC_tot_alloc = 0;
GC_tot_copied = 0;
GC_par_max_copied = 0;
- GC_par_avg_copied = 0;
+ GC_par_tot_copied = 0;
GC_tot_cpu = 0;
#ifdef PROFILING
@@ -256,7 +256,7 @@ stat_endExit(void)
static nat rub_bell = 0;
void
-stat_startGC (gc_thread *gct)
+stat_startGC (Capability *cap, gc_thread *gct)
{
nat bell = RtsFlags.GcFlags.ringBell;
@@ -278,7 +278,16 @@ stat_startGC (gc_thread *gct)
#endif
getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
- gct->gc_start_thread_cpu = getThreadCPUTime();
+
+ // Post EVENT_GC_START with the same timestamp as used for stats
+ // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
+ // Here, as opposed to other places, the event is emitted on the cap
+ // that initiates the GC and external tools expect it to have the same
+ timestamp as used in +RTS -s calculations.
+ traceEventGcStartAtT(cap,
+ TimeToNS(gct->gc_start_elapsed - start_init_elapsed));
+
+ gct->gc_start_thread_cpu = getThreadCPUTime();
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
{
@@ -287,18 +296,29 @@ stat_startGC (gc_thread *gct)
}
void
-stat_gcWorkerThreadStart (gc_thread *gct)
+stat_gcWorkerThreadStart (gc_thread *gct STG_UNUSED)
{
+#if 0
+ /*
+ * We don't collect per-thread GC stats any more, but this code
+ * could be used to do that if we want to in the future:
+ */
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
{
getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
gct->gc_start_thread_cpu = getThreadCPUTime();
}
+#endif
}
void
-stat_gcWorkerThreadDone (gc_thread *gct)
+stat_gcWorkerThreadDone (gc_thread *gct STG_UNUSED)
{
+#if 0
+ /*
+ * We don't collect per-thread GC stats any more, but this code
+ * could be used to do that if we want to in the future:
+ */
Time thread_cpu, elapsed, gc_cpu, gc_elapsed;
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS)
@@ -311,6 +331,7 @@ stat_gcWorkerThreadDone (gc_thread *gct)
taskDoneGC(gct->cap->running_task, gc_cpu, gc_elapsed);
}
+#endif
}
/* -----------------------------------------------------------------------------
@@ -318,32 +339,54 @@ stat_gcWorkerThreadDone (gc_thread *gct)
-------------------------------------------------------------------------- */
void
-stat_endGC (gc_thread *gct,
- lnat alloc, lnat live, lnat copied, nat gen,
- lnat max_copied, lnat avg_copied, lnat slop)
+stat_endGC (Capability *cap, gc_thread *gct,
+ lnat alloc, lnat live, lnat copied, lnat slop, nat gen,
+ nat par_n_threads, lnat par_max_copied, lnat par_tot_copied)
{
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
RtsFlags.ProfFlags.doHeapProfile)
// heap profiling needs GC_tot_time
{
- Time cpu, elapsed, thread_gc_cpu, gc_cpu, gc_elapsed;
+ Time cpu, elapsed, gc_cpu, gc_elapsed;
+
+ // Has to be emitted while all caps stopped for GC, but before GC_END.
+ // See trac.haskell.org/ThreadScope/wiki/RTSsummaryEvents
+ // for a detailed design rationale of the current setup
+ // of GC eventlog events.
+ traceEventGcGlobalSync(cap);
+ // Emitted before GC_END on all caps, which simplifies tools code.
+ traceEventGcStats(cap,
+ CAPSET_HEAP_DEFAULT,
+ gen,
+ copied * sizeof(W_),
+ slop * sizeof(W_),
+ /* current loss due to fragmentation */
+ (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
+ * BLOCK_SIZE_W * sizeof(W_),
+ par_n_threads,
+ par_max_copied * sizeof(W_),
+ par_tot_copied * sizeof(W_));
+
getProcessTimes(&cpu, &elapsed);
- gc_elapsed = elapsed - gct->gc_start_elapsed;
- thread_gc_cpu = getThreadCPUTime() - gct->gc_start_thread_cpu;
+ // Post EVENT_GC_END with the same timestamp as used for stats
+ // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
+ // Here, as opposed to other places, the event is emitted on the cap
+ // that initiates the GC and external tools expect it to have the same
+ // timestamp as used in +RTS -s calculations.
+ traceEventGcEndAtT(cap, TimeToNS(elapsed - start_init_elapsed));
+ gc_elapsed = elapsed - gct->gc_start_elapsed;
gc_cpu = cpu - gct->gc_start_cpu;
- taskDoneGC(gct->cap->running_task, thread_gc_cpu, gc_elapsed);
-
if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
nat faults = getPageFaults();
- statsPrintf("%9ld %9ld %9ld",
+ statsPrintf("%9" FMT_SizeT " %9" FMT_SizeT " %9" FMT_SizeT,
alloc*sizeof(W_), copied*sizeof(W_),
live*sizeof(W_));
- statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2d)\n",
+ statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4" FMT_SizeT " %4" FMT_SizeT " (Gen: %2d)\n",
TimeToSecondsDbl(gc_cpu),
TimeToSecondsDbl(gc_elapsed),
TimeToSecondsDbl(cpu),
@@ -364,9 +407,31 @@ stat_endGC (gc_thread *gct,
GC_tot_copied += (StgWord64) copied;
GC_tot_alloc += (StgWord64) alloc;
- GC_par_max_copied += (StgWord64) max_copied;
- GC_par_avg_copied += (StgWord64) avg_copied;
+ GC_par_max_copied += (StgWord64) par_max_copied;
+ GC_par_tot_copied += (StgWord64) par_tot_copied;
GC_tot_cpu += gc_cpu;
+
+ /* For the moment we calculate both per-HEC and total allocation.
+ * There is thus redundancy here, but for the moment we will calculate
+ * it both the old and new way and assert they're the same.
+ * When we're sure it's working OK then we can simplify things.
+ * TODO: simplify calcAllocated and clearNurseries so they don't have
+ * to calculate the total
+ */
+ {
+ lnat tot_alloc = 0;
+ lnat n;
+ for (n = 0; n < n_capabilities; n++) {
+ tot_alloc += capabilities[n].total_allocated;
+ traceEventHeapAllocated(&capabilities[n],
+ CAPSET_HEAP_DEFAULT,
+ capabilities[n].total_allocated * sizeof(W_));
+ }
+ ASSERT(GC_tot_alloc == tot_alloc);
+ }
+ traceEventHeapSize(cap,
+ CAPSET_HEAP_DEFAULT,
+ mblocks_allocated * MBLOCK_SIZE_W * sizeof(W_));
if (gen == RtsFlags.GcFlags.generations-1) { /* major GC? */
if (live > max_residency) {
@@ -375,6 +440,9 @@ stat_endGC (gc_thread *gct,
current_residency = live;
residency_samples++;
cumulative_residency += live;
+ traceEventHeapLive(cap,
+ CAPSET_HEAP_DEFAULT,
+ live * sizeof(W_));
}
if (slop > max_slop) max_slop = slop;
@@ -544,6 +612,12 @@ stat_exit(int alloc)
GC_tot_alloc += alloc;
+ for (i = 0; i < n_capabilities; i++) {
+ traceEventHeapAllocated(&capabilities[i],
+ CAPSET_HEAP_DEFAULT,
+ capabilities[i].total_allocated * sizeof(W_));
+ }
+
/* Count total garbage collections */
for (g = 0; g < RtsFlags.GcFlags.generations; g++)
total_collections += generations[g].collections;
@@ -553,7 +627,7 @@ stat_exit(int alloc)
if (tot_elapsed == 0.0) tot_elapsed = 1;
if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
- statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", "");
+ statsPrintf("%9" FMT_SizeT " %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", "");
statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0);
}
@@ -592,16 +666,16 @@ stat_exit(int alloc)
if ( residency_samples > 0 ) {
showStgWord64(max_residency*sizeof(W_),
temp, rtsTrue/*commas*/);
- statsPrintf("%16s bytes maximum residency (%ld sample(s))\n",
+ statsPrintf("%16s bytes maximum residency (%" FMT_SizeT " sample(s))\n",
temp, residency_samples);
}
showStgWord64(max_slop*sizeof(W_), temp, rtsTrue/*commas*/);
statsPrintf("%16s bytes maximum slop\n", temp);
- statsPrintf("%16ld MB total memory in use (%ld MB lost due to fragmentation)\n\n",
+ statsPrintf("%16" FMT_SizeT " MB total memory in use (%" FMT_SizeT " MB lost due to fragmentation)\n\n",
peak_mblocks_allocated * MBLOCK_SIZE_W / (1024 * 1024 / sizeof(W_)),
- (peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
+ (lnat)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
/* Print garbage collections in each gen */
statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
@@ -618,33 +692,20 @@ stat_exit(int alloc)
}
#if defined(THREADED_RTS)
- if (RtsFlags.ParFlags.parGcEnabled) {
- statsPrintf("\n Parallel GC work balance: %.2f (%ld / %ld, ideal %d)\n",
- (double)GC_par_avg_copied / (double)GC_par_max_copied,
- (lnat)GC_par_avg_copied, (lnat)GC_par_max_copied,
- n_capabilities
+ if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) {
+ statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
+ 100 * (((double)GC_par_tot_copied / (double)GC_par_max_copied) - 1)
+ / (n_capabilities - 1)
);
}
#endif
statsPrintf("\n");
#if defined(THREADED_RTS)
- {
- nat i;
- Task *task;
- statsPrintf(" MUT time (elapsed) GC time (elapsed)\n");
- for (i = 0, task = all_tasks;
- task != NULL;
- i++, task = task->all_link) {
- statsPrintf(" Task %2d %-8s : %6.2fs (%6.2fs) %6.2fs (%6.2fs)\n",
- i,
- (task->worker) ? "(worker)" : "(bound)",
- TimeToSecondsDbl(task->mut_time),
- TimeToSecondsDbl(task->mut_etime),
- TimeToSecondsDbl(task->gc_time),
- TimeToSecondsDbl(task->gc_etime));
- }
- }
+ statsPrintf(" TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
+ taskCount, taskCount - workerCount,
+ peakWorkerCount, workerCount,
+ n_capabilities);
statsPrintf("\n");
@@ -660,7 +721,7 @@ stat_exit(int alloc)
sparks.fizzled += capabilities[i].spark_stats.fizzled;
}
- statsPrintf(" SPARKS: %ld (%ld converted, %ld overflowed, %ld dud, %ld GC'd, %ld fizzled)\n\n",
+ statsPrintf(" SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
sparks.created + sparks.dud + sparks.overflowed,
sparks.converted, sparks.overflowed, sparks.dud,
sparks.gcd, sparks.fizzled);
@@ -835,17 +896,18 @@ statDescribeGens(void)
gen_blocks += gcThreadLiveBlocks(i,g);
}
- debugBelch("%5d %7ld %9d", g, (lnat)gen->max_blocks, mut);
+ debugBelch("%5d %7" FMT_SizeT " %9d", g, (lnat)gen->max_blocks, mut);
gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
- debugBelch("%8ld %8d %8ld %8ld\n", gen_blocks, lge,
+ debugBelch("%8" FMT_SizeT " %8d %8" FMT_SizeT " %8" FMT_SizeT "\n", gen_blocks, lge,
gen_live*sizeof(W_), gen_slop*sizeof(W_));
tot_live += gen_live;
tot_slop += gen_slop;
}
debugBelch("----------------------------------------------------------\n");
- debugBelch("%41s%8ld %8ld\n","",tot_live*sizeof(W_),tot_slop*sizeof(W_));
+ debugBelch("%41s%8" FMT_SizeT " %8" FMT_SizeT "\n",
+ "",tot_live*sizeof(W_),tot_slop*sizeof(W_));
debugBelch("----------------------------------------------------------\n");
debugBelch("\n");
}
@@ -860,6 +922,11 @@ extern HsInt64 getAllocations( void )
/* EZY: I'm not convinced I got all the casting right. */
+extern rtsBool getGCStatsEnabled( void )
+{
+ return RtsFlags.GcFlags.giveStats != NO_GC_STATS;
+}
+
extern void getGCStats( GCStats *s )
{
nat total_collections = 0;
@@ -901,7 +968,7 @@ extern void getGCStats( GCStats *s )
/* EZY: Being consistent with incremental output, but maybe should also discount init */
s->cpu_seconds = TimeToSecondsDbl(current_cpu);
s->wall_seconds = TimeToSecondsDbl(current_elapsed - end_init_elapsed);
- s->par_avg_bytes_copied = GC_par_avg_copied*(StgWord64)sizeof(W_);
+ s->par_tot_bytes_copied = GC_par_tot_copied*(StgWord64)sizeof(W_);
s->par_max_bytes_copied = GC_par_max_copied*(StgWord64)sizeof(W_);
}
// extern void getTaskStats( TaskStats **s ) {}