#!/usr/bin/perl -w
# get_sloc
# Take a list of dirs, and get the SLOC or filecount data from them.
# NOTE: The intended input data ignores zero-length files & ignores dups,
# so if that's true for the input data, it'll be true for the output data!
#
# For every directory that contains a readable "filelist" file, this
# script reads:
#   <dir>/all-physical.sloc   (SLOC mode, the default), or
#   <dir>/all.filecount       (--filecount mode),
# where each line is "<language> <count>", plus the optional one-line
# file <dir>/PROGRAM_LICENSE.
#
# This code works but is NOT cleaned up-- it basically grew like
# topsy.  Many of the variable names are misleading, as my needs for
# output changed.

# This is part of SLOCCount, a toolsuite that counts
# source lines of code (SLOC).
# Copyright (C) 2001-2004 David A. Wheeler.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# To contact David A. Wheeler, see his website at:
#  http://www.dwheeler.com.

# Default values for the effort estimation model; the model is
# effort = ($effort_factor * KiloSLOC) ** $effort_exponent.
# The following numbers are for basic COCOMO:
$effort_factor = 2.40;
$effort_exponent = 1.05;
$effort_estimation_message = "Basic COCOMO model,";

$schedule_factor = 2.5;
$schedule_exponent = 0.38;
$schedule_estimation_message = "Basic COCOMO model,";

# Average Salary / year.
# Source: ComputerWorld, Sep. 4, 2000 Salary Survey,
# average (U.S.) programmer/analyst salary.
$person_cost = 56286.;

# Overhead; the person cost is multiplied by this value to determine
# true annual costs.
$overhead = 2.4;

# Key phrases searched for (case-insensitively, as whole words) in the
# license strings when printing the license summary.
@license_list = ( "GPL", "LGPL", "MIT", "BSD", "distributable",
                  "public domain", "MPL");

%license_of = ();  # input is name of program, output is license.
$no_license_total = 0;

# Categories in the data files that are not real languages.
%non_language_list = (
  "dup" => 1,
  "not" => 1,
  "unknown" => 1,
  "auto" => 1,
  "zero" => 1,
);

# Languages ignored unless re-enabled with --addlang / --addlangall.
%ignore_language_list = (
  "makefile" => 1,
  "sql" => 1,
  "html" => 1,
);

# Default input values
$dirs_in_stdin = 0;  # 0: dirs to analyze as arguments, 1: in stdin

# Default Output Values:
$computing_sloc = 1; # 0= showing filecounts, 1= showing SLOC.
$narrow = 1;
$sort_by = "total";  # If empty, sort by name; else "total" or lang name.
$show_effort = 0;    # Show effort for each component?
$break_line = 1;     # Break up long lines into multiple lines?
$show_non_lang = 0;  # Show non-language counts?
$one_program = 0;    # Are all files part of a single program?
$show_header = 1;    # Show header?
$show_footer = 1;    # Show footer?

# Global variables:
@dirs = ();                  # Directories named by the user.
@selected_dirs = ();         # Directories that will actually be reported.
@wide_columns = ();          # Language columns (wide output only).
%examined_directories = ();  # Keys = Names of directories examined this run.


# Subroutines.

sub commify {
  # Insert "," thousands separators into the integer part of a number
  # (digits after a decimal point are left alone).
  # TODO: Needs to be internationalized.
  my $text = reverse $_[0];
  $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse $text;
}

sub numformat {
  # Format number nicely with commas.
  # Arguments: the number, and the number of digits after the decimal point.
  my $num = shift;
  my $digits = shift;
  return commify(sprintf("%0.${digits}f", $num));
}

sub effort_person_months {
  # Given the SLOC, reply an estimate of the number of person-months
  # needed to develop it traditionally.
  my $total_sloc = shift;
  return ($effort_factor * (($total_sloc / 1000.0) ** $effort_exponent));
}

sub estimate_schedule {
  # Given the person-months, reply an estimate of the number of months
  # needed to develop it traditionally.
  my $person_months = shift;
  return ($schedule_factor * ($person_months ** $schedule_exponent));
}

sub get_lang_total {
  # Return the running total for a language, or 0 if none was recorded.
  my $lang = shift;
  if (defined($lang_total{$lang})) {
    return $lang_total{$lang};
  }
  return 0;
}

sub next_option_value {
  # Remove and return the next command-line word as the value of the
  # option named in the argument; die if the command line ran out.
  my $option = shift;
  if (!@ARGV) {
    die "Option $option requires a value\n";
  }
  return shift @ARGV;
}

sub data_file_for {
  # Return the name of the per-directory data file for the current mode.
  my $dir = shift;
  if ($computing_sloc) {
    return "${dir}/all-physical.sloc";
  }
  return "${dir}/all.filecount";
}

sub percent_of {
  # Return $part as a percentage of $whole; 0 when $whole is zero, so
  # that an empty category can never cause a division by zero.
  my ($part, $whole) = @_;
  return 0.0 unless $whole;
  return $part * 100.0 / $whole;
}


# MAIN PROGRAM

# Process options (if any):

if ($#ARGV < 0) {
  print STDERR "Error! You must list at least one directory to process, or --stdin.\n";
  exit(1);
}

while ((scalar(@ARGV) > 0) && ($ARGV[0] =~ m/^-/)) {
  $arg = shift @ARGV;
  if    ($arg eq "--")            {last;}
  elsif ($arg eq "--filecount")   {$computing_sloc = 0;}
  elsif ($arg eq "--filecounts")  {$computing_sloc = 0;}
  elsif ($arg eq "--sloc")        {$computing_sloc = 1;}
  elsif ($arg eq "--narrow")      {$narrow = 1;}
  elsif ($arg eq "--wide")        {$narrow = 0;}
  elsif ($arg eq "--break")       {$break_line = 1;}
  elsif ($arg eq "--nobreak")     {$break_line = 0;}
  elsif ($arg eq "--sort")        {
    # Must be "total" or a lang.
    $sort_by = next_option_value($arg);
  }
  elsif ($arg eq "--nosort")      {$sort_by = "";}
  elsif ($arg eq "--showother")   {$show_non_lang = 1;}
  elsif ($arg eq "--noshowother") {$show_non_lang = 0;}
  elsif ($arg eq "--oneprogram")  {$one_program = 1;}
  elsif ($arg eq "--noheader")    {$show_header = 0;}
  elsif ($arg eq "--nofooter")    {$show_footer = 0;}
  elsif ($arg eq "--addlang")     {
    $lang = next_option_value($arg);
    if (!defined($ignore_language_list{$lang})) {
      die "Sorry, but $lang isn't ignored";
    }
    delete $ignore_language_list{$lang};
  }
  elsif ($arg eq "--addlangall")  {
    %ignore_language_list = ();
  }
  elsif ($arg eq "--effort")      {
    $effort_factor = next_option_value($arg) * 1.0;
    $effort_exponent = next_option_value($arg) * 1.0;
    $effort_estimation_message = "effort model";
  }
  elsif ($arg eq "--schedule")    {
    $schedule_factor = next_option_value($arg) * 1.0;
    $schedule_exponent = next_option_value($arg) * 1.0;
    $schedule_estimation_message = "schedule model";
  }
  elsif ($arg eq "--personcost")  {$person_cost = next_option_value($arg) * 1.0;}
  elsif ($arg eq "--overhead")    {$overhead = next_option_value($arg) * 1.0;}
  elsif ($arg eq "--stdin")       {$dirs_in_stdin = 1;}
  else {die "Unknown option: $arg\n";}
}

# Determine the languages to show:

if ($computing_sloc) { $show_non_lang = 0; }
if (!$show_non_lang) {
  # Add the non_language_list to the ignored languages.
  foreach my $langname (keys(%non_language_list)) {
    $ignore_language_list{$langname} = 1;
  }
}

%lang_total = ();
%license_total = ();
@data_lines = ();
$sloc = 0;
$total_sloc = 0;
$total_lang_sloc = 0;
$grand_total_sloc = 0;
$grand_total_lang_sloc = 0;
$effort = 0.0;
$grand_total_effort = 0.0;
$grand_schedule = 0.0;

# Collect the directories to examine.  This is done before the header is
# printed because the wide format needs to look at the data first.
if ($dirs_in_stdin == 1) {
  while (defined(my $dir_line = <STDIN>)) {
    chomp($dir_line);
    push(@dirs, $dir_line);
  }
}
# Take every remaining argument, even one that is false in Perl ("0").
push(@dirs, @ARGV);

foreach my $dir (@dirs) {
  if (! -d "$dir") {
    # print "Skipping non-directory $dir\n";
    next;
  }
  # Skip previously-examined directories.
  if ($examined_directories{$dir}) {
    # print "Skipping already-examined directory $dir\n";
    next;
  }
  $examined_directories{$dir} = 1;
  if (! -r "${dir}/filelist") {
    # print "Skipping directory $dir; it doesn't contain a file 'filelist'\n";
    next;
  }
  push(@selected_dirs, $dir);
}

if (!$narrow) {
  # Ouch!  To accurately determine the column positions and names,
  # without "pre-knowing" them, we need to look through the data.
  # So, we'll do it twice.  This isn't efficient - if needed,
  # speed it up by rewriting this to do it in-memory.
  # NOTE(review): the source of this pre-scan was lost from the original
  # text; reading each selected directory's data file is a reconstruction
  # based on the comment above - confirm against upstream.
  foreach my $dir (@selected_dirs) {
    my $prescan_name = data_file_for($dir);
    # An unreadable file is reported once, in the main pass below.
    open(my $prescan, '<', $prescan_name) or next;
    while (defined(my $record = <$prescan>)) {
      my ($name, $count) = split(' ', $record);
      next if ( (!defined($name)) || (!defined($count)) );
      next if ($ignore_language_list{$name});
      $lang_total{$name} = 0;
    }
    close($prescan);
  }
  # One fixed column order, shared by the header, every row and the footer.
  @wide_columns = sort(keys(%lang_total));
}

# Print the header.
if ($show_header) {
  if ($computing_sloc) { print "SLOC\t"; } else { print "#Files\t"; }
  if ($narrow) {
    if ($show_effort) {print "P.Y.\t";}
    print "Directory\t";
    if ($computing_sloc) { print "SLOC-by-Language (Sorted)"; }
    else                 { print "#Files-by-Language (Sorted)"; }
    print "\n";
  } else {
    if ($show_effort) {print "P.M.\t";}
    printf "%-22s\t", "Dir";
    foreach my $column (@wide_columns) {
      print "$column\t";
    }
    print "\n";
  }
}

foreach my $dir (@selected_dirs) {
  $simplename = $dir;
  $simplename =~ s!^.*/!!;   # Strip everything up to the last "/".

  $total_sloc = 0;
  $total_lang_sloc = 0;
  $interesting_lang_sloc = 0;  # Reset, so no stale sort key is reused.
  $line = "";
  %lang_data = ();

  $filename = data_file_for($dir);
  if (open(my $datafile, '<', $filename)) {
    while (defined(my $record = <$datafile>)) {
      ($lang, $sloc) = split(' ', $record);
      next if ( (!defined($lang)) || (!defined($sloc)) );
      next if ($ignore_language_list{$lang});
      if ($narrow) {
        # The narrow view lists only languages with a non-zero count.
        if ($sloc) {$lang_data{$lang} = $sloc;}
      } else {
        $lang_data{$lang} = ($lang_data{$lang} || 0) + $sloc;
      }
      if ($lang eq $sort_by) {$interesting_lang_sloc = $sloc;}
      $total_sloc += $sloc;
      $total_lang_sloc += $sloc unless ($non_language_list{$lang});
      $lang_total{$lang} += $sloc;
    }
    close($datafile);
  } else {
    print STDERR "Error opening $filename: $!\n";
  }

  if ($narrow) {
    # For narrow view, sort the language entries (largest first; ties are
    # broken by name so the output is reproducible).
    my @entries = map { $_ . "=" . $lang_data{$_} }
                  sort { $lang_data{$b} <=> $lang_data{$a} or $a cmp $b }
                  keys(%lang_data);
    if (@entries) { $line = join(",", @entries); }
    else          { $line = "(none)"; }
  } else {
    # One cell per header column, in header order.
    foreach my $column (@wide_columns) {
      my $cell = defined($lang_data{$column}) ? $lang_data{$column} : 0;
      $line .= "${cell}\t";
    }
  }

  $grand_total_sloc += $total_sloc;
  $grand_total_lang_sloc += $total_lang_sloc;
  $effort = effort_person_months($total_sloc);
  $grand_total_effort += $effort;

  $schedule = estimate_schedule($effort);
  if ($schedule > $grand_schedule) {
    $grand_schedule = $schedule;  # The longest leg wins.
  }

  $displayed_effort = "";
  if ($show_effort) {
    $displayed_effort = sprintf "%.2f\t", $effort;
  }

  # Add to the corresponding license, if the license is known.
  # A missing or empty PROGRAM_LICENSE file counts as "no license listed".
  $license = "";
  if (open(my $license_file, '<', "${dir}/PROGRAM_LICENSE")) {
    my $first_line = <$license_file>;
    close($license_file);
    if (defined($first_line)) {
      chomp($first_line);
      $license = $first_line;
    }
  }
  if (length($license)) {
    $license_of{$simplename} = $license;  # Hash currently unused.
    if (!defined($license_total{$license})) {
      $license_total{$license} = 0;
    }
    $license_total{$license} += $total_sloc;
  } else {
    $no_license_total += $total_sloc;
  }

  if ($narrow) {
    $line = sprintf "%-7d %s%-15s %-s\n", $total_sloc, $displayed_effort,
                    $simplename, $line;
    if ($break_line && (length($line) > 77)) { # Break up long line.
      $line =~ s/(.{71})([^,]*),(.*)/$1$2,\n $3/;
    }
    if (length($license)) {
      $line .= " [$license]\n";
    }
  } else {
    my $padded_name = sprintf "%-22s\t", $simplename;
    $line = "${total_sloc}\t${displayed_effort}${padded_name}${line}\n";
  }

  if ($sort_by) {
    # Prefix a numeric sort field; it is stripped again before printing.
    if ($sort_by eq "total") {$line = "$total_sloc\t$line";}
    else                     {$line = "$interesting_lang_sloc\t$line";}
    push(@data_lines, $line);  # Add to data lines.
  } else {
    print $line;  # No sort - print immediately for speed.
  }
}

if ($sort_by) {
  # Print sorted version.  This is a little inefficient, but for
  # only a few hundred or thousand values it doesn't matter.
  @sorted_data_lines = sort { ($b =~ /^(\d+)/)[0] <=> ($a =~ /^(\d+)/)[0] }
                       @data_lines;
  foreach my $sorted_line (@sorted_data_lines) {
    my $short_line = $sorted_line;
    $short_line =~ s/^[^\t]*\t//;  # Remove sort field.
    print $short_line;
  }
}

if (! $show_footer) {exit(0);}

if ($grand_total_sloc == 0) {
  print "SLOC total is zero, no further analysis performed.\n";
  exit(1);
}

# Print the footer.

if ($narrow) {
  print "\n";
  print "\n";
  print "Totals grouped by language (dominant language first):\n";
  # If you don't want the list sorted by size of language, just do:
  #   foreach $lang (@language_list) {
  foreach my $name (sort { get_lang_total($b) <=> get_lang_total($a) }
                    keys(%lang_total)) {
    $percent = percent_of(get_lang_total($name), $grand_total_sloc);
    if ($percent > 0.0) {
      printf "%-9s %9d (%.2f%%)\n", $name . ":", $lang_total{$name}, $percent;
    }
  }

  if ($show_non_lang) {
    # The previous list showed "non-languages", so now we'll show only the
    # data for data associated with a normal language:
    print "\n";
    print "\n";
    foreach my $name (sort { get_lang_total($b) <=> get_lang_total($a) }
                      keys(%lang_total)) {
      next if (defined($non_language_list{$name}));
      $percent = percent_of($lang_total{$name}, $grand_total_lang_sloc);
      if ($percent > 0.0) {
        printf "%-9s %9d (%.2f%%)\n", $name . ":", $lang_total{$name}, $percent;
      }
    }
  }
} else { # Not narrow.
  # Each footer row uses the same column order as the header, with exactly
  # one tab between the row label and the first value.
  print "$grand_total_sloc\t";
  if ($show_effort) {printf "%.2f\t", $grand_total_effort;}
  printf "%-22s", "Totals";
  foreach my $column (@wide_columns) {
    print "\t$lang_total{$column}";
  }
  print "\n";

  print "\t";
  if ($show_effort) {print "\t";}
  printf "%-22s", "Percentages";
  foreach my $column (@wide_columns) {
    printf "\t%0.2f", percent_of($lang_total{$column}, $grand_total_sloc);
  }
  print "\n";

  print "\t";
  if ($show_effort) {print "\t";}
  printf "%-22s", "Code Percentages";
  foreach my $column (@wide_columns) {
    if (defined($non_language_list{$column})) {
      print "\t";  # Empty cell keeps the later columns aligned.
      next;
    }
    printf "\t%0.2f", percent_of($lang_total{$column}, $grand_total_lang_sloc);
  }
  print "\n";
}

print "\n";
print "\n";

if (%license_total) {
  # We have license info on something, so if there's anything
  # unallocated, add that to the list.
  if ($no_license_total) {
    $license_total{"Not listed"} = $no_license_total;
  }
  print "Licenses:\n";
  foreach my $name (sort { $license_total{$b} <=> $license_total{$a} }
                    keys(%license_total)) {
    $percent = percent_of($license_total{$name}, $grand_total_sloc);
    if ($percent > 0.0) {
      printf "%9d (%.2f%%) %s\n", $license_total{$name}, $percent, $name;
    }
  }
  print "\n";
  print "\n";
  print "Percentage of Licenses containing selected key phrases:\n";
  %license_phrase = ();
  foreach my $name (keys(%license_total)) {
    foreach my $phrase (@license_list) {
      # \Q...\E: match the phrase literally, never as a regex.
      if ($name =~ m/\b\Q$phrase\E\b/i) {
        if (!defined($license_phrase{$phrase})) {$license_phrase{$phrase} = 0;}
        $license_phrase{$phrase} += $license_total{$name};
      }
    }
  }
  foreach my $phrase (sort { $license_phrase{$b} <=> $license_phrase{$a} }
                      keys(%license_phrase)) {
    $percent = percent_of($license_phrase{$phrase}, $grand_total_sloc);
    if ($percent > 0.0) {
      printf "%9d (%.2f%%) %s\n", $license_phrase{$phrase}, $percent, $phrase;
    }
  }
}

print "\n";
print "\n";

if ($computing_sloc) {
  if ($one_program) {
    # If it's one program, override the grand total of effort
    # and the schedule calculations by using the total SLOC.
    $grand_total_effort = effort_person_months($grand_total_sloc);
    $grand_schedule = estimate_schedule($grand_total_effort);
  }
  printf "Total Physical Source Lines of Code (SLOC) = %s\n",
         commify($grand_total_sloc);
  printf "Development Effort Estimate, Person-Years (Person-Months) = %s (%s)\n",
         numformat($grand_total_effort/12.0, 2),
         numformat($grand_total_effort, 2);
  print " ($effort_estimation_message " .
        "Person-Months = $effort_factor * (KSLOC**$effort_exponent))\n";
  printf "Schedule Estimate, Years (Months) = %s (%s)\n",
         numformat($grand_schedule/12.0, 2),
         numformat($grand_schedule, 2);
  print " ($schedule_estimation_message " .
        "Months = $schedule_factor * (person-months**$schedule_exponent))\n";
  # Don't show this if there are multiple programs, because the computation
  # is essentially meaningless: after the "smaller" projects have completed,
  # the longest one would keep going:
  if ($one_program && ($grand_schedule > 0.0)) {
    printf "Estimated Average Number of Developers (Effort/Schedule) = %s\n",
           numformat($grand_total_effort / $grand_schedule, 2);
  }
  $value = ($grand_total_effort / 12.0) * $person_cost * $overhead;
  printf "Total Estimated Cost to Develop = \$ %s\n",
         numformat($value, 0);
  printf " (average salary = \$%s/year, overhead = %0.2f).\n",
         commify($person_cost), $overhead;
} else {
  print "Total Number of Files = $grand_total_sloc\n";
  print "Total Number of Source Code Files = $grand_total_lang_sloc\n";
}

print "SLOCCount, Copyright (C) 2001-2004 David A. Wheeler\n";
print "SLOCCount is Open Source Software/Free Software, licensed under the GNU GPL.\n";
print "SLOCCount comes with ABSOLUTELY NO WARRANTY, and you are welcome to\n";
print "redistribute it under certain conditions as specified by the GNU GPL license;\n";
print "see the documentation for details.\n";
print "Please credit this data as \"generated using David A. Wheeler's 'SLOCCount'.\"\n";