#!/usr/bin/perl -w # break_filelist # Take a list of dirs which contain a "filelist"; # creates files in each directory identifying which are C, C++, Perl, etc. # For example, "ansic.dat" lists all ANSI C files contained in filelist. # Note: ".h" files are ambiguous (they could be C or C++); the program # uses heuristics to determine this. # The list of .h files is also contained in h_list.dat. # This is part of SLOCCount, a toolsuite that counts # source lines of code (SLOC). # Copyright (C) 2001-2004 David A. Wheeler. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # To contact David A. Wheeler, see his website at: # http://www.dwheeler.com. # If adding a new language: add the logic to open the file, # close the file, and detect & write to the file listing that language. # Debatable decisions: # Doesn't count .dsl files (stylesheets, which are partially LISP). # Doesn't count .sql files (SQL queries & commands) # Note - I don't try to distinguish between TCL and [incr TCL] (itcl), # an OO extended version of TCL. For our purposes, it's all TCL. use FileHandle; # Set default configuration: $duplicates_okay = 0; # Set to 1 if you want to count file duplicates. $crossdups_okay = 0; # Set to 1 if duplicates okay in different filelists. $autogen_okay = 0; # Set to 1 if you want to count autogen'ed files. $noisy = 0; # Set to 1 if you want noisy reports. %lang_list_files = (); # The following extensions are NOT code: %not_code_extensions = ( "html" => 1, "in" => 1, # Debatable. "xpm" => 1, "po" => 1, "am" => 1, # Debatable. "1" => 1, # Man pages (documentation): "2" => 1, "3" => 1, "4" => 1, "5" => 1, "6" => 1, "7" => 1, "8" => 1, "9" => 1, "n" => 1, "gif" => 1, "tfm" => 1, "png" => 1, "m4" => 1, # Debatable. "bdf" => 1, "sgml" => 1, "mf" => 1, "txt" => 1, "text" => 1, "man" => 1, "xbm" => 1, "Tag" => 1, "sgm" => 1, "vf" => 1, "tex" => 1, "elc" => 1, "gz" => 1, "dic" => 1, "pfb" => 1, "fig" => 1, "afm" => 1, # font metrics "jpg" => 1, "bmp" => 1, "htm" => 1, "kdelnk" => 1, "desktop" => 1, "pbm" => 1, "pdf" => 1, "ps" => 1, # Postscript is _USUALLY_ generated automatically. "eps" => 1, "doc" => 1, "man" => 1, "o" => 1, # Object code is generated from source code. "a" => 1, # Static object code. "so" => 1, # Dynamically-loaded object code. "Y" => 1, # file compressed with "Yabba" "Z" => 1, # file compressed with "compress" "ad" => 1, # X application default resource file. "arc" => 1, # arc(1) archive "arj" => 1, # arj(1) archive "au" => 1, # Audio sound filearj(1) archive "wav" => 1, "bak" => 1, # Backup files - we only want to count the "real" files. "bz2" => 1, # bzip2(1) compressed file "mp3" => 1, # zip archive "tgz" => 1, # tarball "zip" => 1, # zip archive ); # The following filenames are NOT code: %not_code_filenames = ( "README" => 1, "Readme" => 1, "readme" => 1, "README.tk" => 1, # used in kdemultimedia, it's confusing. "Changelog" => 1, "ChangeLog" => 1, "Repository" => 1, "CHANGES" => 1, "Changes" => 1, ".cvsignore" => 1, "Root" => 1, # CVS. "BUGS" => 1, "TODO" => 1, "COPYING" => 1, "MAINTAINERS" => 1, "Entries" => 1, # Skip "iconfig.h" files; they're used in Imakefiles # (used in xlockmore): "iconfig.h" => 1, ); # A filename ending in the following extensions usually maps to the # given language: # TODO: See suffixes(7) # .al Perl autoload file # .am automake input %file_extensions = ( "c" => "ansic", "ec" => "ansic", # Informix C. "ecp" => "ansic", # Informix C. "pgc" => "ansic", # Postgres embedded C/C++ (guess C) "C" => "cpp", "cpp" => "cpp", "cxx" => "cpp", "cc" => "cpp", "pcc" => "cpp", # Input to Oracle C++ preproc. "m" => "objc", # C# (C-sharp) is named 'cs', not 'c#', because # the '#' is a comment character and I'm trying to # avoid bug-prone conventions. # C# doesn't support header files. "cs" => "cs", # Header files are allocated to the "h" language, and then # copied to the correct location later so that C/C++/Objective-C # can be separated. "h" => "h", "H" => "h", "hpp" => "h", "hh" => "h", "ada" => "ada", "adb" => "ada", "ads" => "ada", "pad" => "ada", # Oracle Ada preprocessor. "f" => "fortran", "F" => "fortran", # This catches "wokka.F" as Fortran. # Warning: "Freeze" format also uses .f. Haven't heard of problems, # freeze is extremely rare and even more rare in source code directories. "f77" => "fortran", "F77" => "fortran", "f90" => "f90", "F90" => "f90", "cob" => "cobol", "cbl" => "cobol", "COB" => "cobol", "CBL" => "cobol", # Yes, people do create wokka.CBL files "p" => "pascal", "pas" => "pascal", "pp" => "pascal", "dpr" => "pascal", "py" => "python", "s" => "asm", "S" => "asm", "asm" => "asm", "sh" => "sh", "bash" => "sh", "csh" => "csh", "tcsh" => "csh", "java" => "java", "lisp" => "lisp", "el" => "lisp", "scm" => "lisp", "sc" => "lisp", "lsp" => "lisp", "cl" => "lisp", "jl" => "lisp", "tcl" => "tcl", "tk" => "tcl", "itk" => "tcl", "exp" => "exp", "pl" => "perl", "pm" => "perl", "perl" => "perl", "ph" => "perl", "awk" => "awk", "sed" => "sed", "y" => "yacc", "l" => "lex", "makefile" => "makefile", "sql" => "sql", "php" => "php", "php3" => "php", "php4" => "php", "php5" => "php", "php6" => "php", "inc" => "inc", # inc MAY be PHP - we'll handle it specially. "m3" => "modula3", "i3" => "modula3", "mg" => "modula3", "ig" => "modula3", "ml" => "ml", "mli" => "ml", "mly" => "ml", # ocamlyacc. In fact this is half-yacc half-ML, especially # comments in yacc part are C-like, not ML like. "mll" => "ml", # ocamllex, no such problems as in ocamlyacc "rb" => "ruby", "hs" => "haskell", "lhs" => "haskell", # ???: .pco is Oracle Cobol "jsp" => "jsp", # Java server pages ); # GLOBAL VARIABLES $dup_count = 0; $warning_from_first_line = ""; %examined_directories = (); # Keys = Names of directories examined this run. $duplistfile = ""; ########### # Handle re-opening individual CODE_FILEs. # CODE_FILE is public # Private value: $opened_file_name = ""; sub reopen { # Open file if it isn't already, else rewind. # If filename is "", close any open file. my $filename = shift; chomp($filename); # print("DEBUG: reopen($filename)\n"); if ($filename eq "") { if ($opened_file_name) {close(CODE_FILE);} $opened_file_name = ""; return; } if ($filename eq $opened_file_name) { seek CODE_FILE, 0, 0; # Rewind. } else { # We're opening a new file. if ($opened_file_name) {close(CODE_FILE)} open(CODE_FILE, "<$filename\0") || die "Can't open $filename"; $opened_file_name = $filename; } } ########### sub looks_like_cpp { # returns a confidence level - does the file looks like it's C++? my $filename = shift; my $confidence = 0; chomp($filename); open( SUSPECT, "<$filename"); while (defined($_ = )) { if (m/^\s*class\b.*\{/) { # "}" close(SUSPECT); return 2; } if (m/^\s*class\b/) { $confidence = 1; } } close(SUSPECT); return $confidence; } # Cache which files are objective-C or not. # Key is the full file pathname; value is 1 if objective-C (else 0). %objective_c_files = (); sub really_is_objc { # Given filename, returns TRUE if its contents really are objective-C. my $filename = shift; chomp($filename); my $is_objc = 0; # Value to determine. my $brace_lines = 0; # Lines that begin/end with curly braces. my $plus_minus = 0; # Lines that begin with + or -. my $word_main = 0; # Did we find "main("? my $special = 0; # Did we find a special Objective-C pattern? # Return cached result, if available: if ($objective_c_files{$filename}) { return $objective_c_files{$filename};} open(OBJC_FILE, "<$filename") || die "Can't open $filename to determine if it's objective C.\n"; while() { if (m/^\s*[{}]/ || m/[{}];?\s*$/) { $brace_lines++;} if (m/^\s*[+-]/) {$plus_minus++;} if (m/\bmain\s*\(/) {$word_main++;} # "main" followed by "("? # Handle /usr/src/redhat/BUILD/egcs-1.1.2/gcc/objc/linking.m: if (m/^\s*\[object name\];\s*$/i) {$special=1;} } close(OBJC_FILE); if (($brace_lines > 1) && (($plus_minus > 1) || $word_main || $special)) {$is_objc = 1;} $objective_c_files{$filename} = $is_objc; # Store result in cache. return $is_objc; } # Cache which files are lex or not. # Key is the full file pathname; value is 1 if lex (else 0). %lex_files = (); sub really_is_lex { # Given filename, returns TRUE if its contents really is lex. # lex file must have "%%", "%{", and "%}". # In theory, a lex file doesn't need "%{" and "%}", but in practice # they all have them, and requiring them avoid mislabeling a # non-lexfile as a lex file. my $filename = shift; chomp($filename); my $is_lex = 0; # Value to determine. my $percent_percent = 0; my $percent_opencurly = 0; my $percent_closecurly = 0; # Return cached result, if available: if ($lex_files{$filename}) { return $lex_files{$filename};} open(LEX_FILE, "<$filename") || die "Can't open $filename to determine if it's lex.\n"; while() { $percent_percent++ if (m/^\s*\%\%/); $percent_opencurly++ if (m/^\s*\%\{/); $percent_closecurly++ if (m/^\s*\%\}/); } close(LEX_FILE); if ($percent_percent && $percent_opencurly && $percent_closecurly) {$is_lex = 1;} $lex_files{$filename} = $is_lex; # Store result in cache. return $is_lex; } # Cache which files are expect or not. # Key is the full file pathname; value is 1 if it is (else 0). %expect_files = (); sub really_is_expect { # Given filename, returns TRUE if its contents really are Expect. # Many "exp" files (such as in Apache and Mesa) are just "export" data, # summarizing something else # (e.g., its interface). # Sometimes (like in RPM) it's just misc. data. # Thus, we need to look at the file to determine # if it's really an "expect" file. my $filename = shift; chomp($filename); # The heuristic is as follows: it's Expect _IF_ it: # 1. has "load_lib" command and either "#" comments or {}. # 2. {, }, and one of: proc, if, [...], expect my $is_expect = 0; # Value to determine. my $begin_brace = 0; # Lines that begin with curly braces. my $end_brace = 0; # Lines that begin with curly braces. my $load_lib = 0; # Lines with the Load_lib command. my $found_proc = 0; my $found_if = 0; my $found_brackets = 0; my $found_expect = 0; my $found_pound = 0; # Return cached result, if available: if ($expect_files{$filename}) { return expect_files{$filename};} open(EXPECT_FILE, "<$filename") || die "Can't open $filename to determine if it's expect.\n"; while() { if (m/#/) {$found_pound++; s/#.*//;} if (m/^\s*\{/) { $begin_brace++;} if (m/\{\s*$/) { $begin_brace++;} if (m/^\s*\}/) { $end_brace++;} if (m/\};?\s*$/) { $end_brace++;} if (m/^\s*load_lib\s+\S/) { $load_lib++;} if (m/^\s*proc\s/) { $found_proc++;} if (m/^\s*if\s/) { $found_if++;} if (m/\[.*\]/) { $found_brackets++;} if (m/^\s*expect\s/) { $found_expect++;} } close(EXPECT_FILE); if ($load_lib && ($found_pound || ($begin_brace && $end_brace))) {$is_expect = 1;} if ( $begin_brace && $end_brace && ($found_proc || $found_if || $found_brackets || $found_expect)) {$is_expect = 1;} $expect_files{$filename} = $is_expect; # Store result in cache. return $is_expect; } # Cached values. %pascal_files = (); sub really_is_pascal { # Given filename, returns TRUE if its contents really are Pascal. # This isn't as obvious as it seems. # Many ".p" files are Perl files # (such as /usr/src/redhat/BUILD/ispell-3.1/dicts/czech/glob.p), # others are C extractions # (such as /usr/src/redhat/BUILD/linux/include/linux/umsdos_fs.p # and some files in linuxconf). # However, test files in "p2c" really are Pascal, for example. # Note that /usr/src/redhat/BUILD/ucd-snmp-4.1.1/ov/bitmaps/UCD.20.p # is actually C code. The heuristics determine that they're not Pascal, # but because it ends in ".p" it's not counted as C code either. # I believe this is actually correct behavior, because frankly it # looks like it's automatically generated (it's a bitmap expressed as code). # Rather than guess otherwise, we don't include it in a list of # source files. Let's face it, someone who creates C files ending in ".p" # and expects them to be counted by default as C files in SLOCCount needs # their head examined. I suggest examining their head # with a sucker rod (see syslogd(8) for more on sucker rods). # This heuristic counts as Pascal such files such as: # /usr/src/redhat/BUILD/teTeX-1.0/texk/web2c/tangleboot.p # Which is hand-generated. We don't count woven documents now anyway, # so this is justifiable. my $filename = shift; chomp($filename); # The heuristic is as follows: it's Pascal _IF_ it has all of the following # (ignoring {...} and (*...*) comments): # 1. "^..program NAME" or "^..unit NAME", # 2. "procedure", "function", "^..interface", or "^..implementation", # 3. a "begin", and # 4. it ends with "end.", # # Or it has all of the following: # 1. "^..module NAME" and # 2. it ends with "end.". # # Or it has all of the following: # 1. "^..program NAME", # 2. a "begin", and # 3. it ends with "end.". # # The "end." requirements in particular filter out non-Pascal. # # Note (jgb): this does not detect Pascal main files in fpc, like # fpc-1.0.4/api/test/testterminfo.pas, which does not have "program" in # it my $is_pascal = 0; # Value to determine. my $has_program = 0; my $has_unit = 0; my $has_module = 0; my $has_procedure_or_function = 0; my $found_begin = 0; my $found_terminating_end = 0; # Return cached result, if available: if ($pascal_files{$filename}) { return pascal_files{$filename};} open(PASCAL_FILE, "<$filename") || die "Can't open $filename to determine if it's pascal.\n"; while() { s/\{.*?\}//g; # Ignore {...} comments on this line; imperfect, but effective. s/\(\*.*?\*\)//g; # Ignore (*...*) comments on this line; imperfect, but effective. if (m/\bprogram\s+[A-Za-z]/i) {$has_program=1;} if (m/\bunit\s+[A-Za-z]/i) {$has_unit=1;} if (m/\bmodule\s+[A-Za-z]/i) {$has_module=1;} if (m/\bprocedure\b/i) { $has_procedure_or_function = 1; } if (m/\bfunction\b/i) { $has_procedure_or_function = 1; } if (m/^\s*interface\s+/i) { $has_procedure_or_function = 1; } if (m/^\s*implementation\s+/i) { $has_procedure_or_function = 1; } if (m/\bbegin\b/i) { $has_begin = 1; } # Originally I said: # "This heuristic fails if there are multi-line comments after # "end."; I haven't seen that in real Pascal programs:" # But jgb found there are a good quantity of them in Debian, specially in # fpc (at the end of a lot of files there is a multiline comment # with the changelog for the file). # Therefore, assume Pascal if "end." appears anywhere in the file. if (m/end\.\s*$/i) {$found_terminating_end = 1;} # elsif (m/\S/) {$found_terminating_end = 0;} } close(PASCAL_FILE); # Okay, we've examined the entire file looking for clues; # let's use those clues to determine if it's really Pascal: if ( ( ($has_unit || $has_program) && $has_procedure_or_function && $has_begin && $found_terminating_end ) || ( $has_module && $found_terminating_end ) || ( $has_program && $has_begin && $found_terminating_end ) ) {$is_pascal = 1;} $pascal_files{$filename} = $is_pascal; # Store result in cache. return $is_pascal; } sub really_is_incpascal { # Given filename, returns TRUE if its contents really are Pascal. # For .inc files (mainly seen in fpc) my $filename = shift; chomp($filename); # The heuristic is as follows: it is Pacal if any of the following: # 1. really_is_pascal returns true # 2. Any usual reserverd word is found (program, unit, const, begin...) # If the general routine for Pascal files works, we have it if (&really_is_pascal ($filename)) { $pascal_files{$filename} = 1; return 1; } my $is_pascal = 0; # Value to determine. my $found_begin = 0; open(PASCAL_FILE, "<$filename") || die "Can't open $filename to determine if it's pascal.\n"; while() { s/\{.*?\}//g; # Ignore {...} comments on this line; imperfect, but effective. s/\(\*.*?\*\)//g; # Ignore (*...*) comments on this line; imperfect, but effective. if (m/\bprogram\s+[A-Za-z]/i) {$is_pascal=1;} if (m/\bunit\s+[A-Za-z]/i) {$is_pascal=1;} if (m/\bmodule\s+[A-Za-z]/i) {$is_pascal=1;} if (m/\bprocedure\b/i) {$is_pascal = 1; } if (m/\bfunction\b/i) {$is_pascal = 1; } if (m/^\s*interface\s+/i) {$is_pascal = 1; } if (m/^\s*implementation\s+/i) {$is_pascal = 1; } if (m/\bconstant\s+/i) {$is_pascal=1;} if (m/\bbegin\b/i) { $found_begin = 1; } if ((m/end\.\s*$/i) && ($found_begin = 1)) {$is_pascal = 1;} if ($is_pascal) { last; } } close(PASCAL_FILE); $pascal_files{$filename} = $is_pascal; # Store result in cache. return $is_pascal; } # Cache which files are php or not. # Key is the full file pathname; value is 1 if it is (else 0). %php_files = (); sub really_is_php { # Given filename, returns TRUE if its contents really is php. my $filename = shift; chomp($filename); my $is_php = 0; # Value to determine. # Need to find a matching pair of surrounds, with ending after beginning: my $normal_surround = 0; # my $script_surround = 0; # ; bit 0 =