diff options
author | David A. Wheeler <dwheeler@dwheeler.com> | 2013-09-02 20:43:51 -0400 |
---|---|---|
committer | David A. Wheeler <dwheeler@dwheeler.com> | 2013-09-02 20:43:51 -0400 |
commit | a8a34a8b51a40f1db189b5c664139bc59b56ade9 (patch) | |
tree | 9838be49096655d01d277dee2093f11d3366aac8 | |
parent | 798de4ba1bd05f681c7947a1aaa67ac6018e9ae3 (diff) | |
download | sloccount-git-a8a34a8b51a40f1db189b5c664139bc59b56ade9.tar.gz |
Add support for Apache Pig [from Clay B]
- https://sourceforge.net/p/sloccount/patches/16/
-rwxr-xr-x | break_filelist | 48 | ||||
-rw-r--r-- | makefile | 4 | ||||
-rw-r--r-- | makefile.orig | 1 | ||||
-rw-r--r-- | pig_count.c | 225 | ||||
-rw-r--r-- | sloccount.html | 2 | ||||
-rw-r--r-- | testcode/test.pig | 7 |
6 files changed, 287 insertions, 0 deletions
diff --git a/break_filelist b/break_filelist
index 04fda72..4c8ca73 100755
--- a/break_filelist
+++ b/break_filelist
@@ -176,6 +176,7 @@ $noisy = 0; # Set to 1 if you want noisy reports.
   "cob" => "cobol", "cbl" => "cobol",
   "COB" => "cobol", "CBL" => "cobol", # Yes, people do create wokka.CBL files
   "p" => "pascal", "pas" => "pascal", "pp" => "pascal", "dpr" => "pascal",
+  "pig" => "pig", "piglet" => "pig",
   "py" => "python",
   "s" => "asm", "S" => "asm", "asm" => "asm",
   "sh" => "sh", "bash" => "sh",
@@ -611,6 +612,50 @@ sub really_is_php {
 }
 
 
+# Cache which files are pig or not.
+# Key is the full file pathname; value is 1 if it is (else 0).
+%pig_files = ();
+
+sub really_is_pig {
+# Given filename, returns TRUE if its contents really is pig.
+
+  my $filename = shift;
+  chomp($filename);
+
+  my $is_pig = 0; # Value to determine.
+  # Need to find a FOREACH, LOAD or DUMP, and a semicolon
+  my ($script_semicolon, $script_equals, $script_foreach, $script_input_output, $script_dataset) = (0, 0, 0, 0, 0); # Per-file flags; must not leak between files.
+  # Return cached result (positive or negative), if available:
+  if (exists $pig_files{$filename}) { return $pig_files{$filename}; }
+
+  open(PIG_FILE, "<$filename") ||
+    die "Can't open $filename to determine if it's pig.\n";
+  while(<PIG_FILE>) {
+    # most Pig operations need a terminating semicolon and equals
+    # signs to define a relation
+    if (m/;/i) { $script_semicolon |= 1; }
+    if (m/=/i) { $script_equals |= 1; }
+    # all FOREACH's need a GENERATE
+    if (m/FOREACH/i) { $script_foreach |= 1; }
+    if (m/GENERATE/i) { $script_foreach |= 2; }
+    # all LOAD's & DUMP's need a USING
+    if (m/(LOAD|DUMP)/i) { $script_input_output |= 1; }
+    if (m/USING/i) { $script_input_output |= 2; }
+    # all JOIN's, GROUP's & FILTER's need a BY
+    if (m/(JOIN|GROUP|FILTER)/i) { $script_dataset |= 1; }
+    if (m/BY/i) { $script_dataset |= 2; }
+  }
+  close(PIG_FILE);
+
+  if ($script_semicolon == 1 && $script_equals == 1 && ($script_foreach == 3 ||
+      $script_dataset == 3 || $script_input_output == 3)) {
+    $is_pig = 1;
+  }
+
+  $pig_files{$filename} = $is_pig; # Store result in cache.
+
+  return $is_pig;
+}
 
 sub examine_dir {
 # Given a file, determine if there are only C++, OBJC, C, or a mixture
@@ -914,6 +959,9 @@ sub file_type_from_contents() {
     if ($command =~ m/^ruby[0-9\.]*(\.exe)?$/i) {
       return "ruby";
     }
+    if ($command =~ m/^pig[0-9\-\.]*$/) {
+      return "pig";
+    }
     if ($command =~ m/^(tcl|tclsh|bltwish|wish|wishx|WISH)[0-9\.]*(\.exe)?$/i) {
       return "tcl";
     }
diff --git a/makefile b/makefile
--- a/makefile
+++ b/makefile
@@ -81,6 +81,7 @@ COMPILED_EXECUTABLES= \
    lexcount1$(EXE_SUFFIX) \
    pascal_count$(EXE_SUFFIX) \
    php_count$(EXE_SUFFIX) \
+   pig_count$(EXE_SUFFIX) \
    jsp_count$(EXE_SUFFIX) \
    ml_count$(EXE_SUFFIX)
 
@@ -143,6 +144,9 @@ c_count$(EXE_SUFFIX): c_count.c
 
 php_count$(EXE_SUFFIX): php_count.c
 	$(CC) php_count.c -o php_count$(EXE_SUFFIX)
 
+pig_count$(EXE_SUFFIX): pig_count.c
+	$(CC) pig_count.c -o pig_count$(EXE_SUFFIX)
+
 pascal_count.c: pascal_count.l driver.c driver.h
 	flex -Cfe -t pascal_count.l > pascal_count.c
diff --git a/makefile.orig b/makefile.orig
index 7646787..9673db7 100644
--- a/makefile.orig
+++ b/makefile.orig
@@ -105,6 +105,7 @@ EXECUTABLES= \
    get_sloc \
    get_sloc_details \
    haskell_count \
+   innosetup_count \
    javascript_count \
    lex_count \
    lisp_count \
diff --git a/pig_count.c b/pig_count.c
new file mode 100644
index 0000000..4b26535
--- /dev/null
+++ b/pig_count.c
@@ -0,0 +1,225 @@
+/* pig_count: given a list of Apache Pig files on the command line,
+   count the SLOC in each one.  SLOC = physical, non-comment lines.
+   This program knows about C and Pig comments (and how they interact),
+   and correctly ignores comment markers inside strings.
+
+This is part of SLOCCount, a toolsuite that counts source lines of code (SLOC).
+Copyright (C) 2001-2004 David A. Wheeler.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+To contact David A. Wheeler, see his website at:
+ http://www.dwheeler.com.
+
+ Usage: Use in one of the following ways:
+   pig_count                      # As filter
+   pig_count [-f file] [list_of_files]
+     file: file with a list of files to count (if "-", read list from stdin)
+     list_of_files: list of files to count
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+
+/* Modes */
+#define NORMAL 0
+#define INSTRING 1
+#define INCOMMENT 2
+
+/* Types of comments: */
+#define ANSIC_STYLE 0
+#define PIG_STYLE 1
+
+/* Not all C compilers support a boolean type, so for portability's sake,
+   we'll fake it. */
+#define BOOLEAN int
+#define TRUE 1
+#define FALSE 0
+
+
+/* Globals */
+long total_sloc;
+
+static BOOLEAN warn_embedded_newlines = FALSE;
+
+int peek(FILE *stream) {
+  int c = getc(stream);
+  ungetc(c, stream);
+  return c;
+}
+
+int ispeek(int c, FILE *stream) {
+  if (c == peek(stream)) {return 1;}
+  return 0;
+}
+
+long line_number;
+
+int getachar(FILE *stream) {
+/* Like getchar(), but keep track of line number. */
+  static BOOLEAN last_char_was_newline = 0;
+  int c;
+
+  c = getc(stream);
+  if (last_char_was_newline) line_number++;
+  if (c == '\n') last_char_was_newline=1;
+  else           last_char_was_newline=0;
+  return c;
+}
+
+
+long sloc_count(char *filename, FILE *stream) {
+  /* Count the SLOC of the program read from stream; filename is used only in diagnostics. */
+
+  long sloc = 0;
+
+  int sawchar = 0;                /* Did you see a character on this line? */
+  int c;
+  int mode = NORMAL;              /* NORMAL, INSTRING, or INCOMMENT */
+  int comment_type = ANSIC_STYLE; /* ANSIC_STYLE or PIG_STYLE */
+
+
+  /* The following implements a state machine with transitions; the
+     main state is "mode" and "comment_type", the transitions are
+     triggered by characters input. */
+
+  while ( (c = getachar(stream)) != EOF) {
+    if (mode == NORMAL) {
+      if (c == '"') {sawchar=1; mode = INSTRING;}
+      else if (c == '\'') {  /* Consume a single-quoted 'xxxx' literal, including the empty '' */
+        sawchar=1;
+        c = getachar(stream);
+        while ((c != '\'') && (c != '\n') && (c != EOF)) {
+          if ((c == '\\') && !ispeek('\n', stream)) c = getachar(stream);  /* Skip escaped char, but never hide a newline */
+          c = getachar(stream);
+        }
+      } else if ((c == '/') && ispeek('*', stream)) {
+        c = getachar(stream);
+        mode = INCOMMENT;
+        comment_type = ANSIC_STYLE;
+      } else if ((c == '-') && ispeek('-', stream)) {
+        c = getachar(stream);
+        mode = INCOMMENT;
+        comment_type = PIG_STYLE;
+      } else if (!isspace(c)) {sawchar = 1;}
+    } else if (mode == INSTRING) {
+      /* We only count string lines with non-whitespace -- this is to
+         gracefully handle syntactically invalid programs.
+         You could argue that multiline strings with whitespace are
+         still executable and should be counted. */
+      if (!isspace(c)) sawchar = 1;
+      if (c == '"') {mode = NORMAL;}
+      else if ((c == '\\') && (ispeek('\"', stream) || ispeek('\\', stream))) {c = getachar(stream);}
+      else if ((c == '\\') && ispeek('\n', stream)) {c = getachar(stream);}
+      else if ((c == '\n') && warn_embedded_newlines) {
+        /* We found a bare newline in a string without preceding backslash. */
+        fprintf(stderr, "pig_count WARNING - newline in string, line %ld, file %s\n", line_number, filename);
+        /* We COULD warn & reset mode to "Normal", but lots of code does this,
+           so we'll just depend on the warning for ending the program
+           in a string to catch syntactically erroneous programs. */
+      }
+    } else { /* INCOMMENT mode */
+      if ((c == '\n') && (comment_type == PIG_STYLE)) { mode = NORMAL;}
+      if ((comment_type == ANSIC_STYLE) && (c == '*') &&
+          ispeek('/', stream)) { c= getachar(stream); mode = NORMAL;}
+    }
+    if (c == '\n') {
+      if (sawchar) sloc++;
+      sawchar = 0;
+    }
+  }
+  /* We're done with the file.  Handle EOF-without-EOL. */
+  if (sawchar) sloc++;
+  sawchar = 0;
+  if ((mode == INCOMMENT) && (comment_type == PIG_STYLE)) { mode = NORMAL;}
+
+  if (mode == INCOMMENT) {
+    fprintf(stderr, "pig_count ERROR - terminated in comment in %s\n", filename);
+  } else if (mode == INSTRING) {
+    fprintf(stderr, "pig_count ERROR - terminated in string in %s\n", filename);
+  }
+
+  return sloc;
+}
+
+
+void count_file(char *filename) {
+  long sloc;
+  FILE *stream = fopen(filename, "r");
+
+  if (!stream) { fprintf(stderr, "pig_count ERROR - cannot open %s\n", filename); return; }  /* Skip unreadable files instead of crashing */
+  line_number = 1;
+  sloc = sloc_count(filename, stream);
+  total_sloc += sloc;
+  printf("%ld %s\n", sloc, filename);
+  fclose(stream);
+}
+
+char *read_a_line(FILE *file) {
+ /* Read a line in, and return a malloc'ed buffer with the line contents.
+    Any newline at the end is stripped.
+    If there's nothing left to read, returns NULL. */
+
+ /* We'll create a monstrously long buffer to make life easy for us: */
+ char buffer[10000];
+ char *returnval;
+ char *newlinepos;
+
+ returnval = fgets(buffer, sizeof(buffer), file);
+ if (returnval) {
+   newlinepos = strchr(buffer, '\n');  /* Safe even if fgets produced an empty string */
+   if (newlinepos) {*newlinepos = '\0';}
+   return strdup(buffer);
+ } else {
+   return NULL;
+ }
+}
+
+
+int main(int argc, char *argv[]) {
+ long sloc;
+ int i;
+ FILE *file_list;
+ char *s;
+
+ total_sloc = 0;
+ line_number = 1;
+
+ if (argc <= 1) {
+   sloc = sloc_count("-", stdin);
+   printf("%ld %s\n", sloc, "-");
+   total_sloc += sloc;
+ } else if ((argc == 3) && (!strcmp(argv[1], "-f"))) {
+   if (!strcmp (argv[2], "-")) {
+     file_list = stdin;
+   } else {
+     file_list = fopen(argv[2], "r");
+   }
+   if (file_list) {
+     while ((s = read_a_line(file_list))) {
+       count_file(s);
+       free(s);
+     }
+   }
+ } else {
+   for (i=1; i < argc; i++) { count_file(argv[i]); }
+ }
+ printf("Total:\n");
+ printf("%ld\n", total_sloc);
+ return 0; /* Report success */
+}
+
diff --git a/sloccount.html b/sloccount.html
index 233ae9a..7b74b18 100644
--- a/sloccount.html
+++ b/sloccount.html
@@ -240,6 +240,7 @@ listed in brackets:
 <li>Pascal (.p, .pas) [pascal]
 <li>Perl (.pl, .pm, .perl) [perl]
 <li>PHP (.php, .php[3456], .inc) [php]
+<li>Pig (.pig, .piglet) [pig]
 <li>Python (.py) [python]
 <li>Ruby (.rb) [ruby]
 <li>sed (.sed) [sed]
@@ -1468,6 +1469,7 @@ modula3_count,
 objc_count,
 pascal_count,
 perl_count,
+pig_count,
 python_count,
 sed_count,
 sh_count,
diff --git a/testcode/test.pig b/testcode/test.pig
new file mode 100644
index 0000000..38de841
--- /dev/null
+++ b/testcode/test.pig
@@ -0,0 +1,7 @@
+--This is a short example Pig script
+
+/* This should have only two lines of actual code */
+
+foo = LOAD '/etc/passwd' USING PigStorage(':');
+
+DUMP foo