summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid A. Wheeler <dwheeler@dwheeler.com>2013-09-02 20:43:51 -0400
committerDavid A. Wheeler <dwheeler@dwheeler.com>2013-09-02 20:43:51 -0400
commita8a34a8b51a40f1db189b5c664139bc59b56ade9 (patch)
tree9838be49096655d01d277dee2093f11d3366aac8
parent798de4ba1bd05f681c7947a1aaa67ac6018e9ae3 (diff)
downloadsloccount-git-a8a34a8b51a40f1db189b5c664139bc59b56ade9.tar.gz
Add support for Apache Pig [from Clay B]
- https://sourceforge.net/p/sloccount/patches/16/
-rwxr-xr-xbreak_filelist48
-rw-r--r--makefile4
-rw-r--r--makefile.orig1
-rw-r--r--pig_count.c225
-rw-r--r--sloccount.html2
-rw-r--r--testcode/test.pig7
6 files changed, 287 insertions, 0 deletions
diff --git a/break_filelist b/break_filelist
index 04fda72..4c8ca73 100755
--- a/break_filelist
+++ b/break_filelist
@@ -176,6 +176,7 @@ $noisy = 0; # Set to 1 if you want noisy reports.
"cob" => "cobol", "cbl" => "cobol",
"COB" => "cobol", "CBL" => "cobol", # Yes, people do create wokka.CBL files
"p" => "pascal", "pas" => "pascal", "pp" => "pascal", "dpr" => "pascal",
+ "pig" => "pig", "piglet" => "pig",
"py" => "python",
"s" => "asm", "S" => "asm", "asm" => "asm",
"sh" => "sh", "bash" => "sh",
@@ -611,6 +612,50 @@ sub really_is_php {
}
+# Cache which files are pig or not.
+# Key is the full file pathname; value is 1 if it is (else 0).
+%pig_files = ();
+
+sub really_is_pig {
+# Given filename, returns 1 if its contents really look like Pig, else 0.
+# This is a heuristic: the file must contain a semicolon and an equals
+# sign, plus at least one complete keyword pairing (FOREACH+GENERATE,
+# LOAD/DUMP+USING, or JOIN/GROUP/FILTER+BY) somewhere in the file.
+# Results are remembered in %pig_files, keyed by the full pathname.
+# Dies if the file cannot be opened.
+
+  my $filename = shift;
+  chomp($filename);
+
+  # Return cached result, if available.  "exists" is used so that a cached
+  # negative answer (0) is reused too instead of rescanning the file.
+  if (exists $pig_files{$filename}) { return $pig_files{$filename}; }
+
+  my $is_pig = 0;      # Value to determine.
+
+  # Per-file evidence flags.  They must be lexical and start at zero for
+  # every file; otherwise keywords seen in one file would make every
+  # later file look like Pig.
+  my $script_semicolon = 0;     # 1 once a ';' has been seen
+  my $script_equals = 0;        # 1 once a '=' has been seen
+  my $script_foreach = 0;       # bit 1: FOREACH, bit 2: GENERATE
+  my $script_input_output = 0;  # bit 1: LOAD or DUMP, bit 2: USING
+  my $script_dataset = 0;       # bit 1: JOIN/GROUP/FILTER, bit 2: BY
+
+  open(PIG_FILE, "<$filename") ||
+    die "Can't open $filename to determine if it's pig.\n";
+  while(<PIG_FILE>) {
+    # most Pig operations need a terminating semicolon and equals
+    # signs to define a relation
+    if (m/;/) { $script_semicolon |= 1; }
+    if (m/=/) { $script_equals |= 1; }
+    # FOREACH is paired with GENERATE
+    if (m/FOREACH/i) { $script_foreach |= 1; }
+    if (m/GENERATE/i) { $script_foreach |= 2; }
+    # LOAD and DUMP are paired with USING
+    if (m/(LOAD|DUMP)/i) { $script_input_output |= 1; }
+    if (m/USING/i) { $script_input_output |= 2; }
+    # JOIN, GROUP and FILTER are paired with BY
+    if (m/(JOIN|GROUP|FILTER)/i) { $script_dataset |= 1; }
+    if (m/BY/i) { $script_dataset |= 2; }
+  }
+  close(PIG_FILE);
+
+  # A pairing is complete when both of its bits are set (value 3).
+  if ($script_semicolon == 1 && $script_equals == 1 &&
+      ($script_foreach == 3 || $script_dataset == 3 ||
+       $script_input_output == 3)) {
+    $is_pig = 1;
+  }
+
+  $pig_files{$filename} = $is_pig; # Store result in cache.
+
+  return $is_pig;
+}
sub examine_dir {
# Given a file, determine if there are only C++, OBJC, C, or a mixture
@@ -914,6 +959,9 @@ sub file_type_from_contents() {
if ($command =~ m/^ruby[0-9\.]*(\.exe)?$/i) {
return "ruby";
}
+ if ($command =~ m/^pig[0-9-\.]*/) {
+ return "pig";
+ }
if ($command =~ m/^(tcl|tclsh|bltwish|wish|wishx|WISH)[0-9\.]*(\.exe)?$/i) {
return "tcl";
}
diff --git a/makefile b/makefile
index 9673db7..117341b 100644
--- a/makefile
+++ b/makefile
@@ -81,6 +81,7 @@ COMPILED_EXECUTABLES= \
lexcount1$(EXE_SUFFIX) \
pascal_count$(EXE_SUFFIX) \
php_count$(EXE_SUFFIX) \
+ pig_count$(EXE_SUFFIX) \
jsp_count$(EXE_SUFFIX) \
ml_count$(EXE_SUFFIX)
@@ -143,6 +144,9 @@ c_count$(EXE_SUFFIX): c_count.c
php_count$(EXE_SUFFIX): php_count.c
$(CC) php_count.c -o php_count$(EXE_SUFFIX)
+# pig_count is built from its own source file, pig_count.c.
+pig_count$(EXE_SUFFIX): pig_count.c
+	$(CC) pig_count.c -o pig_count$(EXE_SUFFIX)
+
pascal_count.c: pascal_count.l driver.c driver.h
flex -Cfe -t pascal_count.l > pascal_count.c
diff --git a/makefile.orig b/makefile.orig
index 7646787..9673db7 100644
--- a/makefile.orig
+++ b/makefile.orig
@@ -105,6 +105,7 @@ EXECUTABLES= \
get_sloc \
get_sloc_details \
haskell_count \
+ innosetup_count \
javascript_count \
lex_count \
lisp_count \
diff --git a/pig_count.c b/pig_count.c
new file mode 100644
index 0000000..4b26535
--- /dev/null
+++ b/pig_count.c
@@ -0,0 +1,225 @@
+/* pig_count: given a list of Apache Pig files on the command line,
+ count the SLOC in each one. SLOC = physical, non-comment lines.
+ This program knows about C and Pig comments (and how they interact),
+ and correctly ignores comment markers inside strings.
+
+This is part of SLOCCount, a toolsuite that counts source lines of code (SLOC).
+Copyright (C) 2001-2004 David A. Wheeler.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+To contact David A. Wheeler, see his website at:
+ http://www.dwheeler.com.
+
+ Usage: Use in one of the following ways:
+ pig_count # As filter
+ pig_count [-f file] [list_of_files]
+ file: file with a list of files to count (if "-", read list from stdin)
+ list_of_files: list of files to count
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+
+/* Modes: the scanner state held in sloc_count()'s "mode" variable. */
+#define NORMAL 0      /* ordinary code, outside any string or comment */
+#define INSTRING 1    /* inside a double-quoted string */
+#define INCOMMENT 2   /* inside a comment; see the comment types below */
+
+/* Types of comments (only meaningful while the mode is INCOMMENT): */
+#define ANSIC_STYLE 0   /* opened by slash-star, closed by star-slash */
+#define PIG_STYLE 1     /* opened by "--", runs to the end of the line */
+
+/* Not all C compilers support a boolean type, so for portability's sake,
+ we'll fake it. */
+#define BOOLEAN int
+#define TRUE 1
+#define FALSE 0
+
+
+/* Globals */
+/* Running sum of the SLOC of every input counted so far; main() prints it
+   after the "Total:" line. */
+long total_sloc;
+
+/* When TRUE, sloc_count() warns on stderr about a bare newline inside a
+   double-quoted string.  Nothing in the visible code sets it to TRUE. */
+static BOOLEAN warn_embedded_newlines = FALSE;
+
+/* Return the next character of stream without consuming it, or EOF when
+   nothing is left to read. */
+int peek(FILE *stream) {
+  int next = getc(stream);
+
+  if (next != EOF) {
+    /* Pushing EOF back would be a no-op, so only real characters are
+       returned to the stream. */
+    ungetc(next, stream);
+  }
+  return next;
+}
+
+/* Return 1 if the next unread character of stream equals c, else 0.
+   Nothing is consumed from the stream. */
+int ispeek(int c, FILE *stream) {
+  return (peek(stream) == c) ? 1 : 0;
+}
+
+long line_number;
+
+int getachar(FILE *stream) {
+/* Read one character like getc(), keeping line_number up to date.
+   The counter is bumped on the first read that FOLLOWS a newline, so the
+   newline character itself is still attributed to the line it ends. */
+  static BOOLEAN newline_pending = 0;
+  int ch = getc(stream);
+
+  if (newline_pending) { line_number++; }
+  newline_pending = (ch == '\n') ? 1 : 0;
+  return ch;
+}
+
+
+/* Skip the rest of a single-quoted literal whose opening quote has just
+   been read.  A backslash escapes the character after it, so an escaped
+   quote does not end the literal.  Scanning stops at the closing quote,
+   at a newline (unterminated literal) or at EOF; the character that
+   stopped the scan is returned so the caller can do its end-of-line
+   bookkeeping. */
+static int skip_single_quoted(FILE *stream) {
+  int c;
+
+  for (;;) {
+    c = getachar(stream);
+    if ((c == '\'') || (c == '\n') || (c == EOF)) return c;
+    if ((c == '\\') && !ispeek('\n', stream) && !ispeek(EOF, stream)) {
+      (void) getachar(stream);  /* swallow the escaped character */
+    }
+  }
+}
+
+long sloc_count(char *filename, FILE *stream) {
+  /* Count the SLOC (physical lines holding something other than
+     whitespace and comments) read from stream until EOF, and return it.
+     filename is used only in diagnostics.  A message is written to stderr
+     if the input ends inside a block comment or a double-quoted string. */
+
+  long sloc = 0;
+
+  int sawchar = 0;                /* Did you see a character on this line? */
+  int c;
+  int mode = NORMAL;              /* NORMAL, INSTRING, or INCOMMENT */
+  int comment_type = ANSIC_STYLE; /* ANSIC_STYLE or PIG_STYLE */
+
+
+  /* The following implements a state machine with transitions; the
+     main state is "mode" and "comment_type", the transitions are
+     triggered by characters input. */
+
+  while ( (c = getachar(stream)) != EOF) {
+    if (mode == NORMAL) {
+      if (c == '"') {sawchar=1; mode = INSTRING;}
+      else if (c == '\'') {
+        /* Consume the whole 'xxxx' literal here so that comment markers
+           inside it are not acted upon.  c becomes the character that
+           ended the scan; if that is a newline, the end-of-line check
+           below still sees it. */
+        sawchar=1;
+        c = skip_single_quoted(stream);
+      } else if ((c == '/') && ispeek('*', stream)) {
+        c = getachar(stream);
+        mode = INCOMMENT;
+        comment_type = ANSIC_STYLE;
+      } else if ((c == '-') && ispeek('-', stream)) {
+        c = getachar(stream);
+        mode = INCOMMENT;
+        comment_type = PIG_STYLE;
+      } else if (!isspace(c)) {sawchar = 1;}
+    } else if (mode == INSTRING) {
+      /* We only count string lines with non-whitespace -- this is to
+         gracefully handle syntactically invalid programs.
+         You could argue that multiline strings with whitespace are
+         still executable and should be counted. */
+      if (!isspace(c)) sawchar = 1;
+      if (c == '"') {mode = NORMAL;}
+      else if ((c == '\\') && (ispeek('\"', stream) || ispeek('\\', stream))) {c = getachar(stream);}
+      else if ((c == '\\') && ispeek('\n', stream)) {c = getachar(stream);}
+      else if ((c == '\n') && warn_embedded_newlines) {
+        /* We found a bare newline in a string without preceding backslash. */
+        fprintf(stderr, "pig_count WARNING - newline in string, line %ld, file %s\n", line_number, filename);
+        /* We COULD warn & reset mode to "Normal", but lots of code does this,
+           so we'll just depend on the warning for ending the program
+           in a string to catch syntactically erroneous programs. */
+      }
+    } else { /* INCOMMENT mode */
+      if ((c == '\n') && (comment_type == PIG_STYLE)) { mode = NORMAL;}
+      if ((comment_type == ANSIC_STYLE) && (c == '*') &&
+          ispeek('/', stream)) { c= getachar(stream); mode = NORMAL;}
+    }
+    if (c == '\n') {
+      if (sawchar) sloc++;
+      sawchar = 0;
+    }
+  }
+  /* We're done with the file.  Handle EOF-without-EOL. */
+  if (sawchar) sloc++;
+  sawchar = 0;
+  /* A "--" comment running up to EOF is complete, not an error. */
+  if ((mode == INCOMMENT) && (comment_type == PIG_STYLE)) { mode = NORMAL;}
+
+  if (mode == INCOMMENT) {
+    fprintf(stderr, "pig_count ERROR - terminated in comment in %s\n", filename);
+  } else if (mode == INSTRING) {
+    fprintf(stderr, "pig_count ERROR - terminated in string in %s\n", filename);
+  }
+
+  return sloc;
+}
+
+
+/* Count the SLOC of one named file, print "<sloc> <filename>" on stdout
+   and add the result to total_sloc.  A file that cannot be opened is
+   reported on stderr and skipped, rather than handing a NULL stream to
+   sloc_count() and fclose(). */
+void count_file(char *filename) {
+  long sloc;
+  FILE *stream;
+
+  stream = fopen(filename, "r");
+  if (!stream) {
+    fprintf(stderr, "pig_count ERROR - cannot open file %s\n", filename);
+    return;
+  }
+  line_number = 1;  /* diagnostics count lines from the top of each file */
+  sloc = sloc_count(filename, stream);
+  total_sloc += sloc;
+  printf("%ld %s\n", sloc, filename);
+  fclose(stream);
+}
+
+char *read_a_line(FILE *file) {
+  /* Read a line in, and return a malloc'ed buffer with the line contents;
+     the caller must free() it.
+     Any newline at the end is stripped.
+     If there's nothing left to read (or memory runs out), returns NULL.
+     Lines longer than the internal buffer come back in several pieces. */
+
+  /* We'll create a monstrously long buffer to make life easy for us: */
+  char buffer[10000];
+  size_t length;
+  char *copy;
+
+  if (!fgets(buffer, sizeof(buffer), file)) return NULL;
+
+  /* fgets() can hand back a zero-length string (a line that starts with a
+     NUL byte), so check the length before looking at the last character. */
+  length = strlen(buffer);
+  if ((length > 0) && (buffer[length - 1] == '\n')) {
+    buffer[--length] = '\0';
+  }
+
+  /* Copy by hand: strdup() is not part of ISO C before C23. */
+  copy = malloc(length + 1);
+  if (copy) memcpy(copy, buffer, length + 1);
+  return copy;
+}
+
+
+/* Entry point.  With no arguments, count stdin as a single input named
+   "-".  With "-f FILE", read the names of the files to count from FILE,
+   one per line ("-" means read the list from stdin).  Otherwise every
+   argument is a file to count.  A "Total:" line and the grand total are
+   always printed last. */
+int main(int argc, char *argv[]) {
+  long sloc;
+  int i;
+  FILE *file_list;
+  char *s;
+
+  total_sloc = 0;
+  line_number = 1;
+
+  if (argc <= 1) {
+    sloc = sloc_count("-", stdin);
+    printf("%ld %s\n", sloc, "-");
+    total_sloc += sloc;
+  } else if ((argc == 3) && (!strcmp(argv[1], "-f"))) {
+    if (!strcmp (argv[2], "-")) {
+      file_list = stdin;
+    } else {
+      file_list = fopen(argv[2], "r");
+    }
+    if (file_list) {
+      while ((s = read_a_line(file_list))) {
+        count_file(s);
+        free(s);
+      }
+      /* Close only what we opened ourselves. */
+      if (file_list != stdin) fclose(file_list);
+    } else {
+      /* Say why nothing was counted instead of silently reporting 0. */
+      fprintf(stderr, "pig_count ERROR - cannot open file list %s\n", argv[2]);
+    }
+  } else {
+    for (i=1; i < argc; i++) { count_file(argv[i]); }
+  }
+  printf("Total:\n");
+  printf("%ld\n", total_sloc);
+  return 0; /* Report success */
+}
+
diff --git a/sloccount.html b/sloccount.html
index 233ae9a..7b74b18 100644
--- a/sloccount.html
+++ b/sloccount.html
@@ -240,6 +240,7 @@ listed in brackets:
<li>Pascal (.p, .pas) [pascal]
<li>Perl (.pl, .pm, .perl) [perl]
<li>PHP (.php, .php[3456], .inc) [php]
+<li>Pig (.pig, .piglet) [pig]
<li>Python (.py) [python]
<li>Ruby (.rb) [ruby]
<li>sed (.sed) [sed]
@@ -1468,6 +1469,7 @@ modula3_count,
objc_count,
pascal_count,
perl_count,
+pig_count,
python_count,
sed_count,
sh_count,
diff --git a/testcode/test.pig b/testcode/test.pig
new file mode 100644
index 0000000..38de841
--- /dev/null
+++ b/testcode/test.pig
@@ -0,0 +1,7 @@
+--This is a short example Pig script
+
+/* This should have only two lines of actual code */
+
+foo = LOAD '/etc/passwd' USING PigStorage(':');
+
+DUMP foo