Added CR | LF | CR LF handling.

Handle space before/after trailing . Various bits of tidying
author: Dave Beckett <dave@dajobe.org> 2001-07-22 20:05:52 +0000
committer: Dave Beckett <dave@dajobe.org> 2001-07-22 20:05:52 +0000
commit: a61fb179c2a8f185bb33cb37c6a83f6695efcda8 (patch)
tree: 130b3ebb33ce6bc5b98a3df4a507788050894b01
parent: f39d9113e7a212baf38b5fd3b0e4bac79693c044 (diff)
download: raptor-a61fb179c2a8f185bb33cb37c6a83f6695efcda8.tar.gz
1 files changed, 129 insertions, 105 deletions
diff --git a/src/ntriples_parse.c b/src/ntriples_parse.c
index 6a31d22c..b6250e54 100644
--- a/src/ntriples_parse.c
+++ b/src/ntriples_parse.c
@@ -295,23 +295,36 @@ raptor_ntriples_generate_statement(raptor_ntriples_parser *parser,
 
 
 static int
-ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len) 
+raptor_ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, 
+                            int len) 
 {
   int i;
   char *p;
-  char *q=NULL; /* keeps gcc -Wall happy */
+  char *start=NULL; /* keeps gcc -Wall happy */
   char *dest;
-  char c;
+  char c = '\0';
   char *terms[3];
   int term_lengths[3];
   raptor_ntriples_term_type term_types[3];
   int backslash=0;
+
+  /* ASSERTION:
+   * p always points to first char we are considering
+   * p[len-1] always points to last char
+   */
   
   /* Handle empty  lines */
   if(!len)
     return 0;
+
+#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
+  LIBRDF_DEBUG3(raptor_ntriples_parse_line,
+                "handling line '%s' (%d bytes)\n", 
+                buffer, len);
+#endif
   
   p=buffer;
+
   while(len>0 && isspace(*p)) {
     p++;
     parser->locator.column++;
@@ -327,13 +340,28 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
   if(*p == '#')
     return 0;
   
+  /* Remove trailing spaces */
+  while(len>0 && isspace(p[len-1])) {
+    p[len-1]='\0';
+    len--;
+  }
+
+  /* can't be empty now - that would have been caught above */
+  
+  /* Check for terminating '.' */
+  if(p[len-1] != '.') {
+    raptor_ntriples_parser_fatal_error(parser, "Missing . at end of line");
+    return 1;
+  }
+
+  p[len-1]='\0';
+  len--;
+
 
   /* Must be triple */
 
   for(i=0; i<3; i++) {
-    int tlen;
-    
-    if(!*p) {
+    if(!len) {
       raptor_ntriples_parser_fatal_error(parser, "Unexpected end of line");
       return 1;
     }
@@ -341,12 +369,12 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
     /* Expect either <anonURI> or _:name */
     if(i == 2) {
       if(*p != '<' && *p != '_' && *p != '"') {
-        raptor_ntriples_parser_fatal_error(parser, "Expected <URIref>, _:anonNode or \"literal\"");
+        raptor_ntriples_parser_fatal_error(parser, "Saw '%c', expected <URIref>, _:anonNode or \"literal\"", *p);
         return 1;
       }
     } else {
       if(*p != '<' && *p != '_') {
-        raptor_ntriples_parser_fatal_error(parser, "Expected <URIref> or _:anonNode");
+        raptor_ntriples_parser_fatal_error(parser, "Saw '%c', expected <URIref> or _:anonNode", *p);
         return 1;
       }
     }
@@ -356,27 +384,45 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_URI_REF;
         
         p++;
+        len--;
         parser->locator.column++;
         parser->locator.byte++;
-        len--;
 
-        q=p;
-        while(*q && *q != '>')
-          q++;
+        start=p;
+        while(len > 0 && *p != '>') {
+          p++;
+          len--;
+          parser->locator.column++;
+          parser->locator.byte++;
+        }
+
+        if(!len) {
+          raptor_ntriples_parser_fatal_error(parser, "Missing end > for URI");
+          return 1;
+        }
+
         break;
 
       case '"':
         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
         
+        start=p;
+        dest=p;
+
         p++;
+        len--;
         parser->locator.column++;
         parser->locator.byte++;
-        len--;
 
         /* find end of string, fixing backslashed characters on the way */
-        q=p;
-        dest=p;
-        while((c = *q++)) {
+        while(len > 0) {
+          c = *p;
+
+          p++;
+          len--;
+          parser->locator.column++;
+          parser->locator.byte++;
+
           if(c == '\\') {
             if(backslash) {
               *dest++='\\';
@@ -400,7 +446,7 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
                 *dest++='\t';
                 break;
               default:
-                raptor_ntriples_parser_fatal_error(parser, "Illegal string escape \\%c in \"%s\"", c, p);
+                raptor_ntriples_parser_fatal_error(parser, "Illegal string escape \\%c in \"%s\"", c, start);
                 break;
             }
             backslash=0;
@@ -417,27 +463,44 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
           /* otherwise store and move on */
           *dest++=c;
         } /* end while */
+
+        if(c != '"') {
+          raptor_ntriples_parser_fatal_error(parser, "Missing end \" for literal");
+          return 1;
+        }
         
         break;
 
+
       case '_':
         term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_ANON_NODE;
 
-        q=p;
-        q++;
-        if(!*q || (*q && *q != ':')) {
-          if(*q) {
-            parser->locator.column++;
-            parser->locator.byte++;
-          }
+        /* NOTE: here: start includes _ */
+        start=p;
+
+        p++;
+        len--;
+        parser->locator.column++;
+        parser->locator.byte++;
+
+        if(!len || (len > 0 && *p != ':')) {
           raptor_ntriples_parser_fatal_error(parser, "Illegal anonNode _ not followed by :");
           return 1;
         }
         /* Found ':' - move on */
-        q++;
 
-        while(*q && isalnum(*q))
-          q++;
+        p++;
+        len--;
+        parser->locator.column++;
+        parser->locator.byte++;
+
+        while(len>0 && isalnum(*p)) {
+          p++;
+          len--;
+          parser->locator.column++;
+          parser->locator.byte++;
+        }
+
         break;
 
       default:
@@ -446,61 +509,30 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
     }
 
 
-    /* Move locator to last char */
-    tlen=q-p-1;
-    parser->locator.column +=tlen;
-    parser->locator.byte   +=tlen;
-
-    if(!*q) {
-      switch(term_types[i]) {
-        case RAPTOR_NTRIPLES_TERM_TYPE_URI_REF:
-          raptor_ntriples_parser_fatal_error(parser, "Missing end > for URI");
-          break;
-        case RAPTOR_NTRIPLES_TERM_TYPE_ANON_NODE:
-          raptor_ntriples_parser_fatal_error(parser, "Missing space after anonNode");
-          break;
-        case RAPTOR_NTRIPLES_TERM_TYPE_LITERAL:
-          raptor_ntriples_parser_fatal_error(parser, "Missing end \" for literal");
-          break;
-        default:
-          raptor_ntriples_parser_fatal_error(parser, "Illegal term type %d", 
-                                             term_types[i]);
-      }
-      return 1;
-    }
-
-
     /* Replace
      *   end '>' for <URIref>
      *   whitespace after _:anonNode
      * with '\0' to terminate string
+     * and move to char after delimiter
      */
-    if(term_types[i] != RAPTOR_NTRIPLES_TERM_TYPE_LITERAL)
-      *q='\0';
-
-    /* Store term */
-    terms[i]=p; term_lengths[i]=tlen+1;
-
-    /* move to last char seen
-     *   for literal and anonNode, is char after end
-     *   for <URIref> is '>'
-     */
-    p=q;
-
-    /* move to char after delimter for non-literal */
-    if(term_types[i] != RAPTOR_NTRIPLES_TERM_TYPE_LITERAL) {
+    if(len>0 && term_types[i] != RAPTOR_NTRIPLES_TERM_TYPE_LITERAL) {
+      *p='\0';
       p++;
       len--;
       parser->locator.column++;
       parser->locator.byte++;
     }
     
+    /* Store term */
+    terms[i]=start; term_lengths[i]=(p-start);
+
+    
     /* Skip whitespace between parts */
     while(len>0 && isspace(*p)) {
       p++;
+      len--;
       parser->locator.column++;
       parser->locator.byte++;
-      len--;
     }
 
 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
@@ -510,21 +542,8 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
 #endif    
   }
 
-  /* Check for terminating '.' */
-  if(!*p || (*p && *p != '.')) {
-    raptor_ntriples_parser_fatal_error(parser, "Missing . at end of line");
-    return 1;
-  }
-
-  do {
-    p++;
-    parser->locator.column++;
-    parser->locator.byte++;
-    len--;
-  } while(len>0 && isspace(*p));
-
-  if(*p) {
-    raptor_ntriples_parser_fatal_error(parser, "Extra junk after .");
+  if(len) {
+    raptor_ntriples_parser_fatal_error(parser, "Extra junk before .: '%s' (%d bytes)", p, len);
     return 1;
   }
   
@@ -534,12 +553,6 @@ ntriples_parse_line (raptor_ntriples_parser* parser, char *buffer, int len)
                                      terms[1], term_types[1],
                                      terms[2], term_types[2]);
 
-#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
-  fprintf(stderr, "%d: '", parser->locator.line);
-  fwrite(p, 1, len, stderr);
-  fprintf(stderr, "' (%d bytes)\n", len);
-#endif
-
   parser->locator.byte += len;
 
   return 0;
@@ -553,9 +566,10 @@ raptor_ntriples_parse(raptor_ntriples_parser* parser, char *s, int len,
   char *buffer;
   char *ptr;
   char *start;
-
+  char last_nl;
+  
 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
-  LIBRDF_DEBUG2(ntriples_parse, "adding %d data bytes\n", len);
+  LIBRDF_DEBUG2(raptor_ntriples_parse, "adding %d bytes to line buffer\n", len);
 #endif
 
   buffer=(char*)LIBRDF_MALLOC(cstring, parser->line_length + len + 1);
@@ -583,42 +597,52 @@ raptor_ntriples_parse(raptor_ntriples_parser* parser, char *s, int len,
   *ptr = '\0';
 
 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
-  LIBRDF_DEBUG2(raptor_xml_cdata_handler, 
-                "line buffer now %d bytes\n", 
-                parser->line_length);
+  LIBRDF_DEBUG3(raptor_ntriples_parse,
+                "line buffer now '%s' (%d bytes)\n", 
+                parser->line, parser->line_length);
 #endif
 
+  last_nl='\n';  /* last newline character - \r triggers check */
+
   ptr=buffer+parser->offset;
   start=ptr;
   while(*ptr) {
-    if(*ptr != '\n') {
+    /* skip \n when just seen \r - i.e. \r\n or CR LF */
+    if(last_nl == '\r' && *ptr == '\n') {
       ptr++;
-      continue;
+      parser->locator.byte++;
     }
+    
+    while(*ptr && *ptr != '\n' && *ptr != '\r')
+      ptr++;
+
+    /* keep going - no newline yet */
+    if(!*ptr && !is_end)
+      break;
+
+    last_nl=*ptr;
 
     len=ptr-start;
-    if(len>0 && ptr[-1] == '\r')
-      len--;
     parser->locator.column=0;
 
     *ptr='\0';
-    if(ntriples_parse_line(parser,start,len))
+    if(raptor_ntriples_parse_line(parser,start,len))
       return 1;
     
-    /* go past '\r' in bytes only */
-    if(ptr[-1] == '\r')
-      parser->locator.byte++;
-
     parser->locator.line++;
 
-    /* go past '\n' */
+    /* go past newline */
     ptr++;
     parser->locator.byte++;
 
     start=ptr;
   }
-  parser->offset=start-buffer;
 
+  /* exit now, no more input */
+  if(is_end)
+    return 0;
+    
+  parser->offset=start-buffer;
 
   len=parser->line_length - parser->offset;
     
@@ -626,7 +650,7 @@ raptor_ntriples_parse(raptor_ntriples_parser* parser, char *s, int len,
     /* collapse buffer */
 
 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
-    LIBRDF_DEBUG3(raptor_xml_cdata_handler, 
+    LIBRDF_DEBUG3(raptor_ntriples_parse,
                   "collapsing line buffer from %d to %d bytes\n", 
                   parser->line_length, len);
 #endif
@@ -646,7 +670,7 @@ raptor_ntriples_parse(raptor_ntriples_parser* parser, char *s, int len,
     parser->offset=0;
 
 #if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
-    LIBRDF_DEBUG3(raptor_xml_cdata_handler, 
+    LIBRDF_DEBUG3(raptor_ntriples_parse,
                   "line buffer now '%s' (%d bytes)\n", 
                   parser->line, parser->line_length);
 #endif
author	Dave Beckett <dave@dajobe.org>	2001-07-22 20:05:52 +0000
committer	Dave Beckett <dave@dajobe.org>	2001-07-22 20:05:52 +0000
commit	a61fb179c2a8f185bb33cb37c6a83f6695efcda8 (patch)
tree	130b3ebb33ce6bc5b98a3df4a507788050894b01
parent	f39d9113e7a212baf38b5fd3b0e4bac79693c044 (diff)
download	raptor-a61fb179c2a8f185bb33cb37c6a83f6695efcda8.tar.gz