patch 8.0.0190: finding duplicate tags uses a slow linear searchv8.0.0190

Problem: Detecting duplicate tags uses a slow linear search. Solution: Use a much faster hash table solution. (James McCoy, closes #1046) But don't add hi_keylen, it makes hash tables 50% bigger.
author: Bram Moolenaar <Bram@vim.org> 2017-01-15 16:52:51 +0100
committer: Bram Moolenaar <Bram@vim.org> 2017-01-15 16:52:51 +0100
commit: 810f9c361c83afb36b9f1cdadca2b93f1201d039 (patch)
tree: df2c3926f3e1d6fcbeebf9c9b3cfd79539d9fb20
parent: 296b1f28ca9cedeb55872f306808b2214b519ce7 (diff)
download: vim-git-8.0.0190.tar.gz
2 files changed, 160 insertions, 162 deletions
diff --git a/src/tag.c b/src/tag.c
index 646cbcd62..94b5e2b68 100644
--- a/src/tag.c
+++ b/src/tag.c
@@ -35,9 +35,9 @@ typedef struct tag_pointers
 } tagptrs_T;
 
 /*
- * The matching tags are first stored in ga_match[].  In which one depends on
- * the priority of the match.
- * At the end, the matches from ga_match[] are concatenated, to make a list
+ * The matching tags are first stored in one of the ht_match[] hash tables.  In
+ * which one depends on the priority of the match.
+ * At the end, all the matches from ht_match[] are concatenated, to make a list
  * sorted on priority.
  */
 #define MT_ST_CUR	0		/* static match in current file */
@@ -1341,12 +1341,9 @@ find_tags(
     int		is_etag;		/* current file is emaces style */
 #endif
 
-    struct match_found
-    {
-	int	len;		/* nr of chars of match[] to be compared */
-	char_u	match[1];	/* actually longer */
-    } *mfp, *mfp2;
-    garray_T	ga_match[MT_COUNT];
+    char_u	*mfp;
+    hashtab_T	ht_match[MT_COUNT];
+    hash_T	hash = 0;
     int		match_count = 0;		/* number of matches found */
     char_u	**matches;
     int		mtt;
@@ -1402,16 +1399,16 @@ find_tags(
     vimconv.vc_type = CONV_NONE;
 #endif
 
-/*
- * Allocate memory for the buffers that are used
- */
+    /*
+     * Allocate memory for the buffers that are used
+     */
     lbuf = alloc(lbuf_size);
     tag_fname = alloc(MAXPATHL + 1);
 #ifdef FEAT_EMACS_TAGS
     ebuf = alloc(LSIZE);
 #endif
     for (mtt = 0; mtt < MT_COUNT; ++mtt)
-	ga_init2(&ga_match[mtt], (int)sizeof(struct match_found *), 100);
+	hash_init(&ht_match[mtt]);
 
     /* check for out of memory situation */
     if (lbuf == NULL || tag_fname == NULL
@@ -2206,10 +2203,12 @@ parse_line:
 	    }
 
 	    /*
-	     * If a match is found, add it to ga_match[].
+	     * If a match is found, add it to ht_match[].
 	     */
 	    if (match)
 	    {
+		int len = 0;
+
 #ifdef FEAT_CSCOPE
 		if (use_cscope)
 		{
@@ -2262,179 +2261,169 @@ parse_line:
 		}
 
 		/*
-		 * Add the found match in ga_match[mtt], avoiding duplicates.
+		 * Add the found match in ht_match[mtt].
 		 * Store the info we need later, which depends on the kind of
 		 * tags we are dealing with.
 		 */
-		if (ga_grow(&ga_match[mtt], 1) == OK)
+		if (help_only)
 		{
-		    int len;
-		    int heuristic;
-
-		    if (help_only)
-		    {
 #ifdef FEAT_MULTI_LANG
 # define ML_EXTRA 3
 #else
 # define ML_EXTRA 0
 #endif
-			/*
-			 * Append the help-heuristic number after the
-			 * tagname, for sorting it later.
-			 */
-			*tagp.tagname_end = NUL;
-			len = (int)(tagp.tagname_end - tagp.tagname);
-			mfp = (struct match_found *)
-				 alloc((int)sizeof(struct match_found) + len
-							     + 10 + ML_EXTRA);
-			if (mfp != NULL)
-			{
-			    /* "len" includes the language and the NUL, but
-			     * not the priority. */
-			    mfp->len = len + ML_EXTRA + 1;
-#define ML_HELP_LEN 6
-			    p = mfp->match;
-			    STRCPY(p, tagp.tagname);
+		    /*
+		     * Append the help-heuristic number after the tagname, for
+		     * sorting it later.  The heuristic is ignored for
+		     * detecting duplicates.
+		     * The format is {tagname}@{lang}NUL{heuristic}NUL
+		     */
+		    *tagp.tagname_end = NUL;
+		    len = (int)(tagp.tagname_end - tagp.tagname);
+		    mfp = (char_u *)alloc((int)sizeof(char_u) + len + 10 + ML_EXTRA + 1);
+		    if (mfp != NULL)
+		    {
+			int heuristic;
+
+			p = mfp;
+			STRCPY(p, tagp.tagname);
 #ifdef FEAT_MULTI_LANG
-			    p[len] = '@';
-			    STRCPY(p + len + 1, help_lang);
+			p[len] = '@';
+			STRCPY(p + len + 1, help_lang);
 #endif
 
-			    heuristic = help_heuristic(tagp.tagname,
-					match_re ? matchoff : 0, !match_no_ic);
+			heuristic = help_heuristic(tagp.tagname,
+				    match_re ? matchoff : 0, !match_no_ic);
 #ifdef FEAT_MULTI_LANG
-			    heuristic += help_pri;
+			heuristic += help_pri;
 #endif
-			    sprintf((char *)p + len + 1 + ML_EXTRA, "%06d",
-								   heuristic);
-			}
-			*tagp.tagname_end = TAB;
+			sprintf((char *)p + len + 1 + ML_EXTRA, "%06d",
+							       heuristic);
 		    }
-		    else if (name_only)
+		    *tagp.tagname_end = TAB;
+		}
+		else if (name_only)
+		{
+		    if (get_it_again)
 		    {
-			if (get_it_again)
-			{
-			    char_u *temp_end = tagp.command;
+			char_u *temp_end = tagp.command;
 
-			    if (*temp_end == '/')
-				while (*temp_end && *temp_end != '\r'
-					&& *temp_end != '\n'
-					&& *temp_end != '$')
-				    temp_end++;
+			if (*temp_end == '/')
+			    while (*temp_end && *temp_end != '\r'
+				    && *temp_end != '\n'
+				    && *temp_end != '$')
+				temp_end++;
 
-			    if (tagp.command + 2 < temp_end)
-			    {
-				len = (int)(temp_end - tagp.command - 2);
-				mfp = (struct match_found *)alloc(
-					(int)sizeof(struct match_found) + len);
-				if (mfp != NULL)
-				{
-				    mfp->len = len + 1; /* include the NUL */
-				    p = mfp->match;
-				    vim_strncpy(p, tagp.command + 2, len);
-				}
-			    }
-			    else
-				mfp = NULL;
-			    get_it_again = FALSE;
-			}
-			else
+			if (tagp.command + 2 < temp_end)
 			{
-			    len = (int)(tagp.tagname_end - tagp.tagname);
-			    mfp = (struct match_found *)alloc(
-				       (int)sizeof(struct match_found) + len);
+			    len = (int)(temp_end - tagp.command - 2);
+			    mfp = (char_u *)alloc((int)sizeof(char_u) + len + 1);
 			    if (mfp != NULL)
-			    {
-				mfp->len = len + 1; /* include the NUL */
-				p = mfp->match;
-				vim_strncpy(p, tagp.tagname, len);
-			    }
-
-			    /* if wanted, re-read line to get long form too */
-			    if (State & INSERT)
-				get_it_again = p_sft;
+				vim_strncpy(mfp, tagp.command + 2, len);
 			}
+			else
+			    mfp = NULL;
+			get_it_again = FALSE;
 		    }
 		    else
 		    {
-			/* Save the tag in a buffer.
-			 * Emacs tag: <mtt><tag_fname><NUL><ebuf><NUL><lbuf>
-			 * other tag: <mtt><tag_fname><NUL><NUL><lbuf>
-			 * without Emacs tags: <mtt><tag_fname><NUL><lbuf>
-			 */
-			len = (int)STRLEN(tag_fname)
-						 + (int)STRLEN(lbuf) + 3;
+			len = (int)(tagp.tagname_end - tagp.tagname);
+			mfp = (char_u *)alloc((int)sizeof(char_u) + len + 1);
+			if (mfp != NULL)
+			    vim_strncpy(mfp, tagp.tagname, len);
+
+			/* if wanted, re-read line to get long form too */
+			if (State & INSERT)
+			    get_it_again = p_sft;
+		    }
+		}
+		else
+		{
+#define TAG_SEP 0x01
+		    size_t tag_fname_len = STRLEN(tag_fname);
 #ifdef FEAT_EMACS_TAGS
-			if (is_etag)
-			    len += (int)STRLEN(ebuf) + 1;
-			else
-			    ++len;
+		    size_t ebuf_len = 0;
 #endif
-			mfp = (struct match_found *)alloc(
-				       (int)sizeof(struct match_found) + len);
-			if (mfp != NULL)
-			{
-			    mfp->len = len;
-			    p = mfp->match;
-			    p[0] = mtt;
-			    STRCPY(p + 1, tag_fname);
+
+		    /* Save the tag in a buffer.
+		     * Use 0x01 to separate fields (Can't use NUL, because the
+		     * hash key is terminated by NUL).
+		     * Emacs tag: <mtt><tag_fname><0x01><ebuf><0x01><lbuf><NUL>
+		     * other tag: <mtt><tag_fname><0x01><0x01><lbuf><NUL>
+		     * without Emacs tags: <mtt><tag_fname><0x01><lbuf><NUL>
+		     */
+		    len = (int)tag_fname_len + (int)STRLEN(lbuf) + 3;
+#ifdef FEAT_EMACS_TAGS
+		    if (is_etag)
+		    {
+			ebuf_len = STRLEN(ebuf);
+			len += (int)ebuf_len + 1;
+		    }
+		    else
+			++len;
+#endif
+		    mfp = (char_u *)alloc((int)sizeof(char_u) + len + 1);
+		    if (mfp != NULL)
+		    {
+			p = mfp;
+			p[0] = mtt;
+			STRCPY(p + 1, tag_fname);
 #ifdef BACKSLASH_IN_FILENAME
-			    /* Ignore differences in slashes, avoid adding
-			     * both path/file and path\file. */
-			    slash_adjust(p + 1);
+			/* Ignore differences in slashes, avoid adding
+			 * both path/file and path\file. */
+			slash_adjust(p + 1);
 #endif
-			    s = p + 1 + STRLEN(tag_fname) + 1;
+			p[tag_fname_len + 1] = TAG_SEP;
+			s = p + 1 + tag_fname_len + 1;
 #ifdef FEAT_EMACS_TAGS
-			    if (is_etag)
-			    {
-				STRCPY(s, ebuf);
-				s += STRLEN(ebuf) + 1;
-			    }
-			    else
-				*s++ = NUL;
-#endif
-			    STRCPY(s, lbuf);
+			if (is_etag)
+			{
+			    STRCPY(s, ebuf);
+			    s[ebuf_len] = TAG_SEP;
+			    s += ebuf_len + 1;
 			}
+			else
+			    *s++ = TAG_SEP;
+#endif
+			STRCPY(s, lbuf);
 		    }
+		}
 
-		    if (mfp != NULL)
-		    {
-			/*
-			 * Don't add identical matches.
-			 * This can take a lot of time when finding many
-			 * matches, check for CTRL-C now and then.
-			 * Add all cscope tags, because they are all listed.
-			 */
+		if (mfp != NULL)
+		{
+		    hashitem_T	*hi;
+
+		    /*
+		     * Don't add identical matches.
+		     * Add all cscope tags, because they are all listed.
+		     * "mfp" is used as a hash key, there is a NUL byte to end
+		     * the part matters for comparing, more bytes may follow
+		     * after it.  E.g. help tags store the priority after the
+		     * NUL.
+		     */
 #ifdef FEAT_CSCOPE
-			if (use_cscope)
-			    i = -1;
-			else
+		    if (use_cscope)
+			hash++;
+		    else
 #endif
-			  for (i = ga_match[mtt].ga_len; --i >= 0 && !got_int; )
-			  {
-			      mfp2 = ((struct match_found **)
-						  (ga_match[mtt].ga_data))[i];
-			      if (mfp2->len == mfp->len
-				      && memcmp(mfp2->match, mfp->match,
-						       (size_t)mfp->len) == 0)
-				  break;
-			      fast_breakcheck();
-			  }
-			if (i < 0)
+			hash = hash_hash(mfp);
+		    hi = hash_lookup(&ht_match[mtt], mfp, hash);
+		    if (HASHITEM_EMPTY(hi))
+		    {
+			if (hash_add_item(&ht_match[mtt], hi, mfp, hash)
+								       == FAIL)
 			{
-			    ((struct match_found **)(ga_match[mtt].ga_data))
-					       [ga_match[mtt].ga_len++] = mfp;
-			    ++match_count;
+			    /* Out of memory! Just forget about the rest. */
+			    retval = OK;
+			    stop_searching = TRUE;
+			    break;
 			}
 			else
-			    vim_free(mfp);
+			    ++match_count;
 		    }
-		}
-		else    /* Out of memory! Just forget about the rest. */
-		{
-		    retval = OK;
-		    stop_searching = TRUE;
-		    break;
+		    else
+			/* duplicate tag, drop it */
+			vim_free(mfp);
 		}
 	    }
 #ifdef FEAT_CSCOPE
@@ -2532,7 +2521,7 @@ findtag_end:
 #endif
 
     /*
-     * Move the matches from the ga_match[] arrays into one list of
+     * Move the matches from the ht_match[] arrays into one list of
      * matches.  When retval == FAIL, free the matches.
      */
     if (retval == FAIL)
@@ -2546,22 +2535,29 @@ findtag_end:
     match_count = 0;
     for (mtt = 0; mtt < MT_COUNT; ++mtt)
     {
-	for (i = 0; i < ga_match[mtt].ga_len; ++i)
+	hashitem_T	*hi;
+	long_u		todo;
+
+	todo = (long)ht_match[mtt].ht_used;
+	for (hi = ht_match[mtt].ht_array; todo > 0; ++hi)
 	{
-	    mfp = ((struct match_found **)(ga_match[mtt].ga_data))[i];
-	    if (matches == NULL)
-		vim_free(mfp);
-	    else
+	    if (!HASHITEM_EMPTY(hi))
 	    {
-		/* To avoid allocating memory again we turn the struct
-		 * match_found into a string.  For help the priority was not
-		 * included in the length. */
-		mch_memmove(mfp, mfp->match,
-			 (size_t)(mfp->len + (help_only ? ML_HELP_LEN : 0)));
-		matches[match_count++] = (char_u *)mfp;
+		mfp = hi->hi_key;
+		if (matches == NULL)
+		    vim_free(mfp);
+		else
+		{
+		    /* now change the TAG_SEP back to NUL */
+		    for (p = mfp; *p != NUL; ++p)
+			if (*p == TAG_SEP)
+			    *p = NUL;
+		    matches[match_count++] = (char_u *)mfp;
+		}
+		todo--;
 	    }
 	}
-	ga_clear(&ga_match[mtt]);
+	hash_clear(&ht_match[mtt]);
     }
 
     *matchesp = matches;
diff --git a/src/version.c b/src/version.c
index 368a984e3..2094db704 100644
--- a/src/version.c
+++ b/src/version.c
@@ -765,6 +765,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    190,
+/**/
     189,
 /**/
     188,
author	Bram Moolenaar <Bram@vim.org>	2017-01-15 16:52:51 +0100
committer	Bram Moolenaar <Bram@vim.org>	2017-01-15 16:52:51 +0100
commit	810f9c361c83afb36b9f1cdadca2b93f1201d039 (patch)
tree	df2c3926f3e1d6fcbeebf9c9b3cfd79539d9fb20
parent	296b1f28ca9cedeb55872f306808b2214b519ce7 (diff)
download	vim-git-8.0.0190.tar.gz