From 123e83037b5de90fb964e96267bd60e90c70db19 Mon Sep 17 00:00:00 2001 From: Paul McGuire Date: Mon, 5 Aug 2019 23:58:25 -0500 Subject: Fixed bug in CloseMatch where end location was incorrectly computed; and updated partial_gene_match.py example. --- CHANGES | 3 + examples/partial_gene_match.py | 139 +++++++++++++++-------------------------- pyparsing.py | 4 +- 3 files changed, 56 insertions(+), 90 deletions(-) diff --git a/CHANGES b/CHANGES index 612de6c..68503c7 100644 --- a/CHANGES +++ b/CHANGES @@ -38,6 +38,9 @@ Version 2.5.0a1 suppression. As part of resolution to a question posted by John Greene on StackOverflow. +- Fixed bug in CloseMatch where end location was incorrectly + computed; and updated partial_gene_match.py example. + - BigQueryViewParser.py added to examples directory, PR submitted by Michael Smedberg, nice work! diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py index e19cf8b..8ca9c44 100644 --- a/examples/partial_gene_match.py +++ b/examples/partial_gene_match.py @@ -1,88 +1,51 @@ -# parital_gene_match.py -# -# Example showing how to create a customized pyparsing Token, in this case, -# one that is very much like Literal, but which tolerates up to 'n' character -# mismatches -from pyparsing import * - -import urllib.request, urllib.parse, urllib.error - -# read in a bunch of genomic data -datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta") -fastasrc = datafile.read() -datafile.close() - -""" -Sample header: ->NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201 -""" -integer = Word(nums).setParseAction(lambda t:int(t[0])) -genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" + - Word(printables)("id") + SkipTo("length=", include=True) + - integer("genelen") + LineEnd() + - Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene")) - -# read gene data from .fasta file - takes just a few seconds -genedata = OneOrMore(genebit).parseString(fastasrc) - - -class CloseMatch(Token): - """A special subclass of Token that does *close* matches. For each - close match of the given string, a tuple is returned giving the - found close match, and a list of mismatch positions.""" - def __init__(self, seq, maxMismatches=1): - super(CloseMatch,self).__init__() - self.name = seq - self.sequence = seq - self.maxMismatches = maxMismatches - self.errmsg = "Expected " + self.sequence - self.mayIndexError = False - self.mayReturnEmpty = False - - def parseImpl( self, instring, loc, doActions=True ): - start = loc - instrlen = len(instring) - maxloc = start + len(self.sequence) - - if maxloc <= instrlen: - seq = self.sequence - seqloc = 0 - mismatches = [] - throwException = False - done = False - while loc < maxloc and not done: - if instring[loc] != seq[seqloc]: - mismatches.append(seqloc) - if len(mismatches) > self.maxMismatches: - throwException = True - done = True - loc += 1 - seqloc += 1 - else: - throwException = True - - if throwException: - exc = self.myException - exc.loc = loc - exc.pstr = instring - raise exc - - return loc, (instring[start:loc],mismatches) - -# using the genedata extracted above, look for close matches of a gene sequence -searchseq = CloseMatch("TTAAATCTAGAAGAT", 3) -for g in genedata: - print("%s (%d)" % (g.id, g.genelen)) - print("-"*24) - for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True): - matched, mismatches = t[0] - print("MATCH:", searchseq.sequence) - print("FOUND:", matched) - if mismatches: - print(" ", ''.join(' ' if i not in mismatches else '*' - for i,c in enumerate(searchseq.sequence))) - else: - print("") - print("at location", startLoc) - print() - print() +# parital_gene_match.py +# +# Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches +# +import pyparsing as pp + +import urllib.request, urllib.parse, urllib.error +from contextlib import closing + +# read in a bunch of genomic data +data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta" +with closing(urllib.request.urlopen(data_url)) as datafile: + fastasrc = datafile.read().decode() + +""" +Sample header: +>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201 +""" +integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0])) +genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id") + + "|" + pp.Word(pp.printables)("organism") + + "|" + pp.Word(pp.printables)("location") + + "|" + "length=" + integer("gene_len") + + pp.LineEnd() + + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene")) + +# read gene data from .fasta file - takes just a few seconds +genedata = genebit[1, ...].parseString(fastasrc) + +# using the genedata extracted above, look for close matches of a gene sequence +searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3) + +for g in genedata: + show_header = True + for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True): + if show_header: + print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len)) + print("-" * 24) + show_header = False + + matched = t[0] + mismatches = t['mismatches'] + print("MATCH:", searchseq.match_string) + print("FOUND:", matched) + if mismatches: + print(" ", ''.join('*' if i in mismatches else ' ' + for i, c in enumerate(searchseq.match_string))) + else: + print("") + print("at location", startLoc) + print() diff --git a/pyparsing.py b/pyparsing.py index 92e2929..23b36dc 100644 --- a/pyparsing.py +++ b/pyparsing.py @@ -96,7 +96,7 @@ classes inherit from. Use the docstrings for examples of how to: """ __version__ = "2.5.0a1" -__versionTime__ = "06 Aug 2019 01:12 UTC" +__versionTime__ = "06 Aug 2019 04:55 UTC" __author__ = "Paul McGuire " import string @@ -2843,7 +2843,7 @@ class CloseMatch(Token): if len(mismatches) > maxMismatches: break else: - loc = match_stringloc + 1 + loc = start + match_stringloc + 1 results = ParseResults([instring[start:loc]]) results['original'] = match_string results['mismatches'] = mismatches -- cgit v1.2.1