1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
# parital_gene_match.py
#
# Example showing how to create a customized pyparsing Token, in this case,
# one that is very much like Literal, but which tolerates up to 'n' character
# mismatches
from pyparsing import *
import urllib.request, urllib.parse, urllib.error
# read in a bunch of genomic data
datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta")
fastasrc = datafile.read()
datafile.close()
"""
Sample header:
>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
"""
integer = Word(nums).setParseAction(lambda t:int(t[0]))
genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
Word(printables)("id") + SkipTo("length=", include=True) +
integer("genelen") + LineEnd() +
Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
# read gene data from .fasta file - takes just a few seconds
genedata = OneOrMore(genebit).parseString(fastasrc)
class CloseMatch(Token):
"""A special subclass of Token that does *close* matches. For each
close match of the given string, a tuple is returned giving the
found close match, and a list of mismatch positions."""
def __init__(self, seq, maxMismatches=1):
super(CloseMatch,self).__init__()
self.name = seq
self.sequence = seq
self.maxMismatches = maxMismatches
self.errmsg = "Expected " + self.sequence
self.mayIndexError = False
self.mayReturnEmpty = False
def parseImpl( self, instring, loc, doActions=True ):
start = loc
instrlen = len(instring)
maxloc = start + len(self.sequence)
if maxloc <= instrlen:
seq = self.sequence
seqloc = 0
mismatches = []
throwException = False
done = False
while loc < maxloc and not done:
if instring[loc] != seq[seqloc]:
mismatches.append(seqloc)
if len(mismatches) > self.maxMismatches:
throwException = True
done = True
loc += 1
seqloc += 1
else:
throwException = True
if throwException:
exc = self.myException
exc.loc = loc
exc.pstr = instring
raise exc
return loc, (instring[start:loc],mismatches)
# using the genedata extracted above, look for close matches of a gene sequence
searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
for g in genedata:
print("%s (%d)" % (g.id, g.genelen))
print("-"*24)
for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
matched, mismatches = t[0]
print("MATCH:", searchseq.sequence)
print("FOUND:", matched)
if mismatches:
print(" ", ''.join(' ' if i not in mismatches else '*'
for i,c in enumerate(searchseq.sequence)))
else:
print("<exact match>")
print("at location", startLoc)
print()
print()
|