Convert Ensembl GTF to GFF3 for JBrowse
#!/usr/bin/python
"""
Convert Ensembl GTF to GFF3 for JBrowse
Copyright (C)
__Author__: liubing
"""
import sys
import re
# Prefix prepended to every sequence name (e.g. 'chr' turns '1' into 'chr1').
# Renamed from `chr` so the builtin chr() is no longer shadowed.
seq_prefix = ''
# Only these GTF feature types are carried over; everything else is dropped.
keep_features = ["CDS", "UTR", "exon", "gene", "transcript"]


def default_gff3_path(gtf_path):
    """Return the output path used when none is given on the command line.

    A trailing '.gtf' is replaced by '.gff3'; any other name simply gets
    '.gff3' appended.  (str.rstrip('gtf') strips a *set of characters*, not a
    suffix, and mangled names that did not end in '.gtf'.)
    """
    suffix = '.gtf'
    if gtf_path.endswith(suffix):
        return gtf_path[:-len(suffix)] + '.gff3'
    return gtf_path + '.gff3'


def gtf_to_gff3_lines(lines, prefix='', features=None):
    """Yield GFF3 lines converted from an iterable of GTF lines.

    Parameters:
        lines: iterable of GTF text lines (an open file works and is
            consumed lazily).
        prefix: string prepended to every sequence name.
        features: collection of GTF feature types to keep; defaults to the
            module-level ``keep_features``.

    Behaviour:
        * Comment lines (starting with '#') and blank lines are skipped.
        * 'gene' records become ``ID=<id>;Name=<name>`` where <id> is the
          value of the first attribute and <name> the value of the second.
        * 'transcript' records are renamed 'mRNA' and numbered per gene:
          ``ID=<first attribute value>.<n>;Parent=<gene id>;Name=<gene name>.<n>``.
          NOTE(review): presumably the first attribute is gene_id, as in
          Ensembl GTF files -- confirm against the input.
        * Every other kept feature gets ``Parent=<ID of the most recent gene
          or mRNA record>``.

    Raises:
        ValueError: if a record does not have exactly 9 tab-separated
            columns, or a transcript/child record appears before any gene.
    """
    if features is None:
        features = keep_features

    gene_id = None         # ID of the gene currently being processed
    gene_name = None       # its Name, used as the base for mRNA names
    parent_id = None       # ID that exon/CDS/UTR records attach to
    transcript_count = 0   # number of transcripts seen for the current gene

    for line_no, raw in enumerate(lines, 1):
        line = raw.strip()
        if not line or line.startswith('#'):
            continue

        fields = line.split('\t')
        if len(fields) != 9:
            raise ValueError(
                'line %d: expected 9 tab-separated fields, got %d: %r'
                % (line_no, len(fields), line))
        seq, source, feature, start, end, score, strand, frame, attributes = fields

        if feature not in features:
            continue
        seq = prefix + seq
        if feature == 'transcript':
            feature = 'mRNA'

        # 'key "value"; key "value";' -> ['key value', 'key value']
        attrs = attributes.replace('"', '').strip(';').split('; ')

        if feature == 'gene':
            gene_id = attrs[0].split()[1]
            gene_name = attrs[1].split()[1]
            parent_id = gene_id
            transcript_count = 0
            attr_column = 'ID=' + gene_id + ';Name=' + gene_name
        elif feature == 'mRNA':
            if gene_id is None:
                raise ValueError(
                    'line %d: transcript record before any gene record' % line_no)
            transcript_count += 1
            numbered = '.' + str(transcript_count)
            parent_id = attrs[0].split()[1] + numbered
            # Parent is always the gene and Name is always derived from the
            # gene's name, so a second transcript neither points at the
            # previous mRNA nor accumulates suffixes ('X.1.2').
            attr_column = ('ID=' + parent_id + ';Parent=' + gene_id +
                           ';Name=' + gene_name + numbered)
        else:
            if parent_id is None:
                raise ValueError(
                    'line %d: %s record before any gene record'
                    % (line_no, feature))
            attr_column = 'Parent=' + parent_id

        yield '\t'.join([seq, source, feature, start, end, score, strand,
                         frame, attr_column]) + '\n'


def main(argv=None):
    """Command-line entry point: ``script.py input.gtf [output.gff3]``.

    Prints a usage message and returns when no input file is given.  When the
    output path is omitted it is derived from the input path.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Run:\n\tpython " + argv[0] + " input.gtf output.gff3")
        return

    gtf_file = argv[1]
    gff_file = default_gff3_path(gtf_file) if len(argv) == 2 else argv[2]

    ### Parse gtf file
    # Context managers close both files even if a record is malformed, and
    # iterating the handle streams the input instead of loading it whole.
    with open(gtf_file, 'r') as gtf, open(gff_file, 'w') as gff:
        gff.writelines(gtf_to_gff3_lines(gtf, prefix=seq_prefix))


if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65