import sys
from pylab import *
genelist_unord = sys.argv[1].split(',')
genelist = unique(genelist_unord)
begs = [100000000 for g in genelist]
ends = [0 for g in genelist]
chrs = [0 for g in genelist]
chrs2 = [0 for g in genelist]
input_file = open('ref_GRCh37.p13_top_level.gff3','r')
line = input_file.readline()
genelines = []
linechrs = []
linechrs2 = []
chrnow = 1
chr2now = 1
while line[0] == '#':
line = input_file.readline()
while True:
line = input_file.readline()
if line.find('chromosome=') > -1:
if line[line.find('chromosome=')+11] in ['0','1','2','3','4','5','6','7','8','9','X','Y']:
chrnowstr = line[line.find('chromosome=')+11:line.find('chromosome=')+line[line.find('chromosome='):].find(';')]
chrnow = int(chrnowstr) if chrnowstr != 'X' and chrnowstr != 'Y' else (23 if chrnowstr == 'X' else 24)
#print('chrnow = '+str(chrnow)+', chrnowstr = '+chrnowstr)
if line.find('chromosome ') > -1:
if line[line.find('chromosome ')+11] in ['0','1','2','3','4','5','6','7','8','9','X','Y']:
chr2nowstr = line[line.find('chromosome ')+11:line.find('chromosome ')+11+min(2,line[line.find('chromosome ')+11:].find(' '))]
if chr2nowstr != '':
chr2now = int(chr2nowstr) if chr2nowstr != 'X' and chr2nowstr != 'Y' else (23 if chr2nowstr == 'X' else 24)
#print('chr2now = '+str(chr2now)+', chr2nowstr = '+chr2nowstr)
if len(line) == 0:
break
splitted = line.split('\t')
if len(splitted) < 9:
continue
entryType = splitted[2]
if entryType == 'gene':
genelines.append(line)
linechrs.append(chrnow)
linechrs2.append(chr2now)
dataField = splitted[8]
splitted2 = dataField.split(';')
geneEntry = ''
for isplitted in range(0,len(splitted2)):
if splitted2[isplitted][0:5] == 'gene=':
geneEntry = splitted2[isplitted][5:]
igeneFound = -1
if geneEntry in genelist:
for igene in range(0,len(genelist)):
if genelist[igene] == geneEntry:
igeneFound = igene
begs[igene] = int(splitted[3])
ends[igene] = int(splitted[4])
chrs[igene] = int(chrnow)
chrs2[igene] = int(chr2now)
break
input_file.close()
#print('chrnow = '+str(chrnow)+', chr2now = '+str(chr2now))
begs_syn = begs[:] #synonymous genes
ends_syn = ends[:] #synonymous genes
realnames = genelist[:]
chrs_syn = chrs[:]
chrs2_syn = chrs2[:]
for igene in range(0,len(genelist)):
if ends[igene] == 0:
for iline in range(0,len(genelines)):
line = genelines[iline]
splitted = line.split('\t')
dataField = splitted[8]
splitted2 = dataField.split(';')
geneEntry = ''
synonyms = ''
for isplitted in range(0,len(splitted2)):
if splitted2[isplitted][0:5] == 'gene=':
geneEntry = splitted2[isplitted][5:]
elif splitted2[isplitted][0:13] == 'gene_synonym=':
synonyms = splitted2[isplitted][13:]
splitted3 = synonyms.split(',')
if genelist[igene] in splitted3:
if ends_syn[igene] > 0:
print('#Overwriting '+realnames[igene]+' for '+genelist[igene])
realnames[igene] = geneEntry
begs_syn[igene] = int(splitted[3])
ends_syn[igene] = int(splitted[4])
chrs_syn[igene] = linechrs[iline]
chrs2_syn[igene] = linechrs2[iline]
#print('Only main gene names:')
#for i in range(0,len(genelist)):
# print("'"+genelist[i]+"',"+str(chrs[i])+","+str(chrs2[i])+","+str(begs[i])+","+str(ends[i]))
#
#print('Using synonyms:')
#for i in range(0,len(genelist)):
# print("'"+genelist[i]+"' ('"+realnames[i]+"'),"+str(chrs_syn[i])+","+str(chrs2_syn[i])+","+str(begs_syn[i])+","+str(ends_syn[i]))
print("DATA = [")
for i in range(0,len(genelist)):
print("['"+realnames[i]+"',"+str(chrs_syn[i])+","+str(begs_syn[i])+","+str(ends_syn[i])+"],")
print("]")
#print(str(begs))
#print(str(ends))