Commit 42dbf51a by Mustafa Tekpinar

Added escottformat and ranksorted args to prescott UI.

parent f98b9cb1
......@@ -328,7 +328,16 @@ def main():
main_parser.add_argument('--usefrequencies', dest='usefrequencies', type=str, \
help='Do not touch this if you don\'t know what you are doing! Default is true',
required=False, default='true')
main_parser.add_argument('--ranksorted', dest='ranksorted', type=str, \
help='If your data is already ranksorted, change this argument to true. Default is false',
required=False, default='false')
main_parser.add_argument('--escottformat', dest='escottformat', type=str, \
help='Main format of escott file. There are two possibilities: gemme or singleline. \n'+\
'gemme: a horizontal format of 20 rows and N columns.\n'+\
'singleline: each line contains a mutation and its value separated by a space.\n'+\
'M1A 0.378\n', required=False, default='gemme')
# main_parser.add_argument('--colormap', dest='colormap', type=str, \
# help='A colormap as defined in matplotlib',
# required=False, default='coolwarm_r')
......@@ -354,7 +363,7 @@ def main():
print("@> Use population max. freq : {}".format(str(args.usepopmax).lower()))
print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing!
......@@ -366,32 +375,45 @@ def main():
usePopMaxOrNot = args.usepopmax.lower()
version = args.equation
if (os.path.exists(escottDataPath)):
#Parse the file containing raw ESCOTT scores.
scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
#Convert the matrix format to singleline format
localResidueList = None
if(args.sequencefile != None):
referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
localResidueList = list(referenceSeq.seq)
aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
beg=0, end=None, aaOrder = aaOrderList, \
offSet=0)
#Mostly, I am using normPred_Combi_singleline as input file and it doesn't have a header.
df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
#data = np.genfromtxt(args.input,dtype=None)
data = df.to_numpy()
rawData = data.T[1]
processedData = 1.0 - rankSortData(rawData)
with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
#f.write("#Resid Value\n")
for i in range (len(processedData)):
f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
if(args.escottformat=='gemme'):
#Parse the file containing raw ESCOTT scores.
scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
#Convert the matrix format to singleline format
localResidueList = None
if(args.sequencefile != None):
referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
localResidueList = list(referenceSeq.seq)
aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
beg=0, end=None, aaOrder = aaOrderList, \
offSet=0)
#Mostly, I am using normPred_Combi_singleline as input file and it doesn't have a header.
df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
elif(args.escottformat=='singleline'):
df = pd.read_table(args.escottfile, sep="\s+", header=None)
else:
print('@> ERROR: Unknown escott format. It should be gemme or singleline!')
sys.exit(-1)
if(args.ranksorted == 'false'):
#data = np.genfromtxt(args.input,dtype=None)
data = df.to_numpy()
rawData = data.T[1]
processedData = 1.0 - rankSortData(rawData)
with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
#f.write("#Resid Value\n")
for i in range (len(processedData)):
f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
else:
dfESCOTT = df
dfESCOTT.columns = ['mutant', 'ESCOTT']
dfESCOTT['protein']=protein
......@@ -416,10 +438,8 @@ def main():
(row['ClinVar Clinical Significance']=='Pathogenic') or \
(row['ClinVar Clinical Significance']=='Likely pathogenic')):
gnomadDF.at[index,'labels'] = 1
print(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)])
# print(gnomadDF['ClinVar Clinical Significance'])
# Add frequency column and a dummy frequency to each row in myBigMergedDF
myBigMergedDF['frequency'] = 999.0
myBigMergedDF['labels'] = np.nan
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment