Commit 6f4cbe77 by Mustafa Tekpinar

Implemented ranksorting inside prescott.py.

parent c87411cf
>MLH1
msfvaGVIRRLDETVVNRIAAGEVIQRPANAIKEMIENCLDAKSTSIQVIVKEGGLKLIQIQDNGTGIRKEDLDIVCERF
TTSKLQSFEDLASISTYGFRGEALASISHVAHVTITTKTADGKCAYRASYSDGKLKAPPKPCAGNQGTQITVEDLFYNIA
TRRKALKNPSEEYGKILEVVGRYSVHNAGISFSVKKQGETVADVRTLPNASTVDNIRSIFGNAVSRELIEIGCEDKTLAF
KMNGYISNANYSVKKCIFLLFINHRLVESTSLRKAIETVYAAYLPKNTHPFLYLSLEISPQNVDVNVHPTKHEVHFLHEE
SILERVQQHIESKLLGSNSSRMYFTQTLLPGLAGPSGEMVKSttsLTSSSTSGSSDKVYAHQMVRTDSREQKLDAFLQPL
SKPLSSQPQAIVTEDKTDISSGRARQQDEEMLELPAPAEVAAKNQslegDTTKGTSEMSEKRGPTSSNPRKRHREDSDVE
MVEDDSRKEMTAACTPRRRIINLTSVLSLQEEINEQGHEVLREMLHNHSFVGCVNPQWALAQHQTKLYLLNTTKLSEELF
YQILIYDFANFGVLRLSEPAPLFDLAMLALDSPESGWTEEDGPKEGLAEYIVEFLKKKAEMLADYFSLEIDEEGNLIGLP
LLIDNYVPPLEGLPIFILRLATEVNWDEEKECFESLSKECAMFYSIRKQYISEESTlsgqqsEVPGSIPNSWKWTVEHIV
YKALRSHILPPKHFTEDGNILQLANLPDLYKVFERC
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/bin/bash #!/bin/bash
prescott -e ../data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv prescott -e ../data/MLH1_normPred_evolCombi.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv -s ../data/MLH1.fasta
import os import os
import sys import sys
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns # import seaborn as sns
import pandas as pd import pandas as pd
from sklearn import mixture from sklearn import mixture
from sklearn import metrics from sklearn import metrics
import numpy as np import numpy as np
import argparse import argparse
from Bio import SeqIO
one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \ one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', \ 'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', \
...@@ -24,6 +25,8 @@ def getGnomADOverallFrequency(infile, usePopMax="true"): ...@@ -24,6 +25,8 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
df = pd.read_csv(infile) df = pd.read_csv(infile)
# print(df.columns) # print(df.columns)
# #Select only columns containing missense variants!
# df = df.loc[df[]=='']
#print(df1.columns) #print(df1.columns)
dfMissense = df.loc[df['VEP Annotation']=='missense_variant', dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence', ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
...@@ -140,6 +143,146 @@ def getGnomADOverallFrequency(infile, usePopMax="true"): ...@@ -140,6 +143,146 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
dfMissense['mutant'] = mutantList dfMissense['mutant'] = mutantList
dfMissense['protein'] = proteinNameList dfMissense['protein'] = proteinNameList
return (dfMissense) return (dfMissense)
alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
def writeSinglelineFormat(scanningMatrix, outFile, residueList,
beg, end, \
aaOrder = alphabeticalAminoAcidsList, \
offSet=0):
"""
A function to convert GEMME matrix format data to experimental
deep mutational scanning format.
The format has only two columns. In the first column, one-letter wild
type amino acid, residue number and mutated amino acid are combined without any
gap or any other character, such as A19C. The second column is typically a float
value of the predicted effect.
Residue numbering starts with 1.
"A1A 0.211
"A1C 0.124
.
.
.
"T262C 0.434
Parameters
----------
scanningMatrix: numpy array of arrays
Data matrix to plot
outFile: string
Name of the output png image
residueList: Python list
A list of the residues in the protein such as ['X1', 'X2', ..., 'XN'].
X is one-letter amino acid code and N is the total number of residues.
Default is None.
If not provided, the program will write them starting from 1 till the end.
beg: int
The first residue in the fasta file may not necessarily be the first residue
in the protein. This beg parameter gives us a chance to adjust the first residue
other than one. It only effects the DMS format text output. Default value is 0.
end: int
The last residue to use. It is used to select a subrange
of amino acids.
aaOrder: Python list
A list of 20 amino acid characters such as ['A', 'C',...., 'Y'].
Default is alphabetical list.
offSet: int
It is used to match the residue IDs in your PDB
file with 0 based indices read from scanningMatrix matrix
Returns
-------
Nothing
"""
debug = 0
#We subtract 1 from beg bc matrix indices starts from 0
if(end == None):
end = len(scanningMatrix[0])
if(debug):
print("Beginning: "+str(beg))
print("End : "+str(end))
print(len(scanningMatrix[0]))
# I don't want to write the submatrices for now!
# Let's keep it simple for now!
# subMatrix = scanningMatrix[:, (beg-1):end]
realResidueList = []
if(residueList == None):
for i in range(0,len(scanningMatrix[0])):
tempString = 'X'+str(i+1+beg)
realResidueList.append(tempString)
else:
for i in range(0,len(scanningMatrix[0])):
tempString = residueList[i].upper()+str(i+1+beg)
realResidueList.append(tempString)
with open(outFile, 'w') as file:
for res in range (len(scanningMatrix[0])):
for i in range(20):
file.write(realResidueList[res]+aaOrder[i]+" ")
file.write(str(scanningMatrix[alphabeticalAminoAcidsList.index(aaOrder[i])][res])+"\n")
def parseGEMMEoutput(inputFile, verbose):
"""
Parse normalized (I don't know how?!) GEMME output files:
Independent, Epistatic and Combined
Return: Data File as a Numpy array, where each col contains
deleteriousness score for an amino acid.
verbose is a boolean value
"""
gemmeDataFile =open(inputFile, 'r')
allLines = gemmeDataFile.readlines()
gemmeDataFile.close()
headerLine = allLines[0]
if(verbose):
print(headerLine)
matrixData = []
aaColumn = []
for line in allLines[1:]:
tempList = []
data = line.split()
aaColumn.append(data[0])
for item in data[1:]:
if (item == 'NA'):
tempList.append(0.0000000)
else:
tempList.append(float(item))
matrixData.append(tempList)
mutationsData = np.array(matrixData)
# mutatedTransposed = mutationsData.T
# return mutatedTransposed
return mutationsData
from scipy.stats import rankdata
def rankSortData(dataArray):
"""
This function ranksorts protein data and converts it
to values between [0,1.0].
Parameters:
----------
dataArray: numpy array of arrays
data read by numpy.loadtxt
Returns:
-------
normalizedRankedDataArray: numpy array
"""
normalizedRankedDataArray = rankdata(dataArray)/float(len(dataArray))
return (normalizedRankedDataArray)
def main(): def main():
# Adding the main parser # Adding the main parser
...@@ -150,6 +293,10 @@ def main(): ...@@ -150,6 +293,10 @@ def main():
help='Main input for prescott is an ESCOTT single point mutation file.', \ help='Main input for prescott is an ESCOTT single point mutation file.', \
required=True, default=None) required=True, default=None)
main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
help='Sequence file of only the query protein sequence in fasta format.', \
required=False, default=None)
main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \ main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
help='The second input for prescott is a gnomad frequency file for the protein\n.'+ help='The second input for prescott is a gnomad frequency file for the protein\n.'+
'You can obtain it from https://gnomad.broadinstitute.org/ by searching your favorite protein (such as P53).\n'+ 'You can obtain it from https://gnomad.broadinstitute.org/ by searching your favorite protein (such as P53).\n'+
...@@ -168,7 +315,7 @@ def main(): ...@@ -168,7 +315,7 @@ def main():
help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!', help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!',
required=False, default=2) required=False, default=2)
main_parser.add_argument('-s', '--scalingcoefficient', dest='scalingcoefficient', type=float, \ main_parser.add_argument('-c', '--coefficient', dest='coefficient', type=float, \
help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \ help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \
required=False, default=1.0) required=False, default=1.0)
main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \ main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \
...@@ -203,7 +350,7 @@ def main(): ...@@ -203,7 +350,7 @@ def main():
print("@> GNOMAD frequency file : {}".format(args.gnomadfile)) print("@> GNOMAD frequency file : {}".format(args.gnomadfile))
print("@> Use population max. freq : {}".format(str(args.usepopmax).lower())) print("@> Use population max. freq : {}".format(str(args.usepopmax).lower()))
print("@> Which equation to use (Default=2): {}".format(str(args.equation))) print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
print("@> Scaling coefficient (Default=1.0): {}".format(args.scalingcoefficient)) print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff)) print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
print("@> Name of the output file : {}".format(args.outputfile)) print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing! # End of argument parsing!
...@@ -233,7 +380,32 @@ def main(): ...@@ -233,7 +380,32 @@ def main():
usePopMaxOrNot = args.usepopmax.lower() usePopMaxOrNot = args.usepopmax.lower()
version = args.equation version = args.equation
if (os.path.exists(escottDataPath)): if (os.path.exists(escottDataPath)):
dfESCOTT = pd.read_table(escottDataPath, sep='\s+', header=None) #Parse the file containing raw ESCOTT scores.
scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
#Convert the matrix format to singleline format
localResidueList = None
if(args.sequencefile != None):
referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
localResidueList = list(referenceSeq.seq)
aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
beg=0, end=None, aaOrder = aaOrderList, \
offSet=0)
#Mostyl, I am using normPred_Combi_singleline as input file and it doesn't have a header.
df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
#data = np.genfromtxt(args.input,dtype=None)
data = df.to_numpy()
rawData = data.T[1]
processedData = 1.0 - rankSortData(rawData)
with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
#f.write("#Resid Value\n")
for i in range (len(processedData)):
f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
dfESCOTT.columns = ['mutant', 'ESCOTT'] dfESCOTT.columns = ['mutant', 'ESCOTT']
dfESCOTT['protein']=protein dfESCOTT['protein']=protein
...@@ -293,7 +465,7 @@ def main(): ...@@ -293,7 +465,7 @@ def main():
myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0] myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
# print(myBigMergedDF) # print(myBigMergedDF)
scalingCoeff = args.scalingcoefficient scalingCoeff = args.coefficient
freqCutoff = args.frequencycutoff freqCutoff = args.frequencycutoff
for index, row in myBigMergedDF.iterrows(): for index, row in myBigMergedDF.iterrows():
if(row['frequency'] != 999.0): if(row['frequency'] != 999.0):
......
...@@ -5,5 +5,5 @@ scipy ...@@ -5,5 +5,5 @@ scipy
pandas pandas
biopython<=1.79 biopython<=1.79
biotite biotite
sklearn
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment