Commit 6f4cbe77 by Mustafa Tekpinar

Implemented ranksorting inside prescott.py.

parent c87411cf
>MLH1
msfvaGVIRRLDETVVNRIAAGEVIQRPANAIKEMIENCLDAKSTSIQVIVKEGGLKLIQIQDNGTGIRKEDLDIVCERF
TTSKLQSFEDLASISTYGFRGEALASISHVAHVTITTKTADGKCAYRASYSDGKLKAPPKPCAGNQGTQITVEDLFYNIA
TRRKALKNPSEEYGKILEVVGRYSVHNAGISFSVKKQGETVADVRTLPNASTVDNIRSIFGNAVSRELIEIGCEDKTLAF
KMNGYISNANYSVKKCIFLLFINHRLVESTSLRKAIETVYAAYLPKNTHPFLYLSLEISPQNVDVNVHPTKHEVHFLHEE
SILERVQQHIESKLLGSNSSRMYFTQTLLPGLAGPSGEMVKSttsLTSSSTSGSSDKVYAHQMVRTDSREQKLDAFLQPL
SKPLSSQPQAIVTEDKTDISSGRARQQDEEMLELPAPAEVAAKNQslegDTTKGTSEMSEKRGPTSSNPRKRHREDSDVE
MVEDDSRKEMTAACTPRRRIINLTSVLSLQEEINEQGHEVLREMLHNHSFVGCVNPQWALAQHQTKLYLLNTTKLSEELF
YQILIYDFANFGVLRLSEPAPLFDLAMLALDSPESGWTEEDGPKEGLAEYIVEFLKKKAEMLADYFSLEIDEEGNLIGLP
LLIDNYVPPLEGLPIFILRLATEVNWDEEKECFESLSKECAMFYSIRKQYISEESTlsgqqsEVPGSIPNSWKWTVEHIV
YKALRSHILPPKHFTEDGNILQLANLPDLYKVFERC
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/bin/bash
prescott -e ../data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv
prescott -e ../data/MLH1_normPred_evolCombi.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv -s ../data/MLH1.fasta
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
# import seaborn as sns
import pandas as pd
from sklearn import mixture
from sklearn import metrics
import numpy as np
import argparse
from Bio import SeqIO
one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', \
......@@ -24,6 +25,8 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
df = pd.read_csv(infile)
# print(df.columns)
# #Select only columns containing missense variants!
# df = df.loc[df[]=='']
#print(df1.columns)
dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
......@@ -140,6 +143,146 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
dfMissense['mutant'] = mutantList
dfMissense['protein'] = proteinNameList
return (dfMissense)
alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
def writeSinglelineFormat(scanningMatrix, outFile, residueList,
beg, end, \
aaOrder = alphabeticalAminoAcidsList, \
offSet=0):
"""
A function to convert GEMME matrix format data to experimental
deep mutational scanning format.
The format has only two columns. In the first column, one-letter wild
type amino acid, residue number and mutated amino acid are combined without any
gap or any other character, such as A19C. The second column is typically a float
value of the predicted effect.
Residue numbering starts with 1.
"A1A 0.211
"A1C 0.124
.
.
.
"T262C 0.434
Parameters
----------
scanningMatrix: numpy array of arrays
Data matrix to plot
outFile: string
Name of the output png image
residueList: Python list
A list of the residues in the protein such as ['X1', 'X2', ..., 'XN'].
X is one-letter amino acid code and N is the total number of residues.
Default is None.
If not provided, the program will write them starting from 1 till the end.
beg: int
The first residue in the fasta file may not necessarily be the first residue
in the protein. This beg parameter gives us a chance to adjust the first residue
other than one. It only effects the DMS format text output. Default value is 0.
end: int
The last residue to use. It is used to select a subrange
of amino acids.
aaOrder: Python list
A list of 20 amino acid characters such as ['A', 'C',...., 'Y'].
Default is alphabetical list.
offSet: int
It is used to match the residue IDs in your PDB
file with 0 based indices read from scanningMatrix matrix
Returns
-------
Nothing
"""
debug = 0
#We subtract 1 from beg bc matrix indices starts from 0
if(end == None):
end = len(scanningMatrix[0])
if(debug):
print("Beginning: "+str(beg))
print("End : "+str(end))
print(len(scanningMatrix[0]))
# I don't want to write the submatrices for now!
# Let's keep it simple for now!
# subMatrix = scanningMatrix[:, (beg-1):end]
realResidueList = []
if(residueList == None):
for i in range(0,len(scanningMatrix[0])):
tempString = 'X'+str(i+1+beg)
realResidueList.append(tempString)
else:
for i in range(0,len(scanningMatrix[0])):
tempString = residueList[i].upper()+str(i+1+beg)
realResidueList.append(tempString)
with open(outFile, 'w') as file:
for res in range (len(scanningMatrix[0])):
for i in range(20):
file.write(realResidueList[res]+aaOrder[i]+" ")
file.write(str(scanningMatrix[alphabeticalAminoAcidsList.index(aaOrder[i])][res])+"\n")
def parseGEMMEoutput(inputFile, verbose):
"""
Parse normalized (I don't know how?!) GEMME output files:
Independent, Epistatic and Combined
Return: Data File as a Numpy array, where each col contains
deleteriousness score for an amino acid.
verbose is a boolean value
"""
gemmeDataFile =open(inputFile, 'r')
allLines = gemmeDataFile.readlines()
gemmeDataFile.close()
headerLine = allLines[0]
if(verbose):
print(headerLine)
matrixData = []
aaColumn = []
for line in allLines[1:]:
tempList = []
data = line.split()
aaColumn.append(data[0])
for item in data[1:]:
if (item == 'NA'):
tempList.append(0.0000000)
else:
tempList.append(float(item))
matrixData.append(tempList)
mutationsData = np.array(matrixData)
# mutatedTransposed = mutationsData.T
# return mutatedTransposed
return mutationsData
from scipy.stats import rankdata
def rankSortData(dataArray):
"""
This function ranksorts protein data and converts it
to values between [0,1.0].
Parameters:
----------
dataArray: numpy array of arrays
data read by numpy.loadtxt
Returns:
-------
normalizedRankedDataArray: numpy array
"""
normalizedRankedDataArray = rankdata(dataArray)/float(len(dataArray))
return (normalizedRankedDataArray)
def main():
# Adding the main parser
......@@ -149,6 +292,10 @@ def main():
main_parser.add_argument('-e', '--escottfile', dest='escottfile', type=str, \
help='Main input for prescott is an ESCOTT single point mutation file.', \
required=True, default=None)
main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
help='Sequence file of only the query protein sequence in fasta format.', \
required=False, default=None)
main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
help='The second input for prescott is a gnomad frequency file for the protein\n.'+
......@@ -168,7 +315,7 @@ def main():
help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!',
required=False, default=2)
main_parser.add_argument('-s', '--scalingcoefficient', dest='scalingcoefficient', type=float, \
main_parser.add_argument('-c', '--coefficient', dest='coefficient', type=float, \
help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \
required=False, default=1.0)
main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \
......@@ -203,7 +350,7 @@ def main():
print("@> GNOMAD frequency file : {}".format(args.gnomadfile))
print("@> Use population max. freq : {}".format(str(args.usepopmax).lower()))
print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
print("@> Scaling coefficient (Default=1.0): {}".format(args.scalingcoefficient))
print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing!
......@@ -232,8 +379,33 @@ def main():
# Check if file exists
usePopMaxOrNot = args.usepopmax.lower()
version = args.equation
if (os.path.exists(escottDataPath)):
dfESCOTT = pd.read_table(escottDataPath, sep='\s+', header=None)
if (os.path.exists(escottDataPath)):
#Parse the file containing raw ESCOTT scores.
scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
#Convert the matrix format to singleline format
localResidueList = None
if(args.sequencefile != None):
referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
localResidueList = list(referenceSeq.seq)
aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
beg=0, end=None, aaOrder = aaOrderList, \
offSet=0)
#Mostyl, I am using normPred_Combi_singleline as input file and it doesn't have a header.
df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
#data = np.genfromtxt(args.input,dtype=None)
data = df.to_numpy()
rawData = data.T[1]
processedData = 1.0 - rankSortData(rawData)
with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
#f.write("#Resid Value\n")
for i in range (len(processedData)):
f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
dfESCOTT.columns = ['mutant', 'ESCOTT']
dfESCOTT['protein']=protein
......@@ -293,7 +465,7 @@ def main():
myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
# print(myBigMergedDF)
scalingCoeff = args.scalingcoefficient
scalingCoeff = args.coefficient
freqCutoff = args.frequencycutoff
for index, row in myBigMergedDF.iterrows():
if(row['frequency'] != 999.0):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment