Implemented ranksorting inside prescott.py.

6f4cbe77 · Mustafa Tekpinar · c87411cf · 6f4cbe77 · 6f4cbe77 · c87411cf
Commit 6f4cbe77 authored Sep 29, 2023 by Mustafa Tekpinar
6 changed files
--- a/data/MLH1.fasta
+++ b/data/MLH1.fasta
+>MLH1
+msfvaGVIRRLDETVVNRIAAGEVIQRPANAIKEMIENCLDAKSTSIQVIVKEGGLKLIQIQDNGTGIRKEDLDIVCERF
+TTSKLQSFEDLASISTYGFRGEALASISHVAHVTITTKTADGKCAYRASYSDGKLKAPPKPCAGNQGTQITVEDLFYNIA
+TRRKALKNPSEEYGKILEVVGRYSVHNAGISFSVKKQGETVADVRTLPNASTVDNIRSIFGNAVSRELIEIGCEDKTLAF
+KMNGYISNANYSVKKCIFLLFINHRLVESTSLRKAIETVYAAYLPKNTHPFLYLSLEISPQNVDVNVHPTKHEVHFLHEE
+SILERVQQHIESKLLGSNSSRMYFTQTLLPGLAGPSGEMVKSttsLTSSSTSGSSDKVYAHQMVRTDSREQKLDAFLQPL
+SKPLSSQPQAIVTEDKTDISSGRARQQDEEMLELPAPAEVAAKNQslegDTTKGTSEMSEKRGPTSSNPRKRHREDSDVE
+MVEDDSRKEMTAACTPRRRIINLTSVLSLQEEINEQGHEVLREMLHNHSFVGCVNPQWALAQHQTKLYLLNTTKLSEELF
+YQILIYDFANFGVLRLSEPAPLFDLAMLALDSPESGWTEEDGPKEGLAEYIVEFLKKKAEMLADYFSLEIDEEGNLIGLP
+LLIDNYVPPLEGLPIFILRLATEVNWDEEKECFESLSKECAMFYSIRKQYISEESTlsgqqsEVPGSIPNSWKWTVEHIV
+YKALRSHILPPKHFTEDGNILQLANLPDLYKVFERC
--- a/data/MLH1_normPred_evolCombi.txt
+++ b/data/MLH1_normPred_evolCombi.txt
--- a/data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt
+++ b/data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt
--- a/examples/example-prescott-script.sh
+++ b/examples/example-prescott-script.sh
 #!/bin/bash
-prescott -e ../data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv
+prescott -e ../data/MLH1_normPred_evolCombi.txt -g ../data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv -s ../data/MLH1.fasta
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
 import os
 import sys
 import matplotlib.pyplot as plt
-import seaborn as sns
+# import seaborn as sns
 import pandas as pd
 from sklearn import mixture
 from sklearn import metrics
 import numpy as np
 import argparse
+from Bio import SeqIO
 one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
            'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',    \
@@ -24,6 +25,8 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
    df = pd.read_csv(infile)
    # print(df.columns)
+    # #Select only columns containing missense variants!
+    # df = df.loc[df[]=='']
    #print(df1.columns)
    dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
                    ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
@@ -140,6 +143,146 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList
    return (dfMissense)
+alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
+                              'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+def writeSinglelineFormat(scanningMatrix, outFile, residueList, 
+                    beg, end, \
+                    aaOrder = alphabeticalAminoAcidsList, \
+                    offSet=0):
+    """
+        A function to convert GEMME matrix format data to experimental 
+        deep mutational scanning format. 
+    The format has only two columns. In the first column, one-letter wild
+    type amino acid, residue number and mutated amino acid are combined without any
+    gap or any other character, such as A19C. The second column is typically a float
+    value of the predicted effect.
+    Residue numbering starts with 1.
+    "A1A 0.211
+    "A1C 0.124
+    .
+    .
+    .
+    "T262C 0.434
+    Parameters
+    ----------
+    scanningMatrix: numpy array of arrays
+        Data matrix to plot
+    outFile: string
+        Name of the output png image
+    residueList: Python list
+        A list of the residues in the protein such as ['X1', 'X2', ..., 'XN']. 
+        X is one-letter amino acid code and N is the total number of residues. 
+        Default is None.
+        If not provided, the program will write them starting from 1 till the end. 
+    beg: int
+        The first residue in the fasta file may not necessarily be the first residue
+        in the protein. This beg parameter gives us a chance to adjust the first residue
+        other than one. It only effects the DMS format text output. Default value is 0.
+    end: int
+        The last residue to use. It is used to select a subrange 
+        of amino acids.
+    aaOrder: Python list
+        A list of 20 amino acid characters such as ['A', 'C',...., 'Y'].  
+        Default is alphabetical list.
+    offSet: int
+        It is used to match the residue IDs in your PDB
+        file with 0 based indices read from scanningMatrix matrix
+    Returns
+    -------
+    Nothing
+    """
+    debug = 0
+    #We subtract 1 from beg bc matrix indices starts from 0
+    if(end == None):
+        end = len(scanningMatrix[0])
+    if(debug):
+        print("Beginning: "+str(beg))
+        print("End      : "+str(end))
+        print(len(scanningMatrix[0]))
+    # I don't want to write the submatrices for now!
+    # Let's keep it simple for now! 
+    # subMatrix = scanningMatrix[:, (beg-1):end]
+    realResidueList = []
+    if(residueList == None):
+        for i in range(0,len(scanningMatrix[0])):
+            tempString = 'X'+str(i+1+beg)
+            realResidueList.append(tempString)
+    else:
+        for i in range(0,len(scanningMatrix[0])):
+            tempString = residueList[i].upper()+str(i+1+beg)
+            realResidueList.append(tempString)
+    with open(outFile, 'w') as file:
+        for res in range (len(scanningMatrix[0])):
+            for i in range(20):
+                file.write(realResidueList[res]+aaOrder[i]+" ")
+                file.write(str(scanningMatrix[alphabeticalAminoAcidsList.index(aaOrder[i])][res])+"\n")
+def parseGEMMEoutput(inputFile, verbose):
+    """
+        Parse normalized (I don't know how?!) GEMME output files: 
+        Independent, Epistatic and Combined
+        Return: Data File as a Numpy array, where each col contains 
+        deleteriousness score for an amino acid.
+        verbose is a boolean value
+    """
+    gemmeDataFile =open(inputFile, 'r')
+    allLines = gemmeDataFile.readlines()
+    gemmeDataFile.close()
+    headerLine = allLines[0]
+    if(verbose):
+        print(headerLine)
+    matrixData = []
+    aaColumn = []
+    for line in allLines[1:]:
+        tempList = []
+        data = line.split()
+        aaColumn.append(data[0])
+        for item in data[1:]:
+            if (item == 'NA'):
+                tempList.append(0.0000000)
+            else:
+                tempList.append(float(item))
+        matrixData.append(tempList)
+    mutationsData = np.array(matrixData)
+    # mutatedTransposed = mutationsData.T
+    # return mutatedTransposed
+    return mutationsData
+from scipy.stats import rankdata
+def rankSortData(dataArray):
+    """
+        This function ranksorts protein data and converts it
+        to values between [0,1.0].
+    Parameters:
+    ----------
+    dataArray: numpy array of arrays
+               data read by numpy.loadtxt
+    Returns:
+    -------
+    normalizedRankedDataArray: numpy array
+    """
+    normalizedRankedDataArray = rankdata(dataArray)/float(len(dataArray))
+    return (normalizedRankedDataArray)
 def main():
    # Adding the main parser
@@ -150,6 +293,10 @@ def main():
        help='Main input for prescott is an ESCOTT single point mutation file.', \
        required=True, default=None)
+    main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
+        help='Sequence file of only the query protein sequence in fasta format.', \
+        required=False, default=None)
    main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
        help='The second input for prescott is a gnomad frequency file for the protein\n.'+
             'You can obtain it from https://gnomad.broadinstitute.org/ by searching your favorite protein (such as P53).\n'+
@@ -168,7 +315,7 @@ def main():
        help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!',
        required=False, default=2)
-    main_parser.add_argument('-s', '--scalingcoefficient', dest='scalingcoefficient', type=float, \
+    main_parser.add_argument('-c', '--coefficient', dest='coefficient', type=float, \
        help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \
        required=False, default=1.0)
    main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \
@@ -203,7 +350,7 @@ def main():
    print("@> GNOMAD frequency file            : {}".format(args.gnomadfile))
    print("@> Use population max. freq         : {}".format(str(args.usepopmax).lower()))
    print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
-    print("@> Scaling coefficient (Default=1.0): {}".format(args.scalingcoefficient))
+    print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
    print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
    print("@> Name of the output file          : {}".format(args.outputfile))
    # End of argument parsing!
@@ -233,7 +380,32 @@ def main():
    usePopMaxOrNot = args.usepopmax.lower()
    version = args.equation
    if (os.path.exists(escottDataPath)):
-        dfESCOTT = pd.read_table(escottDataPath, sep='\s+', header=None)
+        #Parse the file containing raw ESCOTT scores. 
+        scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
+        #Convert the matrix format to singleline format
+        localResidueList = None
+        if(args.sequencefile != None):
+            referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
+            localResidueList = list(referenceSeq.seq)
+        aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
+        writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
+                        beg=0, end=None, aaOrder = aaOrderList, \
+                        offSet=0)
+        #Mostyl, I am using normPred_Combi_singleline as input file and it doesn't have a header.
+        df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
+        #data = np.genfromtxt(args.input,dtype=None)
+        data = df.to_numpy()
+        rawData = data.T[1]
+        processedData = 1.0 - rankSortData(rawData)
+        with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
+            #f.write("#Resid Value\n")
+            for i in range (len(processedData)):
+                f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
+        dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
        dfESCOTT.columns = ['mutant', 'ESCOTT']
        dfESCOTT['protein']=protein
@@ -293,7 +465,7 @@ def main():
                myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
        # print(myBigMergedDF)
-        scalingCoeff = args.scalingcoefficient
+        scalingCoeff = args.coefficient
        freqCutoff = args.frequencycutoff
        for index, row in myBigMergedDF.iterrows():
            if(row['frequency'] != 999.0):

--- a/requirements.txt
+++ b/requirements.txt
@@ -5,5 +5,5 @@ scipy
 pandas
 biopython<=1.79
 biotite
+sklearn