Removed dynamics related functions: calcPercentDFI, calcBfactors and getBfactors.

b20bee53 · Mustafa Tekpinar · 5ffaa0e3 · b20bee53 · b20bee53
Commit b20bee53 authored Mar 15, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 128 additions and 125 deletions

esgemme.py esgemme.py +128 -125

example-esgemme-script.sh example/example-esgemme-script.sh +0 -0

No files found.
--- a/esgemme.py
+++ b/esgemme.py
@@ -21,6 +21,10 @@ from prody import *
 from scipy.stats import rankdata
 from Bio import AlignIO

+import biotite.structure.io as strucio
+import biotite.application.dssp as dssp
+import biotite.structure.io.pdb as pdb
+
 #from gemmeAnal import *

 import pandas as pd
@@ -264,9 +268,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
 # Run Rscript to compute predictions
 def launchPred(prot,inAli,mutFile, normWeightMode, alphabet):
    if (mutFile!=''):
-	    rcmd="Rscript --save $ESGEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet
-    
-    #TODO: remember that there is a problem here!!!! 
+        rcmd="Rscript --save $ESGEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet
    else:
        rcmd="Rscript --save $ESGEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none "+normWeightMode+" "+alphabet

@@ -331,114 +333,114 @@ def rankSortProteinData(dataArray, inverted=True):
    else:
        return (normalizedRankedDataArray)

-def calcPercentDFI(pdbfile, outfile, nmodes=None, attenuate="true", ranksorted="true"):
-    """
-        Calculate rank normalized dynamic flexibility index. 
-    """
+# def calcPercentDFI(pdbfile, outfile, nmodes=None, attenuate="true", ranksorted="true"):
+#     """
+#         Calculate rank normalized dynamic flexibility index. 
+#     """
    
-    #confProDy(auto_show=False)
+#     #confProDy(auto_show=False)

-    structure = parsePDB(pdbfile)
-    calphas = structure.select('name CA')
+#     structure = parsePDB(pdbfile)
+#     calphas = structure.select('name CA')

-    anm = ANM('ANM')
-    anm.buildHessian(calphas)
-    if(nmodes==None):
-        anm.calcModes(n_modes='all')
-    else:
-        anm.calcModes(n_modes=int(nmodes))
+#     anm = ANM('ANM')
+#     anm.buildHessian(calphas)
+#     if(nmodes==None):
+#         anm.calcModes(n_modes='all')
+#     else:
+#         anm.calcModes(n_modes=int(nmodes))

-    dfi = calcDynamicFlexibilityIndex(anm, calphas, select='ca', norm=True)
-    percentDfi = rankSortProteinData(dfi)
+#     dfi = calcDynamicFlexibilityIndex(anm, calphas, select='ca', norm=True)
+#     percentDfi = rankSortProteinData(dfi)

-    if(attenuate.lower()=='true'):
-        percentDfi = attenuateEndPoints(percentDfi)
+#     if(attenuate.lower()=='true'):
+#         percentDfi = attenuateEndPoints(percentDfi)

-    # if(outfile != None):
-    #     np.savetxt(outfile, percentDfi)
-    # else:
-    #     np.savetxt(outfile, dfi)
+#     # if(outfile != None):
+#     #     np.savetxt(outfile, percentDfi)
+#     # else:
+#     #     np.savetxt(outfile, dfi)

-    if(ranksorted.lower()=='true'):
-        return percentDfi
-    else:
-        return dfi
+#     if(ranksorted.lower()=='true'):
+#         return percentDfi
+#     else:
+#         return dfi

-def calcBfactors(pdbfile, outfile, nmodes=None, attenuate="true", 
-                ranksorted="true", inverted=False):
-    """
-        Calculate rank normalized mean squared fluctuations,
-        which correlate well with Bfactors. 
-    """
+# def calcBfactors(pdbfile, outfile, nmodes=None, attenuate="true", 
+#                 ranksorted="true", inverted=False):
+#     """
+#         Calculate rank normalized mean squared fluctuations,
+#         which correlate well with Bfactors. 
+#     """
    
-    #confProDy(auto_show=False)
+#     #confProDy(auto_show=False)

-    structure = parsePDB(pdbfile)
-    calphas = structure.select('name CA')
+#     structure = parsePDB(pdbfile)
+#     calphas = structure.select('name CA')

-    anm = ANM('ANM')
-    anm.buildHessian(calphas)
-    if(nmodes==None):
-        anm.calcModes(n_modes='all')
-    else:
-        anm.calcModes(n_modes=int(nmodes))
+#     anm = ANM('ANM')
+#     anm.buildHessian(calphas)
+#     if(nmodes==None):
+#         anm.calcModes(n_modes='all')
+#     else:
+#         anm.calcModes(n_modes=int(nmodes))

-    bfactors = calcSqFlucts(anm)
+#     bfactors = calcSqFlucts(anm)

-    percentBfactors = rankSortProteinData(bfactors, inverted=False)
+#     percentBfactors = rankSortProteinData(bfactors, inverted=False)

-    if(attenuate.lower()=='true'):
-        percentBfactors = attenuateEndPoints(percentBfactors)
+#     if(attenuate.lower()=='true'):
+#         percentBfactors = attenuateEndPoints(percentBfactors)

-    # if(outfile != None):
-    #     np.savetxt(outfile, percentBfactors)
-    # else:
-    #     np.savetxt(outfile, dfi)
+#     # if(outfile != None):
+#     #     np.savetxt(outfile, percentBfactors)
+#     # else:
+#     #     np.savetxt(outfile, dfi)

-    if(ranksorted.lower()=='true'):
-        if(inverted==True):
-            return (1.0 - percentBfactors)
-        else:
-            return percentBfactors
-    else:
-        if(inverted==True):
-            return (np.max(bfactors) - bfactors)
-        else:
-            return bfactors
+#     if(ranksorted.lower()=='true'):
+#         if(inverted==True):
+#             return (1.0 - percentBfactors)
+#         else:
+#             return percentBfactors
+#     else:
+#         if(inverted==True):
+#             return (np.max(bfactors) - bfactors)
+#         else:
+#             return bfactors

-def getBfactors(pdbfile, outfile, attenuate="true", \
-                ranksorted="true",  inverted=True):
-    """
-        Calculate rank normalized dynamic flexibility index. 
-    """
+# def getBfactors(pdbfile, outfile, attenuate="true", \
+#                 ranksorted="true",  inverted=True):
+#     """
+#         Calculate rank normalized dynamic flexibility index. 
+#     """
    
-    #confProDy(auto_show=False)
+#     #confProDy(auto_show=False)

-    structure = parsePDB(pdbfile)
-    calphas = structure.select('name CA')
+#     structure = parsePDB(pdbfile)
+#     calphas = structure.select('name CA')

-    bfactors = calphas.getBetas()
+#     bfactors = calphas.getBetas()

-    percentBfactors = rankSortProteinData(bfactors, inverted=False)
+#     percentBfactors = rankSortProteinData(bfactors, inverted=False)

-    if(attenuate.lower()=='true'):
-        percentBfactors = attenuateEndPoints(percentBfactors)
+#     if(attenuate.lower()=='true'):
+#         percentBfactors = attenuateEndPoints(percentBfactors)

-    # if(outfile != None):
-    #     np.savetxt(outfile, percentBfactors)
-    # else:
-    #     np.savetxt(outfile, dfi)
+#     # if(outfile != None):
+#     #     np.savetxt(outfile, percentBfactors)
+#     # else:
+#     #     np.savetxt(outfile, dfi)

-    if(ranksorted.lower()=='true'):
-        if(inverted==True):
-            return (1.0 - percentBfactors)
-        else:
-            return percentBfactors
-    else:
-        if(inverted==True):
-            return (np.max(bfactors) - bfactors)
-        else:
-            return bfactors
+#     if(ranksorted.lower()=='true'):
+#         if(inverted==True):
+#             return (1.0 - percentBfactors)
+#         else:
+#             return percentBfactors
+#     else:
+#         if(inverted==True):
+#             return (np.max(bfactors) - bfactors)
+#         else:
+#             return bfactors
    
 def attenuateEndPoints(dfi):
    """
@@ -646,17 +648,17 @@ def check_argument_groups(parser, arg_dict, group, argument):
    """
    c = 0
    for myArg in argument:
-    	arg_name = myArg.replace("-", "")
-    	if arg_dict[arg_name]!='':
-    		c = c+1
+        arg_name = myArg.replace("-", "")
+        if arg_dict[arg_name]!='':
+            c = c+1
    group_name = group.replace("-", "")
    if arg_dict[group_name]=="input":
    	if c!=1:
-    		parser.error("gemme requires " + str(argument) +
+    		parser.error("esgemme requires " + str(argument) + \
                         " if " + group + " is set to input.")
    else:
        if c>0:
-            parser.error("gemme requires " + group +
+            parser.error("esgemme requires " + group +
                         " to be set to input if " + str(argument) + " is used.")
    return None
 def calculateSecondaryStructure(pdbfile):
@@ -674,8 +676,7 @@ def calculateSecondaryStructure(pdbfile):

        A structure file in PDB format is the main input.
    """
-    import biotite.structure.io as strucio
-    import biotite.application.dssp as dssp
+

    #pdbfile: Name of the input pdb file
    #outfile=Name of outputfile that contain secondary structure information= pdbfile+".dssp"
@@ -724,23 +725,23 @@ def countCoilSegments(inputfile):

 def parse_command_line():
    """
-    Parse command line.
-    It uses argparse to parse phylosofs' command line arguments and returns the
-    argparse parser.
+        Parse command line.
+        It uses argparse to parse esgemme' command line arguments and returns the
+        argparse parser.
    """
    parser = argparse.ArgumentParser(
        prog="gemme",
        description="""
-        GEMME (Global Epistasis Model for predicting Mutational Effects) 
-        is a tool to predict mutational outcomes based on sequence analysis
-        """,
+            ESGEMME (Evolutionary and Structural Global Epistasis Model for predicting Mutational Effects) 
+            is a tool to predict mutational outcomes based on sequence and structure analysis
+            """,
        epilog="""
-        It has been developed at LCQB (Laboratory of Computational and
-        Quantitative Biology), UMR 7238 CNRS, Sorbonne Université.
-        If you use it, please cite:
-        Laine E, Karami Y, Carbone A. Predicting Protein Mutational Landscapes
-        using Evolutionary Conservation and Global Epistasis
-        """,
+            It has been developed at LCQB (Laboratory of Computational and
+            Quantitative Biology), UMR 7238 CNRS, Sorbonne Université.
+            If you use it, please cite:
+            Laine E, Karami Y, Carbone A. Predicting Protein Mutational Landscapes
+            using Evolutionary Conservation and Global Epistasis
+            """,
    )

    parser.add_argument(
@@ -814,7 +815,7 @@ def parse_command_line():
 
    # Check flag arguments
    if args.input is None:
-    	parser.error("GEMME requires an input alignment.")
+        parser.error("ESGEMME requires an input alignment.")

    return args

@@ -855,11 +856,13 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,

    structure = parsePDB(prot+".pdb")
    calphas = structure.select('name CA')
-
+    
    chains = list(set(calphas.getChids()))
    
    #print(chains)

+    # pdb_file = pdb.PDBFile.read(prot+".pdb")
+    # chains = list(set(pdb_file.get_chains()))
    if((jetfile) == None):
        #I intend to run JET2 completely externally!!
        #It is too much buggy and it has too many dependencies. 
@@ -878,22 +881,22 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,
            shutil.copy2(jetfile, prot+"_jet.res")
        print("done")

-    #If a real pdb file is given, calculate dfi for the residues. 
-    if(((normWeightMode=='dfi') or (normWeightMode=='maxtracedfi') or (normWeightMode=='maxdfitrace'))):
-        if (pdbfile == None):
-            print("ERROR: There is not any pdb file to calculate DFI.")
-            sys.exit(-1)
-        else:
-            print("Calculating %DFI data per residue!")
-            dfi = calcPercentDFI(pdbfile, outfile=None, nmodes=None, attenuate="true", ranksorted="true")
-            dfi = dfi
-            df = pd.read_table(prot+"_jet.res")
-            print(df)
-
-            df['dfi'] = dfi.round(4)
-            df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
-
-            print(df)
+    # #If a real pdb file is given, calculate dfi for the residues. 
+    # if(((normWeightMode=='dfi') or (normWeightMode=='maxtracedfi') or (normWeightMode=='maxdfitrace'))):
+    #     if (pdbfile == None):
+    #         print("ERROR: There is not any pdb file to calculate DFI.")
+    #         sys.exit(-1)
+    #     else:
+    #         print("Calculating %DFI data per residue!")
+    #         dfi = calcPercentDFI(pdbfile, outfile=None, nmodes=None, attenuate="true", ranksorted="true")
+    #         dfi = dfi
+    #         df = pd.read_table(prot+"_jet.res")
+    #         print(df)
+
+    #         df['dfi'] = dfi.round(4)
+    #         df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
+
+    #         print(df)
    if((normWeightMode=='ssjetormax')):
        if (pdbfile == None):
            print("ERROR: There is not any pdb file to calculate DFI.")
@@ -1034,6 +1037,6 @@ if (__name__ == '__main__'):

    toc = time.perf_counter()
    
-    print(f"S-GEMME computation finished in {toc - tic:0.4f} seconds!")
+    print(f"ESGEMME computation finished in {toc - tic:0.4f} seconds!")
    

--- a/example/example-sgemme-script.sh
+++ b/example/example-sgemme-script.sh