Added sstjetormaxtwocomponent as default normweightmode.

9175fdba · Mustafa Tekpinar · 7447bfab · 9175fdba · 9175fdba · 9175fdba
Commit 9175fdba authored Jul 07, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 235 additions and 11 deletions

installation.rst docs/installation.rst +51 -7

computePred.R esgemme/computePred.R +31 -2

esgemme.py esgemme/esgemme.py +153 -2

No files found.
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -15,12 +15,30 @@ ESGEMME has the following external dependencies:

  * java
  * naccess: http://www.bioinf.manchester.ac.uk/naccess/
-  * muscle: https://www.drive5.com/muscle/

-* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
-* dssp for secondary structure prediction.
+    
+After you have installed JET2, define an environment variable called JET2_PATH inside your .profile file.
+You can open .profile as follows:

-These tools should be installed to be able to use ESGEMME.
+.. code:: bash
+
+	  gedit ~/.profile
+
+You should add a command like below to the end of that file, save and exit.
+
+.. code:: bash
+	  
+	  export JET2_PATH=/home/tekpinar/JET2/
+
+Please do not forget to replace /home/tekpinar/JET2/ with the path to your own JET2 installation.
+
+Then, source the saved .profile so that the environment variable will be taken into account:
+
+.. code:: bash
+
+	  source ~/.profile
+
+JET2 is essential: it must be installed to be able to use ESGEMME.

 Preparation of the environment and installation of ESGEMME
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -88,16 +106,41 @@ Please note that default dssp in Ubuntu 22.04 is not working properly.
 Check the location of hhsuite folder and add it to your path
 In my case it was in /home/tekpinar/research/lcqb folder. Therefore, I added the following line
 to my .profile file. 
-PATH="/home/tekpinar/research/lcqb/hhsuite/bin:/home/tekpinar/research/lcqb/hhsuite/scripts:$PATH"
+Open the .profile file with gedit:
+
+.. code:: bash

-Then
-source ~/.profile
+	  gedit ~/.profile
+
+Now, add the following line to the end of the file.
+
+.. code:: bash
+	  
+	  PATH="/home/tekpinar/research/lcqb/hhsuite/bin:/home/tekpinar/research/lcqb/hhsuite/scripts:$PATH"
+
+Of course, your path will not be /home/tekpinar/research/lcqb/, so you have to modify the path according to
+your system. Save the file and exit. Then, source the file:
+
+.. code:: bash
+	  
+	  source ~/.profile

 #

   cd ESGEMME

 #Download ESGEMME from http://gitlab.lcqb.upmc.fr/tekpinar/ESGEMME repository and go inside the ESGEMME folder.
+You can download the master version using command line as follows:
+
+.. code:: bash
+
+	  git clone http://gitlab.lcqb.upmc.fr/tekpinar/ESGEMME.git
+
+If you would like the development version:
+
+.. code:: bash
+	  
+	  git clone -b development http://gitlab.lcqb.upmc.fr/tekpinar/ESGEMME.git
+	  
+
 .. code:: bash

   cd ESGEMME
@@ -115,6 +158,7 @@ file according to your system.
   cd ../

 #Installing the required R packages
+
 .. code:: bash

   sudo Rscript -e 'install.packages("seqinr", repos="http://cran.us.r-project.org", dependencies=TRUE)'

--- a/esgemme/computePred.R
+++ b/esgemme/computePred.R
@@ -84,9 +84,38 @@ if ((normWeightMode=="max")){
      quit(status=-1)
    } 
  }
-
+}else if (normWeightMode=="sstjetormaxtwocomponent"){ 
+  print("Using only sstjetormaxtwocomponent")
+  for (row in 1:nrow(jet)) {
+    if(sum(colnames(jet)=="sstjetormaxtwocomponent")==1){
+      trace<-append(trace, jet[row, "sstjetormaxtwocomponent"])
+    }else{
+      print("No field called sstjetormaxtwocomponent in the JET output!")
+      quit(status=-1)
+    } 
+  }
+}else if (normWeightMode=="sstjetormaxthirdchanged"){ 
+  print("Using only sstjetormaxthirdchanged")
+  for (row in 1:nrow(jet)) {
+    if(sum(colnames(jet)=="sstjetormaxthirdchanged")==1){
+      trace<-append(trace, jet[row, "sstjetormaxthirdchanged"])
+    }else{
+      print("No field called sstjetormaxthirdchanged in the JET output!")
+      quit(status=-1)
+    } 
+  }
+}else if (normWeightMode=="sstjetormaxthirdpcstar"){ 
+  print("Using only sstjetormaxthirdpcstar")
+  for (row in 1:nrow(jet)) {
+    if(sum(colnames(jet)=="sstjetormaxthirdpcstar")==1){
+      trace<-append(trace, jet[row, "sstjetormaxthirdpcstar"])
+    }else{
+      print("No field called sstjetormaxthirdpcstar in the JET output!")
+      quit(status=-1)
+    } 
+  }
 }else if (normWeightMode=="tjetormax"){ 
-  print("Using tjetormax with inverse CV")
+  print("Using tjetormax with inverse")
  for (row in 1:nrow(jet)) {
    if(sum(colnames(jet)=="traceMax")==1){
 	    trace<-append(trace, max((jet[row, "traceMax"]+jet[row, "pc"])/2.0, max((jet[row, "traceMax"]+1.0-jet[row, "cv"])/2.0, (jet[row, "pc"]+1.0-jet[row, "cv"])/2.0 )))

--- a/esgemme/esgemme.py
+++ b/esgemme/esgemme.py
@@ -936,8 +936,8 @@ def parse_command_line():
    
    parser.add_argument('--normweightmode', dest='normweightmode', type=str, \
        help="It can be one of these: 'tjet', 'cv', 'pc',"+\
-            "max, tjetormax or sstjetormax. Default is 'tjet'.",
-        required=False, default="tjet")
+            "max, tjetormax or sstjetormaxtwocomponent. Default is 'tjet'.",
+        required=False, default="sstjetormaxtwocomponent")

    parser.add_argument('--verbose', dest='verbose', type=bool, \
        help="This argument controls amount of the output. Default is 'False'."+\
@@ -1004,6 +1004,9 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,
        (normWeightMode != 'pc') and \
        (normWeightMode != 'max') and \
        (normWeightMode != 'tjetormax') and \
+        (normWeightMode != 'sstjetormaxtwocomponent') and \
+        (normWeightMode != 'sstjetormaxthirdchanged') and \
+        (normWeightMode != 'sstjetormaxthirdpcstar') and \
        (normWeightMode != 'sstjetormax')):
        print("ERROR: normWeightMode can only be 'tjet', 'cv', 'pc', "+\
              "'max', 'tjetormax' or 'sstjetormax'!")
@@ -1073,6 +1076,154 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,
            df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
            #sys.exit(-1)

+    if((normWeightMode=='sstjetormaxtwocomponent')):
+        if (pdbfile == None):
+            print("ERROR: There is not any pdb file.")
+            sys.exit(-1)
+        else:
+            calculateSecondaryStructure(pdbfile)
+            countCoilSegments(pdbfile+".dssp")
+
+            df = pd.read_table(prot+"_jet.res", sep="\s+")
+
+            df2 = pd.read_table(pdbfile+".dssp.new", header=None, sep=",")
+            df2.columns = ['pos', 'ss', 'length']
+
+            mergedRes = pd.merge(df, df2, on ='pos', right_index=False)
+
+            if(debug):
+                print(df['pos'])
+                print(pdbfile+".dssp")
+                print(os.getcwd())
+                print(df2)            
+                print(mergedRes)
+
+            sstjetormaxList = []
+            # maxCoilLength = 5
+            print("WARNING: Max. coil length = {}".format(maxCoilLength))
+            for index, row in mergedRes.iterrows():
+                if(row['ss']=='C') and (row['length']>maxCoilLength):
+                    sstjetormaxList.append(row['trace'])
+                else:
+                    maxVal = max([((row['trace']+row['pc'])/2.0), ((row['trace']+row['cv'])/2.0)])
+                    sstjetormaxList.append(maxVal)
+            #sstjetormaxList=rankSortProteinData(sstjetormaxList, inverted=False)
+            df['sstjetormaxtwocomponent'] = sstjetormaxList
+            df['sstjetormaxtwocomponent'] = df['sstjetormaxtwocomponent'].round(decimals = 4)
+            df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
+            #sys.exit(-1)
+
+    if((normWeightMode=='sstjetormaxthirdchanged')):
+        if (pdbfile == None):
+            print("ERROR: There is not any pdb file.")
+            sys.exit(-1)
+        else:
+            calculateSecondaryStructure(pdbfile)
+            countCoilSegments(pdbfile+".dssp")
+
+            df = pd.read_table(prot+"_jet.res", sep="\s+")
+
+            df2 = pd.read_table(pdbfile+".dssp.new", header=None, sep=",")
+            df2.columns = ['pos', 'ss', 'length']
+
+            mergedRes = pd.merge(df, df2, on ='pos', right_index=False)
+
+            if(debug):
+                print(df['pos'])
+                print(pdbfile+".dssp")
+                print(os.getcwd())
+                print(df2)            
+                print(mergedRes)
+
+            sstjetormaxList = []
+            # maxCoilLength = 5
+            print("WARNING: Max. coil length = {}".format(maxCoilLength))
+            for index, row in mergedRes.iterrows():
+                if(row['ss']=='C') and (row['length']>maxCoilLength):
+                    sstjetormaxList.append(row['trace'])
+                else:
+                    maxVal = max([((row['trace']+row['pc'])/2.0), ((row['trace']+row['cv'])/2.0), ((row['trace']))])
+                    sstjetormaxList.append(maxVal)
+            #sstjetormaxList=rankSortProteinData(sstjetormaxList, inverted=False)
+            df['sstjetormaxthirdchanged'] = sstjetormaxList
+            df['sstjetormaxthirdchanged'] = df['sstjetormaxthirdchanged'].round(decimals = 4)
+            df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
+    if((normWeightMode=='sstjetormaxthirdpcstar')):
+        if (pdbfile == None):
+            print("ERROR: There is not any pdb file.")
+            sys.exit(-1)
+        else:
+            from collections import OrderedDict
+            pcStar = OrderedDict()
+            pcStar={'A':0.38,
+                    'V':0.54,
+                    'L':0.45,
+                    'I':0.60,
+                    'P':0.18,
+                    'F':0.50,
+                    'W':0.27,
+                    'M':0.40,
+                    'G':0.36,
+                    'S':0.23,
+                    'T':0.24,
+                    'C':0.45,
+                    'Y':0.15,
+                    'N':0.13,
+                    'Q':0.07,
+                    'D':0.15,
+                    'E':0.18,
+                    'K':0.04,
+                    'R':0.02,
+                    'H':0.17}
+            oneLetter2ThreeLetters = {'C': 'CYS', 'D': 'ASP', 'S': 'SER',  'Q': 'GLN', 'K': 'LYS',
+                                    'I': 'ILE', 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN', 
+                                    'G': 'GLY',  'H': 'HIS', 'L': 'LEU', 'R': 'ARG', 'W': 'TRP', 
+                                    'A': 'ALA', 'V': 'VAL', 'E': 'GLU', 'Y': 'TYR', 'M': 'MET'}
+
+            threeLetters2oneLetter = {v: k for k, v in oneLetter2ThreeLetters.items()}
+
+            calculateSecondaryStructure(pdbfile)
+            countCoilSegments(pdbfile+".dssp")
+
+            df = pd.read_table(prot+"_jet.res", sep="\s+")
+            pcstarList = []
+            for index, row in df.iterrows():
+                print(row['pos'], row['AA'])
+                singleLetterCode = threeLetters2oneLetter[row['AA']]
+                pcstarList.append(pcStar[singleLetterCode])
+            import numpy as np
+            df['pcstar'] = list(np.array(pcstarList)/(np.array(pcstarList).max()))
+
+            print(df)
+
+            df2 = pd.read_table(pdbfile+".dssp.new", header=None, sep=",")
+            df2.columns = ['pos', 'ss', 'length']
+
+            mergedRes = pd.merge(df, df2, on ='pos', right_index=False)
+
+            if(debug):
+                print(df['pos'])
+                print(pdbfile+".dssp")
+                print(os.getcwd())
+                print(df2)            
+                print(mergedRes)
+
+            sstjetormaxList = []
+            # maxCoilLength = 5
+            print("WARNING: Max. coil length = {}".format(maxCoilLength))
+            for index, row in mergedRes.iterrows():
+                if(row['ss']=='C') and (row['length']>maxCoilLength):
+                    sstjetormaxList.append(row['trace'])
+                else:
+                    maxVal = max([((row['trace']+row['pc'])/2.0), ((row['trace']+row['cv'])/2.0), (   (row['cv']+row['pcstar'])/2.0   )])
+                    sstjetormaxList.append(maxVal)
+            #sstjetormaxList=rankSortProteinData(sstjetormaxList, inverted=False)
+            df['sstjetormaxthirdpcstar'] = sstjetormaxList
+            df['sstjetormaxthirdpcstar'] = df['sstjetormaxthirdpcstar'].round(decimals = 4)
+
+            df.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
+            #sys.exit(-1)
+
    # #If a real pdb file is given, calculate dfi for the residues. 
    # if(((normWeightMode=='tracemovingaverage'))):
    #     print("Calculating trace moving average per residue!")