Commit ce822ebf by DLA

Updates

parent d1948723
@@ -13,10 +13,8 @@ import gc
from os import path, mkdir, getenv, listdir, remove, system, stat
import pandas as pd
import numpy as np
#from prody import *
import glob
import shutil
#import matplotlib.pyplot as plt
import seaborn as sns
from math import exp
import subprocess
@@ -35,10 +33,8 @@ from sklearn.preprocessing import OneHotEncoder
import tensorflow.keras
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.datasets import mnist # subroutines for fetching the MNIST dataset
from tensorflow.keras.models import Model, Sequential,load_model # basic class for specifying and training a neural network
from tensorflow.keras.models import Model, Sequential,load_model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, AveragePooling3D
#from tensorflow.keras.utils import np_utils # utilities for one-hot encoding of ground truth values
from tensorflow.keras.layers import Dot
from tensorflow.keras.backend import ones, ones_like
@@ -99,14 +95,12 @@ three_letter ={'V':'VAL', 'I':'ILE', 'L':'LEU', 'E':'GLU', 'Q':'GLN', \
v_dim = 24
#map_dir = 'validations'
#map_dir = 'map_dir_skempi_wt'
map_dir = 'map_dir_skempi_wt_nomask_4channels'
map_dir = '../Examples/map_dir'
output_file = 'output_xray_skempi_wt_nomask_4channels.csv'
intermediate_file = 'intermediate_xray_skempi_wt_nomask_200_4channels.csv'
output_file = 'output_xray_wt_mask.csv'
intermediate_file = 'intermediate_xray_wt_mask_200.csv'
model = load_model(path.join('models5_nonorm_classweight_Porlineweight08_Others_4channels', '0_733_model'))
model = load_model(path.join('Models', 'ssDLA_model'))
def load_map(sample_test):
check_call(
@@ -118,7 +112,7 @@ def load_map(sample_test):
X, y, reg_type, res_pos, res_name, inter_info = load_obj(sample_test.replace('.pkl.lz4',''))
#Filter features (SCR and RL)
X = X[:,:,:,:,:4]
X = X[:,:,:,:,:167]
remove(sample_test.replace('.lz4',''))
return X, y, reg_type, res_pos, res_name, inter_info
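The only functional change in this hunk pair (and its mirror in the next file) is the feature-channel slice: the generator writes 173 channels per voxel, and the prediction scripts trim them before inference; per the source comment, the trailing planes carry the SCR and R/L features. A minimal numpy sketch of what the slice does, with the channel counts and grid size taken from the source (the batch size of 10 is illustrative):

    import numpy as np

    v_dim = 24
    X = np.zeros((10, v_dim, v_dim, v_dim, 173))  # illustrative batch of raw voxel maps
    X_full = X[:, :, :, :, :167]     # 167-channel input for the full ssDLA model
    X_reduced = X[:, :, :, :, :4]    # 4-channel input for the ssDLA_model_4channels variant
    print(X_full.shape, X_reduced.shape)  # (10, 24, 24, 24, 167) (10, 24, 24, 24, 4)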
@@ -99,14 +99,12 @@ three_letter ={'V':'VAL', 'I':'ILE', 'L':'LEU', 'E':'GLU', 'Q':'GLN', \
v_dim = 24
#map_dir = 'validations'
#map_dir = 'map_dir_skempi_wt'
map_dir = 'map_dir_skempi_wt_mask_sphere5_randomcenter'
map_dir = '../Examples/map_dir'
output_file = 'output_xray_skempi_wt_mask_sphere5_randomcenter.csv'
intermediate_file = 'intermediate_xray_skempi_wt_mask_sphere5_randomcenter_200.csv'
output_file = 'output_xray_wt_mask_4channels.csv'
intermediate_file = 'intermediate_xray_wt_mask_200_4channels.csv'
model = load_model(path.join('models5_nonorm_classweight_Porlineweight08', '0_30_model'))
model = load_model(path.join('Models', 'ssDLA_model_4channels'))
def load_map(sample_test):
check_call(
@@ -118,7 +116,7 @@ def load_map(sample_test):
X, y, reg_type, res_pos, res_name, inter_info = load_obj(sample_test.replace('.pkl.lz4',''))
#Filter features (SCR and RL)
X = X[:,:,:,:,:167]
X = X[:,:,:,:,:4]
remove(sample_test.replace('.lz4',''))
return X, y, reg_type, res_pos, res_name, inter_info
@@ -82,8 +82,6 @@ def get_model(input_shape, input_shape_aux):
input_data = 'intermediate_backrub_nomask_200.csv'
#input_data = 'intermediate_xray_skempi_wt_mask_sphere5_randomcenter_200.csv'
output_confusionmatrix_svg = 'tf_confusionmatrix_aareducedalphabet_classification_' + input_data.replace('.csv','.svg')
@@ -339,4 +337,4 @@ def performance(y_test, test_preds, name):
performance(y_test_wt, test_preds_wt, 'wt')
performance(y_test_mut, test_preds_mut, 'mut')
\ No newline at end of file
performance(y_test_mut, test_preds_mut, 'mut')
@@ -81,10 +81,7 @@ def get_model(input_shape, input_shape_aux):
return _model
input_data = 'intermediate_xray_skempi_wt_nomask_200.csv'
#input_data = 'intermediate_xray_skempi_wt_mask_sphere5_randomcenter_200.csv'
input_data = 'intermediate_xray_wt_nomask_200.csv'
output_confusionmatrix_svg = 'tf_confusionmatrix_aareducedalphabet_classification_' + input_data.replace('.csv','.svg')
output_precisionrecall_svg = 'tf_precisionrecall_aareducedalphabet_classification_' + input_data.replace('.csv','.svg')
@@ -276,4 +273,4 @@ fig, ax = plt.subplots(figsize=(20,20))
sns.barplot(x='Classes', y='Performance', data=performance_df, hue='Metric', ax = ax)
plt.ylim(0,1)
plt.tight_layout()
plt.savefig(output_f1_svg)
\ No newline at end of file
plt.savefig(output_f1_svg)
@@ -92,15 +92,10 @@ def get_model(input_shape):
"""
X_in = Input(shape=input_shape)
#aux_input = Input(shape=input_shape_aux)
H = Dense(20, use_bias=True, activation='elu', kernel_initializer='he_uniform', kernel_regularizer=l2(1e-3), kernel_constraint=max_norm(4), bias_constraint=max_norm(4))(X_in)
#H = Dense(30, use_bias=True, activation='elu', kernel_initializer='he_uniform', kernel_regularizer=l2(1e-3), kernel_constraint=max_norm(4), bias_constraint=max_norm(4))(H)
#H = Dense(10, use_bias=True, activation='elu', kernel_initializer='he_uniform', kernel_regularizer=l2(1e-3), kernel_constraint=max_norm(4), bias_constraint=max_norm(4))(H)
#H = Concatenate()([H, aux_input])
Y = Dense(len(label_classes), activation='softmax')(H)
#_model = Model(inputs=[X_in, aux_input], outputs=Y)
_model = Model(inputs=[X_in], outputs=Y)
_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.00001))
_model.summary()
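For orientation, a hedged usage sketch of the `get_model` in this hunk; the input dimensionality, array shapes, and class names below are hypothetical placeholders, while `label_classes` itself is defined elsewhere in the script:

    import numpy as np

    label_classes = ['class_a', 'class_b', 'class_c']  # hypothetical stand-in
    X_train = np.random.rand(100, 20)                  # hypothetical feature vectors
    y_train = np.eye(len(label_classes))[np.random.randint(0, len(label_classes), 100)]

    model = get_model(input_shape=(20,))
    model.fit(X_train, y_train, epochs=5, batch_size=16)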
@@ -108,8 +103,7 @@ def get_model(input_shape):
return _model
input_data = 'intermediate_xray_skempi_wt_nomask_200.csv'
#input_data = 'intermediate_xray_skempi_wt_mask_sphere5_randomcenter_200.csv'
input_data = 'intermediate_xray_wt_nomask_200.csv'
@@ -294,4 +288,4 @@ fig, ax = plt.subplots(figsize=(20,20))
sns.barplot(x='Classes', y='Performance', data=performance_df, hue='Metric', ax = ax)
plt.ylim(0,1)
plt.tight_layout()
plt.savefig(output_f1_svg)
\ No newline at end of file
plt.savefig(output_f1_svg)
Comp;ch1;ch2
1IAR;A;B
1BRS;A;D
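`complex_list.txt` is a semicolon-separated table mapping each complex ID to its two chain groups. The map-generation script below loads it with pandas; a minimal sketch of pulling the chain pair for one entry, matching the lookup in `process_targetcomplex`:

    import pandas as pd

    comp_dict = pd.read_csv('../Examples/complex_list.txt', sep=';')
    ch1, ch2 = comp_dict.loc[comp_dict.Comp == '1IAR'][['ch1', 'ch2']].iloc[0]
    print(ch1, ch2)  # 'A' 'B', per the 1IAR row above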
@@ -89,52 +89,11 @@ def save_pdb_interface(pdbfile, map_name, interface):
def mapcomplex(wt_pdb, mut_pdb, protein_complex, mutate_complex, ddg, model_ind, mut_name, interaction_region, complex_type, experimental_method):
try:
#name = protein_complex + '--' + mutate_complex + '--' + str(model_ind)
name = mut_name + '--' + str(model_ind)
#Attention: here the R/L feature isn't really meaningful! So we don't use it!
"""
p1 = parsePDB(wt_pdb).select('protein').select("chain " + ' '.join(protein_complex.split('_')[1]))
p2 = parsePDB(wt_pdb).select('protein').select("chain " + ' '.join(protein_complex.split('_')[2]))
writePDB(name+'_complex_p1.pdb', p1.toAtomGroup())
writePDB(name+'_complex_p2.pdb', p2.toAtomGroup())
writePDB(name+'_complex.pdb', p1.toAtomGroup() + p2.toAtomGroup())
scr.get_scr(name+'_complex_p1.pdb', name+'_complex_p2.pdb', name+'_complex.pdb', name)
rimcoresup = pd.read_csv(name+'_rimcoresup.csv', header=None, sep=' ')
interface = []
with open('scrinfo','w') as fh_csr:
for index, row in rimcoresup.iterrows():
fh_csr.write(str(row[2])+';'+row[1]+';'+row[5]+'\n')
#Chain ID, Resnum, SCR
interface.append((row[1], row[2], row[5]))
"""
##########################################################################
"""
map_name = path.join(inter_dir_mut, mut_name, 'wt_' + name)
save_pdb_interface(wt_pdb, map_name, interface)
map_name = path.join(inter_dir_mut, mut_name, 'mut_' + name)
save_pdb_interface(mut_pdb, map_name, interface)
if model_ind == 1:
tl.save_obj((interface, ddg), map_name + '.interface')
remove(name+'_complex_p1.pdb')
remove(name+'_complex_p2.pdb')
remove(name+'_complex.pdb')
remove(name+'_rimcoresup.csv')
return
"""
##########################################################################
#mapcommand = [bin_path, "--mode", "map", "-i", wt_pdb, "--native", "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", wt_pdb.replace('pdb','bin')]
mapcommand = [bin_path, "--mode", "map", "-i", wt_pdb, "--native", "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", 'wt.bin']
subprocess.call(mapcommand)
#mapcommand = [bin_path, "--mode", "map", "-i", mut_pdb, "--native", "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", mut_pdb.replace('pdb','bin')]
mapcommand = [bin_path, "--mode", "map", "-i", mut_pdb, "--native", "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", 'mut.bin']
subprocess.call(mapcommand)
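# The two calls above differ only in the input PDB and output name, so they
# could be factored into a small helper (a sketch; bin_path and v_dim are
# module-level names in this script, and the flag values are copied verbatim):
#
#   def run_map(pdb_in, bin_out):
#       subprocess.call([bin_path, "--mode", "map", "-i", pdb_in, "--native",
#                        "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", bin_out])
#
#   run_map(wt_pdb, 'wt.bin')
#   run_map(mut_pdb, 'mut.bin')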
@@ -165,9 +124,6 @@ def mapcomplex(wt_pdb, mut_pdb, protein_complex, mutate_complex, ddg, model_ind,
#remove(name+'_rimcoresup.csv')
##########################################################################
y_hotencode = encoder.transform([[three_letter[mutate_complex[0]]]])
map_name = path.join(map_dir_mut_sep, mut_name, '1', 'wt_' + name)
@@ -252,8 +208,7 @@ def manage_mut_files(use_multiprocessing):
mkdir(map_dir_mut_sep)
if not path.exists(inter_dir_mut):
mkdir(inter_dir_mut)
#mut_directory = path.join('/home/yasser/myThesis/ProtPartDisc/Mutation_3DCNN_Analysis/skempi', 'mutations')
mut_directory = '/media/yasser/YasserMB2/Backup_myThesis_ALL/Backup_Backrub_myThesis/Temp/output_bu'
mut_directory = 'backrub_models_directory'
muts = listdir(mut_directory)
mut_cases = []
for mut in muts:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 2 20:21:06 2022
@author: yasser
"""
import logging
import os
import sys
from os import path, mkdir, getenv, listdir, remove, system, stat
import pandas as pd
import numpy as np
from prody import *
import glob
import shutil
#import matplotlib.pyplot as plt
import seaborn as sns
from math import exp
from subprocess import CalledProcessError, check_call, call
import traceback
from random import shuffle, random, seed, sample
from numpy import newaxis
import matplotlib.pyplot as plt
import time
import collections
import scr
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import subprocess
import load_data as load
from sklearn.preprocessing import MinMaxScaler
logging.basicConfig(filename='manager.log', filemode='w', format='%(levelname)s: %(message)s', level=logging.DEBUG)
mainlog = logging.getLogger('main')
sys.path.insert(1, '../lib/')
import tools as tl
comp_dict = pd.read_csv('../Examples/complex_list.txt', sep=';')
comp_dir = '../Examples/complex_directory'
target_comp = listdir(comp_dir)
map_dir = '../Examples/map_dir'
if not path.exists(map_dir):
mkdir(map_dir)
bin_path = "./maps_generator_mask_spherer5A"
v_dim = 24
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot = encoder.fit(asarray([['CYS'], ['ASP'], ['SER'], ['GLN'], ['LYS'], ['ILE'], ['PRO'],
['THR'], ['PHE'], ['ASN'], ['GLY'], ['HIS'], ['LEU'], ['ARG'],
['TRP'], ['ALA'], ['VAL'], ['GLU'], ['TYR'], ['MET']]))
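# Sanity-check sketch: transforming one residue name yields a single
# 20-dimensional one-hot row; column order follows encoder.categories_,
# which scikit-learn sorts alphabetically over the names fitted above.
#   row = encoder.transform([['CYS']])  # shape (1, 20), a single 1.0 in the CYS column
#   assert row.shape == (1, 20) and row.sum() == 1.0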
def mapcomplex(file, ch1, ch2, targetcomplex):
try:
name = targetcomplex
rec = parsePDB(file).select('protein').select('chain ' + ch1[0])
rec.setChids('R')
lig = parsePDB(file).select('protein').select('chain ' + ch2[0])
lig.setChids('L')
writePDB(name+'_r.pdb', rec.toAtomGroup())
writePDB(name+'_l.pdb', lig.toAtomGroup())
writePDB(name+'_complex.pdb', rec.toAtomGroup() + lig.toAtomGroup())
scr.get_scr(name+'_r.pdb', name+'_l.pdb', name+'_complex.pdb', name)
rimcoresup = pd.read_csv(name+'_rimcoresup.csv', header=None, sep=' ')
rec_regions = rimcoresup.loc[rimcoresup[4] == 'receptor']
rec_regions = pd.Series(rec_regions[5].values, index=rec_regions[2]).to_dict()
lig_regions = rimcoresup.loc[rimcoresup[4] == 'ligand']
lig_regions = pd.Series(lig_regions[5].values, index=lig_regions[2]).to_dict()
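# Column layout of <name>_rimcoresup.csv (headerless; inferred from the indexing
# here and from the scrinfo writer in the mutation-pipeline diff above):
# col 1 = chain ID, col 2 = residue number, col 4 = 'receptor'/'ligand',
# col 5 = SCR region label (support/core/rim).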
res_num2name_map_rec = dict(zip(rec.getResnums(),rec.getResnames()))
res_num2name_map_lig = dict(zip(lig.getResnums(),lig.getResnames()))
res_num2coord_map_rec = dict(zip(rec.select('ca').getResnums(),rec.select('ca').getCoords()))
res_num2coord_map_lig = dict(zip(lig.select('ca').getResnums(),lig.select('ca').getCoords()))
L1 = list(set(rec.getResnums()))
res_ind_map_rec = dict([(x,inx) for inx, x in enumerate(sorted(L1))])
L1 = list(set(lig.getResnums()))
res_ind_map_lig = dict([(x,inx+len(res_ind_map_rec)) for inx, x in enumerate(sorted(L1))])
res_inter_rec = [(res_ind_map_rec[x], rec_regions[x], x, 'R', res_num2name_map_rec[x], res_num2coord_map_rec[x])
for x in sorted(list(rec_regions.keys())) if x in res_ind_map_rec]
res_inter_lig = [(res_ind_map_lig[x], lig_regions[x], x, 'L', res_num2name_map_lig[x], res_num2coord_map_lig[x])
for x in sorted(list(lig_regions.keys())) if x in res_ind_map_lig]
reg_type = list(map(lambda x: x[1],res_inter_rec)) + list(map(lambda x: x[1],res_inter_lig))
res_name = list(map(lambda x: [x[4]],res_inter_rec)) + list(map(lambda x: [x[4]],res_inter_lig))
res_pos = list(map(lambda x: x[5],res_inter_rec)) + list(map(lambda x: x[5],res_inter_lig))
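# Each res_inter_* entry is a 6-tuple:
#   (global residue index, SCR region, residue number, 'R'/'L' chain tag,
#    three-letter residue name, CA coordinates)
# so the projections above read x[1] -> region, x[4] -> residue name (wrapped
# in a list for the one-hot encoder), and x[5] -> CA position.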
#Merge these two files!
with open('resinfo','w') as fh_res:
for x in res_inter_rec:
fh_res.write(str(x[2])+';'+x[3]+'\n')
for x in res_inter_lig:
fh_res.write(str(x[2])+';'+x[3]+'\n')
with open('scrinfo','w') as fh_csr:
for x in res_inter_rec:
fh_csr.write(str(x[2])+';'+x[3]+';'+x[1]+'\n')
for x in res_inter_lig:
fh_csr.write(str(x[2])+';'+x[3]+';'+x[1]+'\n')
if not res_inter_rec or not res_inter_lig:
return [],[],[]
#tl.coarse_grain_pdb('train.pdb')
mapcommand = [bin_path, "--mode", "map", "-i", name+'_complex.pdb', "--native", "-m", str(v_dim), "-t", "167", "-v", "0.8", "-o", name+'_complex.bin']
call(mapcommand)
dataset_train = load.read_data_set(name+'_complex.bin')
print(dataset_train.maps.shape)
#scaler = MinMaxScaler()
#scaler.fit(dataset_train.maps)
#data_norm = scaler.transform(dataset_train.maps)
data_norm = dataset_train.maps
X = np.reshape(data_norm, (-1,v_dim,v_dim,v_dim,173))
y = encoder.transform(res_name)
map_name = path.join(map_dir, targetcomplex, name)
tl.save_obj((X,y,reg_type,res_pos,res_name,res_inter_rec+res_inter_lig), map_name)
check_call(
[
'lz4', '-f',  # '--rm' omitted because of inconsistencies across lz4 versions
map_name + '.pkl'
],
stdout=sys.stdout)
remove(map_name + '.pkl')
remove(name+'_complex.bin')
remove(name+'_r.pdb')
remove(name+'_l.pdb')
remove(name+'_complex.pdb')
remove(name+'_rimcoresup.csv')
print(type(X))
print(X.shape)
except Exception as e:
logging.info("Bad interface!" + '\nError message: ' + str(e) +
"\nMore information:\n" + traceback.format_exc())
return [],[],[]
return X, y, reg_type
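# Read-back sketch for the .pkl.lz4 written above (assumption: tools.load_obj
# is the pickle-loading counterpart of tl.save_obj, as the load_map functions
# in the prediction scripts suggest):
#   check_call(['lz4', '-d', '-f', map_name + '.pkl.lz4', map_name + '.pkl'])
#   X, y, reg_type, res_pos, res_name, inter_info = tl.load_obj(map_name)
#   remove(map_name + '.pkl')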
already_exist = listdir(map_dir)
def process_targetcomplex(targetcomplex, comp_dir, report_dict):
try:
if targetcomplex in already_exist:
return
logging.info('Processing ' + targetcomplex + ' ...')
pos_path = path.join(map_dir, targetcomplex, '1')
neg_path = path.join(map_dir, targetcomplex, '0')
if not path.exists(path.join(map_dir, targetcomplex)):
mkdir(path.join(map_dir, targetcomplex))
if not path.exists(pos_path):
mkdir(pos_path)
if not path.exists(neg_path):
mkdir(neg_path)
ch1, ch2 = comp_dict.loc[comp_dict.Comp == targetcomplex][['ch1','ch2']].iloc[0]
file = path.join(comp_dir, targetcomplex + '.pdb')
mapcomplex(file, ch1, ch2, targetcomplex)
except Exception as e:
logging.info("Bad target complex!" + '\nError message: ' + str(e) +
"\nMore information:\n" + traceback.format_exc())
def manage_pair_files(use_multiprocessing):
tc_cases = []
for tc in target_comp:
tc_cases.append((tc, comp_dir))
report_dict = tl.do_processing(tc_cases, process_targetcomplex, use_multiprocessing)
return report_dict
report_dict = manage_pair_files(False)
\ No newline at end of file
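`tl.do_processing` lives in the repository's `../lib/tools.py`, which this commit does not display. A minimal sketch of the interface this script relies on, under the assumption that it simply dispatches each case tuple to the worker; note that in the multiprocessing branch the shared report dict would not be visible across processes, and the serial branch is what `manage_pair_files(False)` exercises:

    from multiprocessing import Pool

    def do_processing(cases, worker, use_multiprocessing):
        # cases: list of argument tuples, e.g. (targetcomplex, comp_dir)
        report = {}
        if use_multiprocessing:
            with Pool() as pool:
                pool.starmap(worker, [case + (report,) for case in cases])
        else:
            for case in cases:
                worker(*case, report)
        return report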