Commit 42fc3107 by DLA-Ranker

Updates

parent 24e2a357
python3 train_model_v2.py -c ~/save/conv_params/conv0.pkl --steps 20000 -t _95_2 --restore ~/save/tests_cluster/Model_95_1/
python3 evaluation.py -s gdt -d ~/data/CASP12_stage1/ -m ~/Temp/Model_95_2 -n _12stage_1
python3 evaluation.py -s cad -d ~/data/CASP12_stage1/ -m ~/Temp/Model_95_2 -n _12stage_1_cad
python3 evaluation.py -s gdt -d ~/data/CASP12_stage2/ -m ~/Temp/Model_95_2 -n _12stage_2
python3 evaluation.py -s cad -d ~/data/CASP12_stage2/ -m ~/Temp/Model_95_2 -n _12stage_2_cad
import numpy as np
import matplotlib.pyplot as plt
import os
import argparse
# Path to the directory where the losses files are located
directory_path = '/home/benoitch/save/tests_cluster/'
files = sorted(os.listdir(directory_path))
losses = []
indix = []
FLAGS = None
line_size = 0.9
def get_list():
"""
Format FLAGS.tests into a list of ints. argparse parses each -t argument as a
list of characters (because of type=list), so the characters are re-joined here.
:return: list of int, the test ids to plot
"""
ids = []
for l in FLAGS.tests:
id = ''
for i in l:
id += i
ids.append(int(id))
return ids
def recover_loss(s):
"""
Extract the raw loss from a smoothed loss list.
:param s: 1D list containing the smoothed loss, created during training of the model
:return: list containing the raw loss
"""
alpha = 0.999
l = np.zeros(s.size)
l[0] = s[0]/1000
for n in range(l.size):
if n == 0:
continue
l[n] = (1/(1-alpha))*((s[n]*(1-alpha**n)/1000) - (alpha*s[n-1]*(1-alpha**(n-1))/1000))
return l
def smooth(l):
"""
Smooth the loss
:param l: 1D list of the raw loss values
:return: 1D list of the smoothed loss values
"""
s = np.zeros(l.size)
slid = 0
decay = 0.999
dln = 1
for i in range(l.size):
dln = dln*decay
slid = decay*slid + (1-decay)*l[i]
s[i] = 1000* slid / (1 - dln)
return s
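# Note: the .npy loss files saved during training contain an exponentially smoothed,
# bias-corrected loss (scaled by 1000). recover_loss() approximately inverts that
# smoothing so the raw per-step loss can be recovered and then re-smoothed locally
# with smooth() before plotting.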
def main(ids):
for f in files:
f = directory_path + f
# These two conditions are used to plot the reference loss (simply predicting the mean of what has been seen so far)
# (The files containing the reference loss are located in benoitch/save/tests_cluster/ and are called gt_losses.npy and gt_x_losses.npy)
if f.split('/')[-1][:4] == 'gt_l':
f__ = open(f, 'rb')
loss_gt = np.load(f__)
elif f.split('/')[-1][:4] == 'gt_x':
f__ = open(f, 'rb')
x_gt = np.load(f__)
# then collecting the ids of the tests in the directory
elif f[-3:] == 'npy':
f__ = open(f, 'rb')
loss = np.load(f__)
losses.append(loss)
# get the id of the file's origin network; the id is an int
f = f[:-4]
indix.append(int(f.split('/')[-1].split('_')[1])) # the id is the token after the first underscore in the file name
number = -1
for i,l in enumerate(losses):
# need to deal with losses from the same network's training split across two files
if not indix[i] in ids:
continue
if number == -1:
number = indix[i]
y = recover_loss(l)
x = range(l.size)
elif indix[i] == number:
y = np.append(y,recover_loss(l))
x = np.append(x, [x[-1] + n for n in range(l.size)])
else:
s = smooth(y)
plt.plot(x,s, alpha = 0.8, linewidth=line_size)
number = indix[i]
y = recover_loss(l)
x = range(l.size)
s = smooth(y)
plt.plot(x,s, alpha = 0.8, linewidth=line_size)
plt.plot(x_gt,loss_gt, alpha = 0.5, color='grey', linewidth=line_size)
plt.show()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'-t',
'--tests',
type=list,
required=True,
nargs='+',
help='List of tests ids to plot'
)
FLAGS = parser.parse_args()
ids = get_list()
main(ids)
import os
import shutil
class load_config:
def __init__(self):
# Must be in the same folder as three_dl
# loads the configuration from config file
f = open(os.path.join(os.path.dirname(os.path.abspath(__file__)),"config"), 'r')
config = f.read()
f.close()
config = config.split('\n')
for line in config:
if line != '' and line[0] != '#':
[name,var] = line.split('=')
name, var = name.replace(' ', ''), var.replace(' ', '')
self.__dict__[name] = var
# flushing the tensorboard repository
folder = self.TENSORBOARD_PATH
for the_file in os.listdir(folder):
file_path = os.path.join(folder, the_file)
try:
if os.path.isfile(file_path):
os.unlink(file_path)
except Exception as e:
print(e)
Ornate is a method for protein quality assessment.
To install Ornate:
-Install Python 3.5 or later
-Install Tensorflow (preferably with GPU support) from https://www.tensorflow.org/install
-Uncomment the line corresponding to your operating system in the "config" file
To run Ornate:
You can score one structure :
python score.py -s path/to/my_structure.pdb
or all structures in a directory :
python score.py -d path/to/my_directory
To interpret output:
The output is generally composed of multiple lines; each line corresponds to one residue.
A typical output line is:
RES 107 L 0.4041
which means that residue 107 of the input is a leucine and its score is 0.4041.
A high score should correspond to a well-folded residue.
Feel free to modify the script score.py to match your needs.
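For instance, a minimal sketch (not part of Ornate itself) that collects the per-residue
scores of a result file into a Python dictionary:

    scores = {}
    with open('path/to/my_structure.pdb.orn') as f:  # '.orn' is the default output suffix
        for line in f:
            if line.startswith('RES'):
                _, res_id, res_name, score = line.split()
                scores[int(res_id)] = float(score)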
import config
import subprocess
import os
import numpy as np
import scipy.stats
from scipy.stats.stats import pearsonr
import random
import math
import tensorflow as tf
import load_data
import model
import argparse
from glob import glob
def getAverageScoreSco(filename) :
scoreList=[]
#print(filename)
with open(filename) as fp:
lines = fp.readlines()
for l in lines :
#print(l)
for k in l.split(" "):
#print(k)
try:
scoreList.append(float(k))
except ValueError:
continue
if scoreList == [] :
return -1
return np.average(scoreList)
def getAverageScoreOut(filename) :
scoreList=[]
#print(filename)
with open(filename) as fp:
lines = fp.readlines()
for l in lines :
#print(l)#print(k)
try:
scoreList.append(float(l.split(" ")[4]))
except ValueError:
continue
if scoreList == [] :
return -1
return np.average(scoreList)
def getAverageScore(filename) :
scoreList=[]
#print(filename)
with open(filename) as fp:
lines = fp.readlines()
for l in lines :
#print(l)
#print(k)
try:
scoreList.append(float(l[10:]))
except ValueError:
continue
if scoreList == [] :
return -1
return np.average(scoreList)
def getDictScore(filename) :
scoreDict={}
#print(filename)
with open(filename) as fp:
lines = fp.readlines()
for l in lines :
#print(l)
#print(k)
try:
resId = int(l[3:8])
score= float(l[10:])
scoreDict[resId] = score
except ValueError:
continue
return scoreDict
def getDictScoreSco(filename) :
scoreDict={}
#print(filename)
with open(filename) as fp:
lines = fp.readlines()
for l in lines :
#print(l)
#print(k)
try:
resId = int(l.split("r<")[1].split(">")[0])
score= float(l.split(" ")[-1])
scoreDict[resId] = score
except ValueError:
continue
return scoreDict
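# Note on the two parsers above: getDictScore reads Ornate output lines such as
# 'RES  107 L 0.4041' (residue id in columns 3-8, score from column 10 on), while
# getDictScoreSco reads ground-truth lines where the residue id appears as 'r<107>'
# and the score is the last space-separated field.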
def main():
gtfiles = glob(FLAGS.directory + '/**/*'+FLAGS.suffix1, recursive=True)
scoresGT = []
scoresRes = []
diffScores = []
for f in gtfiles :
resFile = f[:-len(FLAGS.suffix1)]+FLAGS.suffix2
if os.path.exists(resFile) :
#scoreGT = getAverageScoreOut(f)
#scoreRes = getAverageScore(resFile)
scoreSco = getDictScoreSco(f)
scoreRes = getDictScore(resFile)
for key, value in scoreSco.items():
if key in scoreRes :
scoresGT.append(scoreSco[key])
scoresRes.append(scoreRes[key])
diffScores.append((scoreSco[key] - scoreRes[key])*(scoreSco[key] - scoreRes[key]))
#print(f)
#print(scoreGT)
#print(scoreRes)
#if scoresGT == [] :
# return
print(len(scoresGT))
print(pearsonr(scoresGT, scoresRes))
print(np.mean(diffScores)*1000)
print(1000*(np.std(diffScores))/math.sqrt(len(diffScores)))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'-d',
'--directory',
type=str,
help='Path to the validation data'
)
parser.add_argument(
'-s',
'--suffix1',
default=".cad",
type=str,
help='suffix to add to gt file'
)
parser.add_argument(
'-f',
'--suffix2',
default=".orn",
type=str,
help='suffix to add to result file'
)
FLAGS = parser.parse_args()
main()
# Config file for Ornate scoring function
#Please uncomment the line corresponding to your system
MAP_GENERATOR_PATH = bin/Linux/maps_generator
#MAP_GENERATOR_PATH = bin/Windows/maps_generator.exe
#MAP_GENERATOR_PATH = bin/MacOs/maps_generator
# Place to store temporary files
TEMP_PATH = /tmp/
# Place where the pre-trained model is located
MODEL_PATH = model/model.ckpt
import os
import shutil
class load_config:
def __init__(self):
# Must be in the same folder as three_dl
# loads the configuration from config file
f = open(os.path.join(os.path.dirname(os.path.abspath(__file__)),"config"), 'r')
config = f.read()
f.close()
config = config.split('\n')
for line in config:
if line != '' and line[0] != '#':
[name,var] = line.split('=')
name, var = name.replace(' ', ''), var.replace(' ', '')
self.__dict__[name] = var
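# Example usage (a minimal sketch; the attribute names come from the "config" file,
# e.g. MAP_GENERATOR_PATH, TEMP_PATH and MODEL_PATH as defined above):
#   CONF = load_config()
#   print(CONF.MODEL_PATH)   # e.g. model/model.ckpt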
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import numpy
from six.moves import xrange
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import random_seed
import tensorflow as tf
def _read32(bytestream):
dt = numpy.dtype(numpy.uint32)
return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]
def check_dims(f, gridSize, nbDim):
print('Check dimensions ', f.name, flush = True)
with f as bytestream:
headerSize = _read32(bytestream)
magic = _read32(bytestream)
if magic != 7919:
raise ValueError('Invalid magic number %d in maps file: %s' %
(magic, f.name))
rows = _read32(bytestream)
cols = _read32(bytestream)
lays = _read32(bytestream)
assert(rows == gridSize)
assert(cols == gridSize)
assert(lays == gridSize)
chan = _read32(bytestream)
assert(chan == nbDim)
def extract_maps(f):
#print('Extracting', f.name, flush = True)
with f as bytestream:
headerSize = _read32(bytestream)
magic = _read32(bytestream)
if magic != 7919:
raise ValueError('Invalid magic number %d in maps file: %s' %
(magic, f.name))
rows = _read32(bytestream)
#print("rows "+str(rows))
cols = _read32(bytestream)
#print("cols "+str(cols))
lays = _read32(bytestream)
#print("lays "+str(lays))
chan = _read32(bytestream)
#print("chan "+str(chan))
metaSize = _read32(bytestream)
#print("metaSize "+str(metaSize))
num_maps = _read32(bytestream)
#print("num_maps "+str(num_maps))
header_end = bytestream.read(headerSize - 4*8)
if num_maps<=0 :
return None,None
size = int(rows) * int(cols) * int(lays) * int(chan) * int(num_maps)
size += int(metaSize) * int(num_maps)
try :
buf = bytestream.read(size)
except OverflowError :
return None, None
data = numpy.frombuffer(buf, dtype=numpy.uint8)
data = data.reshape(num_maps, -1)
meta = numpy.ascontiguousarray(data[:, -int(metaSize):]).view(dtype=numpy.int32)
data = data[:,:-int(metaSize)]
return data , meta
class DataSet(object):
def __init__(self,
maps,
meta,
dtype=dtypes.float32,
seed=None,
prop = 1,
shuffle = False):
# prop is the fraction of the maps from the data that are put in the dataset, useful to make the dataset lighter;
# when doing that, shuffle is useful to take different residues each time
seed1, seed2 = random_seed.get_seed(seed)
numpy.random.seed(seed1 if seed is None else seed2)
dtype = dtypes.as_dtype(dtype).base_dtype
if dtype not in (dtypes.uint8, dtypes.float32, dtypes.float16):
raise TypeError('Invalid map dtype %r, expected uint8 or float32 or float16' %
dtype)
if dtype == dtypes.float32:
maps = maps.astype(numpy.float32)
numpy.multiply(maps, 1.0 / 255.0, out = maps)
if dtype == dtypes.float16:
maps = maps.astype(numpy.float16)
numpy.multiply(maps, 1.0 / 255.0, out = maps)
if shuffle:
perm0 = numpy.arange(maps.shape[0])[:int(maps.shape[0]*prop)]
self._maps = maps[perm0]
self._meta = meta[perm0]
else:
self._maps = maps
self._meta = meta
self._epochs_completed = 0
self._index_in_epoch = 0
self._num_res = self._maps.shape[0]
@property
def maps(self):
return self._maps
@property
def meta(self):
return self._meta
@property
def num_res(self):
return self._num_res
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size, shuffle=True, select_residue = -1):
"""Return the next `batch_size` examples from this data set."""
# Select residue is not used anymore, just kept for compatibility purposes
start = self._index_in_epoch
# Shuffle for the first epoch
if self._epochs_completed == 0 and start == 0 and shuffle:
perm0 = numpy.arange(self._num_res)
numpy.random.shuffle(perm0)
self._maps = self.maps[perm0]
self._meta = self._meta[perm0] # Go to the next epoch
if start + batch_size > self._num_res:
# Finished epoch
self._epochs_completed += 1
# Get the rest examples in this epoch
rest_num_examples = self._num_res - start
maps_rest_part = self._maps[start:self._num_res]
meta_rest_part = self._meta[start:self._num_res]
# Shuffle the data
if shuffle:
perm = numpy.arange(self._num_res)
numpy.random.shuffle(perm)
self._maps = self.maps[perm]
self._meta = self.meta[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size - rest_num_examples
end = self._index_in_epoch
maps_new_part = self._maps[start:end]
meta_new_part = self._meta[start:end]
return numpy.concatenate((maps_rest_part, maps_new_part), axis=0) , numpy.concatenate((meta_rest_part, meta_new_part), axis=0)
else:
self._index_in_epoch += batch_size
end = self._index_in_epoch
return self._maps[start:end], self._meta[start:end]
def append(self, dataSet_):
self._maps = numpy.concatenate((self._maps, dataSet_._maps))
self._meta = numpy.concatenate((self._meta, dataSet_._meta))
self._num_res += dataSet_._num_res
def is_res(self, index, res_code):
if index < self._num_res :
if self._meta[index, 1] == res_code:
return True
else:
print('index = num_res')
return False
def find_next_res(self, index, res_code):
i = index + 1
while (not self.is_res(i, res_code)) and i < self._num_res - 1:
i += 1
if self.is_res(i, res_code):
return i
return -1
def read_data_set(filename,
dtype=dtypes.float32,
seed=None,
shuffle = False,
prop = 1):
local_file = filename
try :
with open(local_file, 'rb') as f:
train_maps,train_meta = extract_maps(f)
if train_maps is None :
return None
train = DataSet(
train_maps, train_meta, dtype=dtype, seed=seed, shuffle = shuffle, prop = prop)
return train
except ValueError :
return None
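# Example usage (a sketch; the map file path is hypothetical and would be produced
# by the maps_generator binary referenced in the config):
#   dataset = read_data_set('/tmp/map_1234_pred.bin')
#   if dataset is not None:
#       maps_batch, meta_batch = dataset.next_batch(10)
#       # each row of maps_batch holds GRID_SIZE^3 * nb_types voxel values for one residue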
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import tensorflow as tf
NUM_RETYPE = 15
GRID_SIZE = 24
GRID_VOXELS = GRID_SIZE * GRID_SIZE * GRID_SIZE
NB_TYPE = 169
def _weight_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.truncated_normal_initializer(stddev=0.01))
def _bias_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.constant_initializer(0.1, dtype=tf.float32))
def scoringModel(num_retype, maps, isTraining, batch_norm = True, validation = 'softplus', final_activation = 'sigmoid'):
print("Create model start")
prev_layer = tf.reshape(maps,[-1,NB_TYPE, GRID_SIZE,GRID_SIZE,GRID_SIZE])
retyper = _weight_variable("retype"+"_"+str(num_retype), [NB_TYPE, num_retype])
with tf.name_scope('Retype'):
tf.summary.histogram("Weights_R", retyper)
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 3, 4, 1])
map_shape = tf.gather(tf.shape(prev_layer), [0,1,2,3]) # Extract the first four dimensions (batch and the three spatial dimensions)
map_shape = tf.concat([map_shape, [num_retype]], axis=0)
prev_layer = tf.reshape(prev_layer,[-1,NB_TYPE])
prev_layer = tf.matmul(prev_layer,retyper);
retyped = tf.reshape(prev_layer, map_shape)
CONV1_OUT = 20
kernelConv1 = _weight_variable("weights_C1"+"_"+str(num_retype), [3,3,3, num_retype, CONV1_OUT])
prev_layer = tf.nn.conv3d(retyped, kernelConv1, [1, 1, 1, 1, 1], padding='VALID')
biasConv1 = _bias_variable("biases_C1"+"_"+str(num_retype), [CONV1_OUT])
with tf.name_scope('Conv1'):
tf.summary.histogram("weights_C1", kernelConv1)
tf.summary.histogram("bias_C1", biasConv1)
prev_layer = prev_layer + biasConv1;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn1")
prev_layer = tf.nn.dropout(prev_layer, 1 - tf.cast(isTraining, dtype=tf.float32) * 0.5, name="dropout1")
if validation == 'softplus':
conv1 = tf.nn.softplus(prev_layer, name="softplus1")
else:
conv1 = tf.nn.elu(prev_layer, name="elu1")
CONV2_OUT = 30
kernelConv2 = _weight_variable("weights_C2"+"_"+str(num_retype), [4,4,4, CONV1_OUT, CONV2_OUT])
prev_layer = tf.nn.conv3d(conv1, kernelConv2, [1, 1, 1, 1, 1], padding='VALID')
biasConv2 = _bias_variable("biases_C2"+"_"+str(num_retype), [CONV2_OUT])
with tf.name_scope('Conv2'):
tf.summary.histogram("weights_C2", kernelConv2)
tf.summary.histogram("bias_C2", biasConv2)
prev_layer = prev_layer + biasConv2;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn2")
if validation == 'softplus':
prev_layer = tf.nn.softplus(prev_layer, name="softplus2")
else:
prev_layer = tf.nn.elu(prev_layer, name="elu2")
CONV3_OUT = 20
kernelConv3 = _weight_variable("weights_C3"+"_"+str(num_retype), [4,4,4, CONV2_OUT, CONV3_OUT])
prev_layer = tf.nn.conv3d(prev_layer, kernelConv3, [1, 1, 1, 1, 1], padding='VALID')
biasConv3 = _bias_variable("biases_C3"+"_"+str(num_retype), [CONV3_OUT])
with tf.name_scope('Conv3'):
tf.summary.histogram("weights_C3", kernelConv3)
tf.summary.histogram("bias_C3", biasConv3)
prev_layer = prev_layer + biasConv3;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn3")
if validation == 'softplus':
prev_layer = tf.nn.softplus(prev_layer, name="softplus3")
else:
prev_layer = tf.nn.elu(prev_layer, name="elu3")
POOL_SIZE = 4
prev_layer = tf.nn.avg_pool3d(
prev_layer,
[1,POOL_SIZE,POOL_SIZE,POOL_SIZE,1],
[1,POOL_SIZE,POOL_SIZE,POOL_SIZE,1],
padding='VALID')
NB_DIMOUT = 4*4*4*CONV3_OUT
flat0 = tf.reshape(prev_layer,[-1,NB_DIMOUT])
LINEAR1_OUT = 64
weightsLinear = _weight_variable("weights_L1"+"_"+str(num_retype), [NB_DIMOUT, LINEAR1_OUT])
prev_layer = tf.matmul(flat0, weightsLinear)
biasLinear1 = _bias_variable("biases_L1"+"_"+str(num_retype), [LINEAR1_OUT])
with tf.name_scope('Linear1'):
tf.summary.histogram("weights_L1", weightsLinear)
tf.summary.histogram("biases_L1", biasLinear1)
prev_layer = prev_layer + biasLinear1
if batch_norm:
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn4")
#prev_layer = tf.nn.l2_normalize(flat0,dim=1)
if validation == 'softplus':
flat1 = tf.nn.softplus(prev_layer, name="softplus3")
else:
flat1 = tf.nn.elu(prev_layer, name="elu1")
weightsLinear2 = _weight_variable("weights_L2"+"_"+str(num_retype), [LINEAR1_OUT,1])
with tf.name_scope('Linear2'):
tf.summary.histogram("weights_L2", weightsLinear2)
last = tf.matmul(flat1, weightsLinear2)
print("Create model end")
prev_layer = tf.squeeze(last)
if final_activation == 'tanh':
return tf.add(tf.tanh(prev_layer)*0.5, 0.5, name = "main_output"), flat1, last, weightsLinear2
else:
return tf.sigmoid(prev_layer, name = "main_output"), flat1, last, weightsLinear2
def loss(scores, cad_score):
#return tf.losses.mean_squared_error(scores,cad_score)
return tf.square(scores - cad_score)
def training(loss, learning_rate):
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate, decay = 0.999)
global_step = tf.Variable(0, name='global_step', trainable=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
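# Typical wiring of this graph (a sketch; the tensor names mirror those that score.py
# looks up with get_tensor_by_name):
#   maps_pl = tf.placeholder(tf.float32, [None, NB_TYPE * GRID_VOXELS], name='main_input')
#   is_training_pl = tf.placeholder(tf.bool, name='is_training')
#   cad_pl = tf.placeholder(tf.float32, [None])
#   scores, _, _, _ = scoringModel(NUM_RETYPE, maps_pl, is_training_pl)
#   train_op = training(tf.reduce_mean(loss(scores, cad_pl)), 1e-4)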
import config
import subprocess
import os
import numpy as np
import scipy.stats
import random
import tensorflow as tf
import load_data
import model
import argparse
CONF = config.load_config()
FLAGS = None
def predict(sess, maps_placeholder, is_training, logits, filenames, meta_pl = None,outSuffix = ".orn", mapOptions=[]):
print(mapOptions)
# mapping protein
mapFilename = CONF.TEMP_PATH + 'map_'+str(os.getpid())+ '_pred.bin'
subProcessList=[]
k = 0
nbTypes = int(maps_placeholder.shape[1].value/(24*24*24))
for filename in filenames :
print('# Scoring '+filename)
#subprocess.call([CONF.MAP_GENERATOR_PATH, "--mode", "map", "-i", filename, "--native", "-m", "24", "-v", "0.8", "-o", mapFilename])
subProcessList.append(subprocess.Popen([CONF.MAP_GENERATOR_PATH, "--mode", "map", "-i", filename, "--native", "-m", "24", "-v", "0.8", "-o", mapFilename+str(k)]+mapOptions))
k+=1
k = 0
for filename in filenames :
subProcessList[k].wait()
if not os.path.exists(mapFilename+str(k)):
print('# Mapping failed, ignoring protein')
k+=1
continue
predDataset = load_data.read_data_set(mapFilename+str(k))
if predDataset is None :
k+=1
continue
os.remove(mapFilename+str(k))
result_file = open(filename+outSuffix,"w")
preds = []
# compute prediction res by res
for i in range(predDataset.num_res):
f_map = np.reshape(predDataset.maps[i], (1, model.GRID_VOXELS * nbTypes))
if meta_pl is not None :
f_meta = np.reshape(predDataset.meta[i], (1, 16))
feed_dict = {maps_placeholder: f_map, meta_pl:f_meta, is_training: False}
else :
feed_dict = {maps_placeholder: f_map, is_training: False}
pred = sess.run(logits,feed_dict=feed_dict)
preds.append(pred)
outline='RES {:4d} {:c} {:5.4f}'.format(predDataset.meta[i][0], predDataset.meta[i][1], pred)
print(outline, file = result_file)
#print(predDataset.meta[i][0]+)
#print(pred)
result_file.close()
k+=1
def main():
print(FLAGS.options)
#exit()
sess = tf.Session()
print('Restore existing model: %s' % FLAGS.model)
saver = tf.train.import_meta_graph(FLAGS.model + '.meta')
saver.restore(sess, FLAGS.model)
graph = tf.get_default_graph()
# getting placeholder for input data and output
maps_placeholder = graph.get_tensor_by_name('main_input:0')
meta_placeholder = graph.get_tensor_by_name('meta_pl:0')
is_training = graph.get_tensor_by_name('is_training:0')
logits = graph.get_tensor_by_name("main_output:0")
if FLAGS.structure is not None :
predict(sess, maps_placeholder, is_training, logits, [FLAGS.structure], meta_pl = meta_placeholder, outSuffix = FLAGS.suffix, mapOptions=FLAGS.options.split()) # predict expects a list of filenames
if FLAGS.directory is not None :
bufferFiles = []
for filename in os.listdir(FLAGS.directory):
bufferFiles.append(FLAGS.directory+'/'+filename)
if len(bufferFiles) == FLAGS.buffer :
predict(sess, maps_placeholder, is_training, logits, bufferFiles, meta_pl = meta_placeholder, outSuffix = FLAGS.suffix, mapOptions=FLAGS.options.split())
bufferFiles = []
predict(sess, maps_placeholder, is_training, logits, bufferFiles, meta_pl = meta_placeholder, outSuffix = FLAGS.suffix, mapOptions=FLAGS.options.split())
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'-d',
'--directory',
type=str,
help='Path to the validation data'
)
parser.add_argument(
'-s',
'--structure',
type=str,
help='Path to the structure to score (in pdb format)'
)
parser.add_argument(
'-f',
'--suffix',
default=".orn",
type=str,
help='suffix to add to result file'
)
parser.add_argument(
'-b',
'--buffer',
default="8",
type=int,
help='number of files to buffer '
)
parser.add_argument(
'-o',
'--options',
default=["-m", "24", "-v", "0.8"],
type=str,
help='argument to map generator '
)
parser.add_argument(
'-m',
'--model',
default=CONF.MODEL_PATH,
type=str,
help='Path to the pre-trained model to restore'
)
FLAGS = parser.parse_args()
#print(FLAGS.options)
main()
import pickle
import model_v2_routed as m
arg = m.layer_params(['conv_20_3_VALID', 'conv_30_3_VALID', 'conv_40_3_VALID','conv_50_5_VALID','conv_60_5_VALID','conv_70_5_VALID','avgpool_2'])
with open('convdeep.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['linear_512','linear_384','linear_256','linear_128','linear_64','linear_32','linear_16','linear_1'])
with open('fcdeep.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['linear_512','linear_384','linear_256','linear_128','linear_64','linear_20'])
with open('fcdeep_router.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_3_VALID', 'conv_30_4_VALID', 'conv_20_4_VALID', 'avgpool_4'])
with open('/home/benoitch/save/conv_params/conv0.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_3_VALID', 'conv_40_4_VALID', 'conv_60_4_VALID', 'avgpool_4'])
with open('/home/benoitch/save/conv_params/conv1.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_3_VALID', 'conv_40_4_VALID', 'avgpool_2', 'conv_60_4_VALID', 'avgpool_2'])
with open('/home/benoitch/save/conv_params/conv2.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_5_VALID', 'conv_30_4_VALID', 'conv_30_4_VALID', 'conv_40_4_VALID', 'conv_40_4_VALID','avgpool_2'])
with open('/home/benoitch/save/conv_params/conv3.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_3_VALID', 'conv_30_4_VALID', 'conv_20_4_VALID','avgpool_4'])
with open('/home/benoitch/save/conv_params/conv4.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
arg = m.layer_params(['conv_20_3_VALID', 'conv_30_4_VALID', 'conv_20_4_VALID','avgpool_2'])
with open('/home/benoitch/save/conv_params/conv4.pkl', 'wb') as output:
pickle.dump(arg, output, pickle.HIGHEST_PROTOCOL)
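# The .pkl files written above are what train_model_v2.py consumes through its -c flag
# (see the commands at the top of this commit). A minimal sketch of loading one back:
#   with open('/home/benoitch/save/conv_params/conv0.pkl', 'rb') as f:
#       conv_arg = pickle.load(f)   # restores the layer_params object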
from numpy.core.multiarray import ndarray
import config
import subprocess
import os
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import tensorflow as tf
import load_data
import model
import stat_casp
import argparse
CONF = config.load_config()
FLAGS = None
class model_prot():
"""
Class that gathers information about a protein model and is used to
score it in different ways
"""
def __init__(self, path_to_model, path_to_target=None):
"""
:param path_to_model: Path to the generated protein file
:param path_to_target: path to the protein's target's file
"""
self._mod = path_to_model
self._targ = path_to_target
self._gdt = None
self._gt_score = None
self._pred = None
self._gt_score_out = None
def get_gdt(self):
"""
Computes and returns the gdt score of a protein, using a subprocess.
:return: gdt scores
"""
if self._targ is None:
print('ERROR: add a target to the model using .add_target(path_to_target)\n--> Return None')
return None
# Compute the GDT_TS score of the model from the target
if self._gdt is None:
with open(CONF.TEMP_OUT_GDT, 'w') as f:
subprocess.call([CONF.TMSCORE_EXE_PATH, self._mod, self._targ], stdout=f)
with open(CONF.TEMP_OUT_GDT, 'r') as f:
lines = f.readlines()
for l in lines:
_l = l.split(' ')
if _l[0] == 'GDT-TS-score=':
self._gdt = float(_l[1])
return self._gdt
def get_prediction(self, sess, graph):
"""
Computes and returns the prediction made by a previously loaded TensorFlow model
:param sess: Tensorflow Session
:param graph: Model graph
:return: predicted score
"""
# mapping protein
maps_placeholder = graph.get_tensor_by_name('main_input:0')
nbTypes = int(maps_placeholder.shape[1].value/(24*24*24))
subprocess.call(
[CONF.ROTAMERS_EXE_PATH, "--mode", "map", "-i", self._mod, "--native", "-m", "24", "-v", "0.8","-t",str(int(nbTypes)), "-o",
CONF.TEMP_PATH + '_pred.bin'])
if not os.path.exists(CONF.TEMP_PATH + '_pred.bin'):
print('# Mapping failed, ignoring protein')
return -1
predDataset = load_data.read_data_set(CONF.TEMP_PATH + '_pred.bin')
os.remove(CONF.TEMP_PATH + '_pred.bin')
# creating placeholder for input data and feed dict
is_training = graph.get_tensor_by_name('is_training:0')
logits = graph.get_tensor_by_name("main_output:0")
preds = []
# compute prediction res by res
for i in range(predDataset.num_res):
f_map = np.reshape(predDataset.maps[i], (1, model.GRID_VOXELS * nbTypes))
training = False
feed_dict = {maps_placeholder: f_map, is_training: training}
pred = sess.run(logits,feed_dict=feed_dict)
preds.append(pred)
# returns the mean value
return np.mean(preds)
def get_gt_out_score(self):
"""
Compute, if it is not already computed,
the ground-truth score using data in the .out file (overall score).
:return: ground truth score
"""
if self._gt_score is None:
if os.path.exists(self._mod + '.out'):
with open(self._mod + '.out') as f:
line = f.readline()
if len(line) < 4:
return -1
self._gt_score = float(line.split(' ')[4])
if self._gt_score > 1:
print('Value error for file out')
else:
print('ERROR : .out file doesn\'t exist, cannot tell gt score - return None')
return -1
return self._gt_score
def get_gt_score(self):
"""
Compute, if it is not already computed,
the ground-truth score using data in the .sco file (res-by-res score).
:return: ground truth score
"""
if self._gt_score_out is None:
if os.path.exists(self._mod + '.sco'):
self._gt_score_out = np.mean(stat_casp.stat_file(self._mod + '.sco'))
else:
print('ERROR : '+self._mod +'.sco file doesn\'t exist, cannot tell gt score - return None')
return None
return self._gt_score_out
def add_target(self, path_to_target):
self._targ = path_to_target
def compute_gdt_dir(path_to_models_directory, save=False):
"""
Compute the GDT score for all proteins in a specified directory. If the scores were already computed and saved,
restore the previous computation.
:param path_to_models_directory: Path to the directory containing all the protein models for which we want
to compute the GDT score. There must be one directory per protein containing all the
different models of this protein. These files must be under
path_to_models_directory + '/MODELS/'
:param save: True to save the scores. They will be saved in path_to_models_directory + '/EVAL/'. It creates one file
per protein containing the scores of all its models.
:return: dictionary of the scores keyed by the name of the server
"""
save_path = path_to_models_directory[:-6] + path_to_models_directory[-6:-1] + '_gdt_ts.sco'
save_path = save_path.replace('MODELS', 'EVAL')
dict_sco = {}
if os.path.exists(save_path):
print('Restoring previous calculation from ' + save_path)
with open(save_path, 'r') as f:
lines = f.readlines()
for l in lines:
l = l.split(' ')
# verify that the score is not an outlier
if 0 <= float(l[1][:-1]) <= 1 :
dict_sco[l[0]] = float(l[1][:-1])
else:
ls_file = sorted(os.listdir(path_to_models_directory))
for f in ls_file:
m = model_prot(path_to_models_directory + f)
dict_sco[f] = m.get_gdt()
if save:
if not os.path.exists(save_path):
if not os.path.exists(path_to_models_directory + '/EVAL/'):
os.makedirs(path_to_models_directory + '/EVAL/')
with open(save_path, 'a') as f:
for f_ in ls_file:
f.write(f_ + ' ' + str(dict_sco[f_]) + '\n')
return dict_sco
def compute_pred_dir(path_to_models_directory, path_to_nn = None, save=False, type_save='', model='deepscoring'):
"""
Computes predictions (from a specified model) for all the proteins in a directory
:param path_to_models_directory: Path to the directory containing all the protein models for which we want
to compute predictions. There must be one directory per protein containing all the
different models of this protein. These files must be under
path_to_models_directory + '/MODELS/'
:param path_to_nn: If testing our TensorFlow model, path to the directory where the model is saved
:param save: True to save the scores. They will be saved in path_to_nn + '/Eval/'. It creates one file
per protein, containing the scores of all its models.
:param type_save: This string will be added at the end of the names of all the score files created, to identify them
:param model: Name of the model from which to take the prediction (our own is 'deepscoring')
:return: dictionary of the scores keyed by the name of the server
"""
# If the model is not our own, it will look for files containing the scores of this model.
# The files must be named and saved following this convention :
# There must be one file per protein containing the scores of all the models of this protein.
# The files' names are NameOfProteinFolder + '.' + model + '_scores', e.g. T0759.3Dcnn_scores
# These files are located in path_to_models_directory
score_dict = {}
if not model == 'deepscoring':
path = path_to_models_directory[:-6] + '/' + path_to_models_directory[-6:-1] + '.' + model + '_scores'
with open(path,'r') as f:
lines = f.readlines()
for l in lines:
# Since each model saves its scores following a different convention, we parse the score file differently
# according to the model name
if model == 'sbrod':
name = l.split(' ')[0].split('/')[-1]
try:
sco = float(l.split('\t')[1])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
elif model == 'rw':
name = l.split(' ')[0].split('/')[-1]
try:
sco = -1*float(l[77:91])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
elif model == 'voro':
name = l.split(' ')[0].split('/')[-1]
try:
sco = float(l.split(' ')[3])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
elif model == '3Dcnn':
name = l.split(' ')[0]
try:
sco = float(l.split(' ')[-1])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
elif model == 'my3d':
name = l.split(' ')[0]
try:
sco = float(l.split(' ')[-1])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
elif model == 'ProQ3D':
name = l.split(' ')[0]
try:
sco = float(l.split(' ')[-1])
score_dict[name] = sco
except ValueError:
print('Value Error, ignoring')
score_dict[name] = None
return score_dict
# Removes the '_cad' suffix if present
save_name = type_save.replace('_cad','')
save_path = path_to_nn + '/Eval/' + save_name + '_' + path_to_models_directory[-6:-1] + '_pred.sco'
print(save_path)
# If we're evaluating one of our own models, first check whether the calculation was already made for this protein.
# Beware of respecting the naming convention: '_' + CASP number + 'stage_' + stage number (+ '_cad' if using the CAD measure)
if os.path.exists(save_path):
print('Restoring previous calculation from ' + save_path)
with open(save_path, 'r') as fi:
lines = fi.readlines()
for l in lines:
l = l.split(' ')
score_dict[l[0]] = float(l[1][:-1])
else:
# first needs to restore the model
sess = tf.Session()
print('Restore existing model: %s' % path_to_nn)
print('Latest checkpoint: %s' % (tf.train.latest_checkpoint(path_to_nn)))
saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(path_to_nn) + '.meta')
saver.restore(sess, tf.train.latest_checkpoint(path_to_nn))
graph = tf.get_default_graph()
ls_file = sorted(os.listdir(path_to_models_directory))
tot = len(ls_file)//2
i__ = 0
for f in ls_file:
if f[-4:] == '.sco' or f[-4:] == '.out':
continue
i__ += 1
m = model_prot(path_to_models_directory + f)
score = m.get_prediction(sess, graph)
print(str(i__) + '/' + str(tot) + ' : ' + str(score))
score_dict[f] = score
if save:
if not os.path.exists(save_path):
print('saving predictions into : ' + save_path)
with open(save_path, 'a') as fi:
i = 0
for f in ls_file:
if f[-4:] == '.sco' or f[-4:] == '.out':
continue
fi.write(f + ' ' + str(score_dict[f]) + '\n')
i+=1
return score_dict
def compute_gt(path_to_models_directory):
"""
Compute the ground truth score for all the models of a protein using their .sco file
:param path_to_models_directory: Path to the protein directory where all the models are located
:return: dictionary of the scores keyed by the name of the server
"""
ls_file = sorted(os.listdir(path_to_models_directory))
dict_score = {}
for f in ls_file:
if f[-4:] == '.sco' or f[-4:] == '.out':
continue
m = model_prot(path_to_models_directory + '/' +f)
dict_score[f] = m.get_gt_score()
return dict_score
def compute_gt_out(path_to_models_directory):
"""
Compute the ground truth score for all the models of a protein using their .out file
:param path_to_models_directory: Path to the protein directory where all the models are located
:return: dictionary of the scores keyed by the name of the server
"""
ls_file = sorted(os.listdir(path_to_models_directory))
dict_score = {}
for f in ls_file:
if f[-4:] == '.sco' or f[-4:] == '.out':
continue
m = model_prot(path_to_models_directory + '/' +f)
dict_score[f] = m.get_gt_out_score()
return dict_score
def compute_loss_eval(path_to_eval_directory, path_to_nn, name, measure='gdt', model='deepscoring', avoid_saving=''):
"""
Evaluate one model (either our model, or any other model, provided we have its predictions saved).
:param path_to_eval_directory: Path to the directory where the dataset used to evaluate the model is stored,
e.g. CASP11_stage1/
This directory must contain one directory named EVAL/ to store the computations, one
named MODELS/ containing one directory per protein with the files of its models,
and one named TARGETS/ containing the target files
:param path_to_nn: Path to the directory where the model is saved
:param name: name of the evaluation. This determines the naming of the files where the computations are stored.
The convention used is: '_' + CASP number + 'stage_' + stage number (+ '_' + name of the tested model if not our own) (+ '_cad' if using the CAD measure)
e.g. _11stage_2_voro_cad, _12stage_1
:param measure: The reference measure (either gdt or cad score)
:param model: Name of the model if not our own, e.g. voro, sbrod, 3Dcnn...
:param avoid_saving: 'avoid' if you don't want to save the final results into a file
:return: mean loss; also creates a file path_to_nn + 'Eval/results_eval' + name containing the evaluation
"""
ls_targets = sorted(os.listdir(path_to_eval_directory + 'TARGETS/'))
losses = []
kens = []
pers = []
spes = []
pred_all = []
m_all = []
for target in ls_targets:
target = target[:5]
if not os.path.exists(path_to_eval_directory + 'MODELS/' + target + '/'):
print('Skipping because of missing data')
continue
if not model == 'deepscoring':
if not os.path.exists(path_to_eval_directory + 'MODELS/' + target + '.' + model + '_scores'):
print('Skipping because of missing data')
continue
print('--> For target : ' + target)
if measure == 'cad':
measure_dict = compute_gt(path_to_eval_directory + 'MODELS/' + target + '/')
elif measure == 'out':
measure_dict = compute_gt_out(path_to_eval_directory + 'MODELS/' + target + '/')
else:
measure_dict = compute_gdt_dir(path_to_eval_directory + 'MODELS/' + target + '/', save=True)
pred_dict = compute_pred_dir(path_to_eval_directory + 'MODELS/' + target + '/', path_to_nn, save=True, type_save='loss_eval' + name, model=model)
list_of_servers_ = sorted(os.listdir(path_to_eval_directory + 'MODELS/' + target + '/'))
list_of_servers = []
for ser in list_of_servers_:
if ser[-3:] == 'out' or ser[-3:] == 'sco':
continue
list_of_servers.append(ser)
if len(pred_dict) == 0:
print('No prediction made for this protein')
continue
ls_pred_scores, ls_m_scores = [], []
print(list_of_servers)
for server in list_of_servers:
pred = pred_dict.get(server)
m = measure_dict.get(server)
if (not pred is None) and (not m is None):
ls_pred_scores.append(pred)
ls_m_scores.append(m)
if sum(ls_m_scores) == 0:
print('> corrupted ground truth file : Ignoring model')
continue
rank_pred = scipy.stats.rankdata(ls_pred_scores, method='dense')
i_max_pred = np.argmax(np.array(ls_pred_scores))
if model == '3Dcnn' or model == 'my3d':
#ls_pred_scores = [-s for s in ls_pred_scores]
i_max_pred = np.argmin(np.array(ls_pred_scores))
#i_max_pred = np.argmax(np.array(ls_pred_scores))
rank_m = scipy.stats.rankdata(ls_m_scores, method='dense')
i_max_m = np.argmax(np.array(ls_m_scores))
for i_ in range(len(ls_pred_scores)):
pred_all.append(ls_pred_scores[i_])
m_all.append(ls_m_scores[i_])
loss = np.abs(ls_m_scores[i_max_m] - ls_m_scores[i_max_pred])
print('--> ' + target + ' loss : ', loss)
per = scipy.stats.pearsonr(ls_pred_scores, ls_m_scores)[0]
print('--> ' + target + ' pearson : ', per)
spe = scipy.stats.spearmanr(ls_pred_scores, ls_m_scores)[0]
print('--> ' + target + ' spearman : ', spe)
ken = scipy.stats.kendalltau(rank_pred, rank_m)[0]
print('--> ' + target + ' kendall : ', ken)
losses.append(loss)
kens.append(ken)
pers.append(per)
spes.append(spe)
print('---> Current mean loss : ', np.mean(losses))
print('---> Current std loss : ', np.std(losses))
print('---> Current mean pearson : ', np.mean(pers))
print('---> Current mean spearman : ', np.mean(spes))
print('---> Current mean kendall : ', np.mean(kens))
print('----> Final mean loss : ', np.mean(losses))
print('----> Final confidence interval loss : ', np.std(losses)/np.sqrt(len(losses)))
print('----> Final mean pearson : ', np.mean(pers))
print('----> Final mean spearman : ', np.mean(spes))
print('----> Final mean kendall : ', np.mean(kens))
print('\n\n')
print('Over all data :\n')
print('Pearson : ', scipy.stats.pearsonr(pred_all, m_all)[0])
print('Spearman : ', scipy.stats.spearmanr(pred_all, m_all)[0])
rank_m = scipy.stats.rankdata(m_all, method='dense')
rank_pred = scipy.stats.rankdata(pred_all, method='dense')
print('Kendall : ', scipy.stats.kendalltau(rank_pred, rank_m)[0])
save_path = path_to_nn + 'Eval/' + 'results_eval' + name
if avoid_saving == 'avoid':
return -1
if not os.path.exists(save_path):
print('Saving results into : ' + save_path)
fi = open(save_path, 'a')
fi.write('======= RESULTS =======\n')
fi.write('> Final loss : ' + np.str(np.mean(losses)) + '\n')
fi.write('> Final pearson : ' + np.str(np.mean(pers)) + '\n')
fi.write('> Final spearman : ' + np.str(np.mean(spes)) + '\n')
fi.write('> Final kendall : ' + np.str(np.mean(kens)) + '\n')
fi.write('target - loss - pearson - spearman - kendall\n')
for i, target in enumerate(ls_targets):
target = target[:5]
fi.write(target + ',' + np.str(losses[i]) + ',' + np.str(pers[i]) + ',' + np.str(spes[i]) + ',' + np.str(kens[i]) + '\n')
fi.close()
return np.mean(losses)
def main():
# Name of the data directory must be CASPX_stageY
save_name = '_' + FLAGS.data.split('/')[-2].split('_')[0][4:] + 'stage_' + FLAGS.data.split('/')[-2].split('_')[1][5:]
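# e.g. FLAGS.data = '~/data/CASP12_stage1/' yields save_name = '_12stage_1',
# matching the -n values used in the evaluation commands at the top of this commit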
if not FLAGS.type_model == 'deepscoring':
save_name = save_name + '_' + FLAGS.type_model
if not FLAGS.score == 'gdt':
save_name = save_name + '_' + FLAGS.score
print(save_name)
compute_loss_eval(FLAGS.data, FLAGS.model, name =save_name, measure=FLAGS.score,model=FLAGS.type_model,avoid_saving=FLAGS.save)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'-s',
'--score',
type=str,
default='gdt',
help='The reference measure (gdt, cad, or out)'
)
parser.add_argument(
'-d',
'--data',
type=str,
help='Path to the validation data'
)
parser.add_argument(
'-m',
'--model',
type=str,
help='Path to the model that is tested'
)
parser.add_argument(
'-n',
'--name',
type=str,
default= '_1Xstage_X',
help='Name of the test'
)
parser.add_argument(
'--save',
type=str,
default= '',
help="pass 'avoid' to skip saving the final results"
)
parser.add_argument(
'-t',
'--type_model',
type=str,
default= 'deepscoring',
help='If testing a different kind of model'
)
FLAGS = parser.parse_args()
main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import numpy
from six.moves import xrange
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import random_seed
import tensorflow as tf
def _read32(bytestream):
dt = numpy.dtype(numpy.uint32)
return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]
def check_dims(f, gridSize, nbDim):
print('Check dimensions ', f.name, flush = True)
with f as bytestream:
headerSize = _read32(bytestream)
magic = _read32(bytestream)
if magic != 7919:
raise ValueError('Invalid magic number %d in maps file: %s' %
(magic, f.name))
rows = _read32(bytestream)
cols = _read32(bytestream)
lays = _read32(bytestream)
assert(rows == gridSize)
assert(cols == gridSize)
assert(lays == gridSize)
chan = _read32(bytestream)
assert(chan == nbDim)
def extract_maps(f):
print('Extracting', f.name, flush = True)
with f as bytestream:
headerSize = _read32(bytestream)
magic = _read32(bytestream)
if magic != 7919:
raise ValueError('Invalid magic number %d in maps file: %s' %
(magic, f.name))
rows = _read32(bytestream)
#print("rows "+str(rows))
cols = _read32(bytestream)
#print("cols "+str(cols))
lays = _read32(bytestream)
#print("lays "+str(lays))
chan = _read32(bytestream)
#print("chan "+str(chan))
metaSize = _read32(bytestream)
#print("metaSize "+str(metaSize))
num_maps = _read32(bytestream)
#print("num_maps "+str(num_maps))
header_end = bytestream.read(headerSize - 4*8)
if num_maps<=0 :
return None,None
size = int(rows) * int(cols) * int(lays) * int(chan) * int(num_maps)
size += int(metaSize) * int(num_maps)
try :
buf = bytestream.read(size)
except OverflowError :
return None, None
data = numpy.frombuffer(buf, dtype=numpy.uint8)
data = data.reshape(num_maps, -1)
meta = numpy.ascontiguousarray(data[:, -int(metaSize):]).view(dtype=numpy.int32)
ss_dict = {0: -1,
66:0,#B
98:0,#b
67:1,#C
69:2,#E
71:3,#G
72:4,#H
73:5,#I
84:6,#T
}
meta[:,3] = [ss_dict[x] for x in meta[:,3]]
res_dict = {0:-1,
65:0, #A
67:1, #C
68:2, #D
69:3, #E
70:4, #F
71:5, #G
72:6, #H
73:7, #I
75:8, #K
76:9, #L
77:10,#M
78:11,#N
80:12,#P
81:13,#Q
82:14,#R
83:15,#S
84:16,#T
86:17,#V
87:18,#W
89:19 #Y
}
meta[:,1] = [res_dict[x] for x in meta[:,1]]
#print(meta[:,3])
#print(meta[:,2])
data = data[:,:-int(metaSize)]
return data , meta
class DataSet(object):
def __init__(self,
maps,
meta,
dtype=dtypes.float32,
seed=None,
prop = 1,
shuffle = False):
# prop is the fraction of the maps from the data that are put in the dataset, useful to make the dataset lighter;
# when doing that, shuffle is useful to take different residues each time
seed1, seed2 = random_seed.get_seed(seed)
numpy.random.seed(seed1 if seed is None else seed2)
dtype = dtypes.as_dtype(dtype).base_dtype
if dtype not in (dtypes.uint8, dtypes.float32, dtypes.float16):
raise TypeError('Invalid map dtype %r, expected uint8 or float32 or float16' %
dtype)
if dtype == dtypes.float32:
maps = maps.astype(numpy.float32)
numpy.multiply(maps, 1.0 / 255.0, out = maps)
if dtype == dtypes.float16:
maps = maps.astype(numpy.float16)
numpy.multiply(maps, 1.0 / 255.0, out = maps)
if shuffle:
perm0 = numpy.arange(maps.shape[0])[:int(maps.shape[0]*prop)]
self._maps = maps[perm0]
self._meta = meta[perm0]
else:
self._maps = maps
self._meta = meta
self._epochs_completed = 0
self._index_in_epoch = 0
self._num_res = self._maps.shape[0]
@property
def maps(self):
return self._maps
@property
def meta(self):
return self._meta
@property
def num_res(self):
return self._num_res
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size, shuffle=True, select_residue = -1):
"""Return the next `batch_size` examples from this data set."""
# Select residue is not used anymore, just kept for compatibility purposes
start = self._index_in_epoch
# Shuffle for the first epoch
if self._epochs_completed == 0 and start == 0 and shuffle:
perm0 = numpy.arange(self._num_res)
numpy.random.shuffle(perm0)
self._maps = self.maps[perm0]
self._meta = self._meta[perm0] # Go to the next epoch
if start + batch_size > self._num_res:
# Finished epoch
self._epochs_completed += 1
# Get the rest examples in this epoch
rest_num_examples = self._num_res - start
maps_rest_part = self._maps[start:self._num_res]
meta_rest_part = self._meta[start:self._num_res]
# Shuffle the data
if shuffle:
perm = numpy.arange(self._num_res)
numpy.random.shuffle(perm)
self._maps = self.maps[perm]
self._meta = self.meta[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size - rest_num_examples
end = self._index_in_epoch
maps_new_part = self._maps[start:end]
meta_new_part = self._meta[start:end]
return numpy.concatenate((maps_rest_part, maps_new_part), axis=0) , numpy.concatenate((meta_rest_part, meta_new_part), axis=0)
else:
self._index_in_epoch += batch_size
end = self._index_in_epoch
return self._maps[start:end], self._meta[start:end]
def append(self, dataSet_):
self._maps = numpy.concatenate((self._maps, dataSet_._maps))
self._meta = numpy.concatenate((self._meta, dataSet_._meta))
self._num_res += dataSet_._num_res
def is_res(self, index, res_code):
if index < self._num_res :
if self._meta[index, 1] == res_code:
return True
else:
print('index = num_res')
return False
def find_next_res(self, index, res_code):
i = index + 1
while (not self.is_res(i, res_code)) and i < self._num_res - 1:
i += 1
if self.is_res(i, res_code):
return i
return -1
def read_data_set(filename,
dtype=dtypes.float32,
seed=None,
shuffle = False,
prop = 1):
local_file = filename
try :
with open(local_file, 'rb') as f:
train_maps,train_meta = extract_maps(f)
if train_maps is None :
return None
train = DataSet(
train_maps, train_meta, dtype=dtype, seed=seed, shuffle = shuffle, prop = prop)
return train
except ValueError :
return None
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import tensorflow as tf
NUM_RETYPE = 15
GRID_SIZE = 24
GRID_VOXELS = GRID_SIZE * GRID_SIZE * GRID_SIZE
NB_TYPE = 169
def _weight_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.truncated_normal_initializer(stddev=0.01))
def _bias_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.constant_initializer(0.1, dtype=tf.float32))
def scoringModel(num_retype, maps, isTraining, batch_norm = True, validation = 'softplus', final_activation = 'sigmoid'):
print("Create model start")
prev_layer = tf.reshape(maps,[-1,NB_TYPE, GRID_SIZE,GRID_SIZE,GRID_SIZE])
retyper = _weight_variable("retype"+"_"+str(num_retype), [NB_TYPE, num_retype])
with tf.name_scope('Retype'):
tf.summary.histogram("Weights_R", retyper)
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 3, 4, 1])
map_shape = tf.gather(tf.shape(prev_layer), [0,1,2,3]) # Extract the first four dimensions (batch and the three spatial dimensions)
map_shape = tf.concat([map_shape, [num_retype]], axis=0)
prev_layer = tf.reshape(prev_layer,[-1,NB_TYPE])
prev_layer = tf.matmul(prev_layer,retyper);
retyped = tf.reshape(prev_layer, map_shape)
CONV1_OUT = 20
kernelConv1 = _weight_variable("weights_C1"+"_"+str(num_retype), [3,3,3, num_retype, CONV1_OUT])
prev_layer = tf.nn.conv3d(retyped, kernelConv1, [1, 1, 1, 1, 1], padding='VALID')
biasConv1 = _bias_variable("biases_C1"+"_"+str(num_retype), [CONV1_OUT])
with tf.name_scope('Conv1'):
tf.summary.histogram("weights_C1", kernelConv1)
tf.summary.histogram("bias_C1", biasConv1)
prev_layer = prev_layer + biasConv1;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn1")
prev_layer = tf.nn.dropout(prev_layer, 1 - tf.cast(isTraining, dtype=tf.float32) * 0.5, name="dropout1")
if validation == 'softplus':
conv1 = tf.nn.softplus(prev_layer, name="softplus1")
else:
conv1 = tf.nn.elu(prev_layer, name="elu1")
CONV2_OUT = 30
kernelConv2 = _weight_variable("weights_C2"+"_"+str(num_retype), [4,4,4, CONV1_OUT, CONV2_OUT])
prev_layer = tf.nn.conv3d(conv1, kernelConv2, [1, 1, 1, 1, 1], padding='VALID')
biasConv2 = _bias_variable("biases_C2"+"_"+str(num_retype), [CONV2_OUT])
with tf.name_scope('Conv2'):
tf.summary.histogram("weights_C2", kernelConv2)
tf.summary.histogram("bias_C2", biasConv2)
prev_layer = prev_layer + biasConv2;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn2")
if validation == 'softplus':
prev_layer = tf.nn.softplus(prev_layer, name="softplus2")
else:
prev_layer = tf.nn.elu(prev_layer, name="elu2")
CONV3_OUT = 20
kernelConv3 = _weight_variable("weights_C3"+"_"+str(num_retype), [4,4,4, CONV2_OUT, CONV3_OUT])
prev_layer = tf.nn.conv3d(prev_layer, kernelConv3, [1, 1, 1, 1, 1], padding='VALID')
biasConv3 = _bias_variable("biases_C3"+"_"+str(num_retype), [CONV3_OUT])
with tf.name_scope('Conv3'):
tf.summary.histogram("weights_C3", kernelConv3)
tf.summary.histogram("bias_C3", biasConv3)
prev_layer = prev_layer + biasConv3;
if batch_norm :
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn3")
if validation == 'softplus':
prev_layer = tf.nn.softplus(prev_layer, name="softplus3")
else:
prev_layer = tf.nn.elu(prev_layer, name="elu3")
POOL_SIZE = 4
prev_layer = tf.nn.avg_pool3d(
prev_layer,
[1,POOL_SIZE,POOL_SIZE,POOL_SIZE,1],
[1,POOL_SIZE,POOL_SIZE,POOL_SIZE,1],
padding='VALID')
NB_DIMOUT = 4*4*4*CONV3_OUT
flat0 = tf.reshape(prev_layer,[-1,NB_DIMOUT])
LINEAR1_OUT = 64
weightsLinear = _weight_variable("weights_L1"+"_"+str(num_retype), [NB_DIMOUT, LINEAR1_OUT])
prev_layer = tf.matmul(flat0, weightsLinear)
biasLinear1 = _bias_variable("biases_L1"+"_"+str(num_retype), [LINEAR1_OUT])
with tf.name_scope('Linear1'):
tf.summary.histogram("weights_L1", weightsLinear)
tf.summary.histogram("biases_L1", biasLinear1)
prev_layer = prev_layer + biasLinear1
if batch_norm:
prev_layer = tf.layers.batch_normalization(prev_layer, training = isTraining, name = "batchn4")
#prev_layer = tf.nn.l2_normalize(flat0,dim=1)
if validation == 'softplus':
flat1 = tf.nn.softplus(prev_layer, name="softplus3")
else:
flat1 = tf.nn.elu(prev_layer, name="elu1")
weightsLinear2 = _weight_variable("weights_L2"+"_"+str(num_retype), [LINEAR1_OUT,1])
with tf.name_scope('Linear2'):
tf.summary.histogram("weights_L2", weightsLinear2)
last = tf.matmul(flat1, weightsLinear2)
print("Create model end")
prev_layer = tf.squeeze(last)
if final_activation == 'tanh':
return tf.add(tf.tanh(prev_layer)*0.5, 0.5, name = "main_output"), flat1, last, weightsLinear2
else:
return tf.sigmoid(prev_layer, name = "main_output"), flat1, last, weightsLinear2
def loss(scores, cad_score):
#return tf.losses.mean_squared_error(scores,cad_score)
return tf.square(scores - cad_score)
def training(loss, learning_rate):
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate, decay = 0.999)
global_step = tf.Variable(0, name='global_step', trainable=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# NUM_RETYPE = 15
# GRID_SIZE = 24
# GRID_VOXELS = GRID_SIZE * GRID_SIZE * GRID_SIZE
# NB_TYPE = 169
def _weight_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.truncated_normal_initializer(stddev=0.01))
def _bias_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.constant_initializer(0.1, dtype=tf.float32))
class ScoringModel:
def __init__(self,
num_retype=15,
GRID_SIZE=24,
NB_TYPE=169,
batch_norm=True,
validation='softplus',
final_activation='sigmoid'
):
self.num_retype = num_retype
self.GRID_SIZE = GRID_SIZE
self.GRID_VOXELS = self.GRID_SIZE * self.GRID_SIZE * self.GRID_SIZE
self.NB_TYPE = NB_TYPE
self.batch_norm = batch_norm
self.validation = validation
self.final_activation = final_activation
def get_pred(self,
maps,
isTraining):
print('Creating model...')
input_data = tf.reshape(maps, [-1, self.NB_TYPE, self.GRID_SIZE, self.GRID_SIZE, self.GRID_SIZE])
# First step reducing data dimensionality
retyped = self.retype_layer(input_data, self.num_retype, self.NB_TYPE, name='retype')
# First convolution
CONV1_OUT = 20
out_conv_1 = self.conv_layer(retyped, [3, 3, 3, self.num_retype, CONV1_OUT], name='CONV_1')
# batch norm and activation
out_conv_1 = self.activation_normalization_layer(out_conv_1, self.batch_norm, self.validation, isTraining, name = 'act_norm_1')
# Second convolution
CONV2_OUT = 30
out_conv_2 = self.conv_layer(out_conv_1, [4, 4, 4, CONV1_OUT, CONV2_OUT], name='CONV_2')
# Batch norm and activation
out_conv_2 = self.activation_normalization_layer(out_conv_2, self.batch_norm, self.validation, isTraining, name = 'act_norm_2')
# Third convolution
CONV3_OUT = 20
out_conv_3 = self.conv_layer(out_conv_2, [4, 4, 4, CONV2_OUT, CONV3_OUT], name='CONV_3')
out_conv_3 = self.activation_normalization_layer(out_conv_3, self.batch_norm, self.validation, isTraining, name = 'act_norm_3')
# pooling and flattening
POOL_SIZE = 4
prev_layer = tf.nn.avg_pool3d(
out_conv_3,
[1, POOL_SIZE, POOL_SIZE, POOL_SIZE, 1],
[1, POOL_SIZE, POOL_SIZE, POOL_SIZE, 1],
padding='VALID')
NB_DIMOUT = 4 * 4 * 4 * CONV3_OUT
flat0 = tf.reshape(prev_layer, [-1, NB_DIMOUT])
# Fully connected layer 1
LINEAR1_OUT = 64
out_l1 = self.fc_layer(flat0, NB_DIMOUT, LINEAR1_OUT, bias=True, name='linear_1', num_retype=self.num_retype)
out_l1 = self.activation_normalization_layer(out_l1, self.batch_norm, self.validation, isTraining, name = 'act_norm_4')
out = self.fc_layer(out_l1, LINEAR1_OUT, 1, False, 'Linear_2', self.num_retype)
out = tf.squeeze(out)
if self.final_activation == 'tanh':
return tf.add(tf.tanh(out) * 0.5, 0.5, name="main_output")
else:
return tf.sigmoid(out, name="main_output")
def retype_layer(self, prev_layer, num_retype_, input_, name='retype'):
retyper = _weight_variable(name + "_" + str(num_retype_), [input_, num_retype_])
with tf.name_scope(name):
tf.summary.histogram(name, retyper)
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 3, 4, 1])
map_shape = tf.gather(tf.shape(prev_layer), [0, 1, 2, 3]) # Extract the first four dimensions (batch and the three spatial dimensions)
map_shape = tf.concat([map_shape, [self.num_retype]], axis=0)
prev_layer = tf.reshape(prev_layer, [-1, self.NB_TYPE])
prev_layer = tf.matmul(prev_layer, retyper)
return tf.reshape(prev_layer, map_shape)
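# Shape flow of retype_layer for the default sizes (NB_TYPE=169, GRID_SIZE=24, num_retype=15):
#   maps       [B, 169, 24, 24, 24]
#   transpose  [B, 24, 24, 24, 169]
#   reshape    [B*24*24*24, 169]
#   matmul     [B*24*24*24, 15]      (retyper has shape [169, 15])
#   reshape    [B, 24, 24, 24, 15]
# i.e. the 169 input channels are linearly projected down to num_retype channels per voxel.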
def conv_layer(self, prev_layer, kernel_size, name='CONV'):
kernelConv = _weight_variable("weights_" + name + "_" + str(self.num_retype), kernel_size)
prev_layer = tf.nn.conv3d(prev_layer, kernelConv, [1, 1, 1, 1, 1], padding='VALID', name = name)
biasConv = _bias_variable("biases_" + name + "_" + str(kernel_size[3]), kernel_size[-1])
with tf.name_scope(name):
tf.summary.histogram("weights_" + name, kernelConv)
tf.summary.histogram("biases_" + name, biasConv)
return prev_layer + biasConv
def activation_normalization_layer(self, input_vector, batch_norm, validation, isTraining, name='act_norm_'):
if batch_norm:
input_vector = tf.layers.batch_normalization(input_vector, training=isTraining, name = name)
if validation == 'softplus':
return tf.nn.softplus(input_vector, name="softplus")
else:
return tf.nn.elu(input_vector, name="elu")
def fc_layer(self, input_vector, input_size, output_size, bias, name, num_retype):
weightsLinear = _weight_variable("weights_" + name + "_" + str(num_retype), [input_size, output_size])
prev_layer = tf.matmul(input_vector, weightsLinear)
if bias:
biasLinear = _bias_variable("biases_" + name + "_" + str(num_retype), [output_size])
with tf.name_scope(name):
tf.summary.histogram("weights_" + name, weightsLinear)
if bias:
tf.summary.histogram("biases_" + name, biasLinear)
if bias:
return prev_layer + biasLinear
else:
return prev_layer
def compute_loss(self, scores, cad_score):
return tf.square(scores - cad_score, name='loss')
def train(self, loss, learning_rate):
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
# optimizer = tf.train.RMSPropOptimizer(learning_rate, decay = 0.999)
global_step = tf.Variable(0, name='global_step', trainable=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step, name='train_op')
return train_op
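# Minimal usage sketch (illustrative only, not part of the original file; placeholder names follow
# the conventions used in the training scripts further down):
#   model = ScoringModel(num_retype=15, GRID_SIZE=24, NB_TYPE=169)
#   maps_pl = tf.placeholder(tf.float32, shape=(None, 169 * 24 * 24 * 24), name='main_input')
#   gt_pl = tf.placeholder(tf.float32, shape=(None,), name='gt_pl')
#   is_training = tf.placeholder(tf.bool, name='is_training')
#   scores = model.get_pred(maps_pl, is_training)
#   loss = model.compute_loss(scores, gt_pl)
#   train_op = model.train(loss, learning_rate=1e-4)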
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
class conv_params:
def __init__(self, layers):
"""
Provides a quick way to implement different architectures for the convolutional part
of the network, so that small changes can be tested easily. It gathers the
information needed to build the convolution block in an object that is fed
to the model at creation time.
:param layers:
e.g. ['conv_20_3_VALID', 'conv_30_4_VALID', 'conv_20_4_VALID', 'avgpool_4_VALID']
Each element of the list is a string starting either
with the 'conv' keyword, followed by the number of channels,
the size of the cubic kernel and the padding (VALID / SAME), or
with 'avgpool' for a pooling layer, followed by the pooling size.
For now every conv layer is followed by a batch normalization
layer.
"""
# Parse the layers list to check for errors
errors = 0
for la in layers:
parse = la.split('_')
# TODO: improve the error checking
if not(parse[0] == 'conv' or parse[0] == 'avgpool'):
print('ERROR: Anomaly detected while parsing argument :', parse[0],'is not a valid keyword')
errors += 1
if not errors == 0:
raise ValueError(str(errors) + ' error(s) while parsing the argument')
self.layers = layers
def get_output_size(self, input_size):
"""
Used when building the graph of the model
:param input_size: Number of voxels along one side of the map before the convolution block
:return: Number of voxels along one side of the map after the convolution operations
"""
final_size = input_size
for l in self.layers:
if l.split('_')[0] == 'conv':
if l.split('_')[3] == 'VALID':
final_size = final_size - int(l.split('_')[2]) + 1
elif l.split('_')[0] == 'avgpool':
final_size = int(final_size//int(l.split('_')[1]))
return final_size
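# Illustrative check of get_output_size() (not part of the original file), using the example
# architecture from the docstring and a 24-voxel input map:
#   24 -> conv 3 VALID -> 22 -> conv 4 VALID -> 19 -> conv 4 VALID -> 16 -> avgpool 4 -> 4
#   params = conv_params(['conv_20_3_VALID', 'conv_30_4_VALID', 'conv_20_4_VALID', 'avgpool_4_VALID'])
#   assert params.get_output_size(24) == 4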
def _weight_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.truncated_normal_initializer(stddev=0.01))
def _bias_variable(name, shape):
return tf.get_variable(name, shape, tf.float32, tf.constant_initializer(0.1, dtype=tf.float32))
class ScoringModel:
def __init__(self,
conv_param,
num_retype=15,
GRID_SIZE=24,
batch_norm=True,
activation='softplus',
final_activation='sigmoid',
nb_branches = 5,
router = 'base',
coarse = False):
"""
Initialize the variables of the model.
:param conv_param: Object that contains all the parameters used to build different versions of the conv block
:param num_retype: Dimension of the map after dimensionality reduction by the retyper
:param GRID_SIZE: Number of voxels along one side of the map
:param batch_norm: True if using batch normalization before activation layers
:param activation: Activation layer ('softplus' or 'elu')
:param final_activation: Final activation ('sigmoid' or tanh-like)
:param nb_branches: Number of branches in the decision tree if the model is not routed according to residue type
:param router: Version of the router ('advanced' sends at least a 0.01 fraction of the data to every branch)
:param coarse: Bool, True if coarse-grained mapping.
"""
self.conv_param = conv_param
self.num_retype = num_retype
self.GRID_SIZE = GRID_SIZE
self.GRID_VOXELS = self.GRID_SIZE * self.GRID_SIZE * self.GRID_SIZE
self.coarse = coarse
self.NB_TYPE = 169
if coarse:
self.NB_TYPE=126
self.num_retype = 3
self.batch_norm = batch_norm
self.validation = activation
self.final_activation = final_activation
self.nb_branches = nb_branches
self.router = router
if self.router == 'routed_res':
self.nb_branches = 20 # no choice for number of branches if routed res.
def get_pred(self,
maps,
res,
isTraining):
"""
Gives the predictions for a given batch of maps. Builds the graph: one retyping layer, one conv block whose
architecture is defined by self.conv_param, and one fully connected block.
:param maps: Maps generated by the protein mapper
:param res: One-hot encoding of the batch's residue types (extracted from the maps' metadata).
Used if the model is routed according to the residue type
:param isTraining: True if the model is training.
:return: List of the predictions for the batch
"""
input_data = tf.reshape(maps, [-1, self.NB_TYPE, self.GRID_SIZE, self.GRID_SIZE, self.GRID_SIZE])
# First step reducing data dimensionality
retyped = self.retype_layer(input_data, self.num_retype, self.NB_TYPE, coarse=self.coarse, name='retype')
flat0 = self.conv_bloc(retyped, isTraining, self.conv_param, name='main_branche')
# The router gives a weight to each branch according to the output of the convolution operations
if not self.router == 'routed_res':
rout = self.router_bloc(flat0, self.nb_branches, self.router, 'router_1')
# Each branch makes a prediction; the predictions are gathered in outs (not yet normalized)
outs = []
for j_ in range(self.nb_branches):
out_r = self.fc_bloc(flat0, isTraining, name='r_' + str(j_ + 1))
outs.append(out_r)
conc = tf.transpose(outs)
# Each prediction is weighted by either the router or the residue type
if self.router == 'routed_res':
out = tf.multiply(conc, res)
else:
out = tf.multiply(conc, rout)
out = tf.reduce_sum(out, axis = 1)
out = tf.squeeze(out)
# predictions are normalized between zero and one.
if self.final_activation == 'tanh':
return tf.add(tf.tanh(out) * 0.5, 0.5, name="main_output")
else:
return tf.sigmoid(out, name="main_output")
def retype_layer(self, prev_layer, num_retype_, input_dim, name='retype', coarse=False):
"""
Reduces the dimensionality of the map.
:param prev_layer: Input data (maps of the proteins)
:param num_retype_: Dimensionality after retyping
:param input_dim: Dimensionality before retyping
:param name: Name given to the tf variables
:param coarse: True if the mapping is coarse-grained
:return: Retyped map
"""
if coarse:
dimPerType = 6
nbBigType = int(self.NB_TYPE / dimPerType)
retyper = _weight_variable(name + "_" + str(num_retype_), [nbBigType, dimPerType * num_retype_])
with tf.name_scope(name):
tf.summary.histogram(name, retyper)
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 3, 4, 1])
map_shape = tf.gather(tf.shape(prev_layer), [0, 1, 2, 3]) # Extract the batch dimension and the three spatial dimensions
map_shape = tf.concat([map_shape, [dimPerType * self.num_retype]], axis=0)
prev_layer = tf.reshape(prev_layer, [-1, self.NB_TYPE])
prev_layer = tf.reshape(prev_layer, [-1, nbBigType, dimPerType])
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 1])
prev_layer = tf.reshape(prev_layer, [-1, nbBigType])
prev_layer = tf.matmul(prev_layer, retyper)
return tf.reshape(prev_layer, map_shape)
else:
retyper = _weight_variable(name + "_" + str(num_retype_), [input_dim, num_retype_])
with tf.name_scope(name):
tf.summary.histogram(name, retyper)
prev_layer = tf.transpose(prev_layer, perm=[0, 2, 3, 4, 1])
map_shape = tf.gather(tf.shape(prev_layer), [0, 1, 2, 3]) # Extract the batch dimension and the three spatial dimensions
map_shape = tf.concat([map_shape, [self.num_retype]], axis=0)
prev_layer = tf.reshape(prev_layer, [-1, self.NB_TYPE])
prev_layer = tf.matmul(prev_layer, retyper)
return tf.reshape(prev_layer, map_shape)
def conv_layer(self, prev_layer, kernel_size, name='CONV', padding='VALID'):
"""
One conv layer with bias. The activation layer is not defined here and must be added afterwards.
:param prev_layer: Output of the previous layer
:param kernel_size: Kernel size (list of integers) [side_size, side_size, side_size, nb_channels_input, nb_channels_output]
:param name: Name given to the operations in that layer. Must be unique
:param padding: Padding, SAME or VALID
:return: output layer (conv + bias)
"""
kernelConv = _weight_variable("weights_" + name + "_" + str(self.num_retype), kernel_size)
prev_layer = tf.nn.conv3d(prev_layer, kernelConv, [1, 1, 1, 1, 1], padding=padding, name = name)
biasConv = _bias_variable("biases_" + name + "_" + str(kernel_size[3]), kernel_size[-1])
with tf.name_scope(name):
tf.summary.histogram("weights_" + name, kernelConv)
tf.summary.histogram("biases_" + name, biasConv)
return prev_layer + biasConv
def activation_normalization_layer(self, input_vector, batch_norm, validation, isTraining, name='activ_norma_'):
"""
Activation layer, with optional batch normalization
:param input_vector: Output of the previous layer
:param batch_norm: Bool, True if you want to enable batch normalization
:param validation: string, either 'softplus' or 'elu'
:param isTraining: True if the network is in its training phase (useful for batch normalization)
:param name: Unique name given to the operations in this layer
:return: Activated (and optionally batch-normalized) tensor
"""
if batch_norm:
input_vector = tf.layers.batch_normalization(input_vector, training=isTraining, name = name)
if validation == 'softplus':
return tf.nn.softplus(input_vector, name="softplus")
elif validation == 'elu':
return tf.nn.elu(input_vector, name="elu")
def fc_layer(self, input_vector, output_size, bias, name):
"""
Fully connected layer
:param input_vector: Output of the previous layer
:param output_size: dimension of the output vector
:param bias: True if you want a bias in the layer
:param name: Unique name to give to the operations in this layer
:return: Output of fully connected layer
"""
# extract the input size from the previous layer
input_size = input_vector.get_shape().as_list()[1]
weightsLinear = _weight_variable("weights_" + name + "_" + str(self.num_retype), [input_size, output_size])
prev_layer = tf.matmul(input_vector, weightsLinear)
if bias:
biasLinear = _bias_variable("biases_" + name + "_" + str(self.num_retype), [output_size])
with tf.name_scope(name):
tf.summary.histogram("weights_" + name, weightsLinear)
if bias:
tf.summary.histogram("biases_" + name, biasLinear)
if bias:
return prev_layer + biasLinear
else:
return prev_layer
def router_bloc(self, input_vector, nb_route = 2, router='base', name = 'router'):
"""
Block that takes the result of the convolutional operations and predicts a weight to give to each branch.
Small network of two fully connected layers with softplus activation.
:param input_vector: Output of the conv block
:param nb_route: Number of branches to choose between
:param router: If different versions of the router are implemented, allows choosing the one to use.
For now, 'advanced' sends a fraction of the predictions made by every branch to the final
result, to avoid the case where only one branch is selected and to help training.
:param name: Unique name given to the operations in these layers
:return: A vector containing a weight for each branch
"""
# extract the input size from the previous layer
input_size = input_vector.get_shape().as_list()[1]
out_size = input_size // 2
weightsLinear = _weight_variable("weights_" + name, [input_size, out_size])
biasLinear = _bias_variable("biases_" + name, [out_size])
routes = tf.matmul(input_vector, weightsLinear) + biasLinear
routes = tf.nn.softplus(routes, name="softplus_" + name)
weightsLinear2 = _weight_variable("weights2_" + name, [out_size, nb_route])
biasLinear2 = _bias_variable("biases2_" + name, [nb_route])
routes = tf.matmul(routes, weightsLinear2) + biasLinear2
if router=='advanced':
fraction = 0.01
return tf.add(tf.nn.softmax(routes)*(1 - fraction), tf.constant(np.ones(nb_route)*fraction, dtype=tf.float32), name='out_' + name)
return tf.nn.softmax(routes, name='out_' + name)
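# With router='advanced', the softmax output is rescaled as
#   out_i = softmax(routes)_i * (1 - fraction) + fraction        (fraction = 0.01)
# so every branch receives a weight of at least 0.01, which prevents the router from collapsing
# onto a single branch early in training. Note that the weights then sum to
# (1 - fraction) + nb_route * fraction rather than exactly 1.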
def fc_bloc(self, input, train, name = 'r_'):
"""
Fully connected block. Two fully connected layers, with activation + batch norm after the first layer and none after the second.
:param input: Output of the previous block
:param train: True if the network is in its training phase: useful for batch normalization so that it only adjusts on training data
:param name: Unique name for operations in this layer
:return: One prediction (not normalized)
"""
LINEAR1_OUT = 64
out_l1 = self.fc_layer(input, LINEAR1_OUT, bias=True, name='linear_1_' + name)
out_l1 = self.activation_normalization_layer(out_l1, self.batch_norm, self.validation, train,
name='act_norm_4_' + name)
out = self.fc_layer(out_l1, 1, False, 'Linear_2_' + name)
return tf.squeeze(out)
def conv_bloc(self, input, train, params_conv, name = 'r_'):
"""
Convolution block, whose architecture is dictated by the conv_params object
:param input: output of the retype layer, i.e. the retyped 3D map
:param train: True if the network is in its training phase: useful for batch normalization so that it only adjusts on training data
:param params_conv: list of strings that describe the architecture of the block
:param name: unique name for operations in the block
:return: flattened vector, result of the conv operations
"""
id = 0
# extract the nb of channel of the input vector
last_channel = input.get_shape().as_list()[-1]
for l_i, layer in enumerate(params_conv.layers):
id += 1
# For each string in the conv_params list, add the layer it describes (conv or avgpool)
if layer.split('_')[0] == 'conv':
# extract the hyperparameters from the string
k_side = int(layer.split('_')[2])
input_channel, output_channel = last_channel, int(layer.split('_')[1])
# keep the output number of channels in memory for the next layer
last_channel = output_channel
kernel_params = [k_side, k_side, k_side, input_channel, output_channel]
input = self.conv_layer(input, kernel_params, name='CONV_'+ str(id)+'_'+name, padding=layer.split('_')[3])
input = self.activation_normalization_layer(input, self.batch_norm, self.validation, train, name='act_norm_' + str(id) + '_' + name)
elif layer.split('_')[0] == 'avgpool':
pool_size = int(layer.split('_')[1])
if not pool_size == 0:
input = tf.nn.avg_pool3d(input,
[1, pool_size, pool_size, pool_size, 1],
[1, pool_size, pool_size, pool_size, 1],
padding='VALID')
final_side_size = params_conv.get_output_size(self.GRID_SIZE)
nb_dimout = final_side_size * final_side_size * final_side_size * last_channel
return tf.reshape(input, [-1, nb_dimout])
def compute_loss(self, scores, cad_score):
"""
Compute the loss: squared (L2) distance
:param scores: List of predicted scores for the batch
:param cad_score: List of ground-truth scores
:return: list of squared distances between predicted scores and ground-truth scores
"""
return tf.square(scores - cad_score, name='loss')
def train(self, loss, learning_rate):
"""
Training operation added when building the graph, using the Adam optimizer
:param loss: output of the loss function
:param learning_rate: Learning rate
:return: training operation, output of Optimizer.minimize()
"""
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
# optimizer = tf.train.RMSPropOptimizer(learning_rate, decay = 0.999)
global_step = tf.Variable(0, name='global_step', trainable=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step, name='train_op')
return train_op
import numpy as np
import os
def stat_file(path_to_file):
f = open(path_to_file, 'r')
lines = f.readlines()
scores = []
for l in lines:
[_, sco] = l.split(' ')
# strip the trailing newline before parsing the score
sco = sco.strip()
if sco == '0':
scores.append(0)
else:
scores.append(float(sco))
f.close()
if len(scores) == 0:
return [0]
return np.mean(scores)
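# Assumed .sco file format (illustrative, inferred from the parsing above): one residue per line,
# "<residue_id> <score>", e.g.
#   1 0.734512
#   2 0
# stat_file() then returns the mean of the second column (or [0] if the file is empty).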
def stat_prot(path_to_prot):
file_ls = os.listdir(path_to_prot)
scores_p = np.array([])
for f in file_ls:
if f[-4:] == '.sco':
sco_f = stat_file(path_to_prot + '/' + f)
scores_p = np.append(scores_p, sco_f)
if scores_p.size == 0:
return None
return scores_p
def stat_casp(path_to_casp):
file_ls = os.listdir(path_to_casp)
scores_c = np.array([])
for f in file_ls:
stats_prot = stat_prot(path_to_casp + '/' + f)
if not stats_prot is None:
scores_c = np.append(scores_c, stats_prot)
scores_c = np.reshape(scores_c,(-1,))
return scores_c
def stat_full(path_to_data, casp_choice):
file_ls = casp_choice
scores_full = np.array([])
for f in file_ls:
print('Stats : ' + f)
scores_c = stat_casp(path_to_data + '/' + f + '/MODELS/')
scores_full = np.append(scores_full, scores_c)
print('Done')
scores_full = np.reshape(scores_full,(-1,))
return scores_full
def main():
print(stat_full('/home/benoitch/data/', ['CASP7','CASP8']))
if __name__ == '__main__':
main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import numpy
import subprocess
import os
import random
from pathlib import Path
from six.moves import xrange
import tensorflow as tf
import load_data
import model
import config
import stat_casp
# Basic model parameters as external flags.
FLAGS = None
log_file = None
def placeholder_inputs(batch_size):
maps_placeholder = tf.placeholder(tf.float32, shape=(batch_size, model.GRID_VOXELS * model.NB_TYPE),
name='main_input')
ground_truth = tf.placeholder(tf.float32, shape=(batch_size))
is_training = tf.placeholder(tf.bool, name='is_training')
return maps_placeholder, ground_truth, is_training
def fill_feed_dict(data_set, maps_pl, gt_pl, is_training, training=True, batch_size_=50, selec_res=-1):
maps_feed, gt_feed = data_set.next_batch(batch_size_, shuffle=True, select_residue=selec_res)
feed_dict = {
maps_pl: maps_feed,
gt_pl: (gt_feed[:, 2] / 1000000),
is_training: training
}
return feed_dict
def run_training(test=None, strategy="active", nb_prot_batch=1, selec_res = 83):
# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():
# testfname = FLAGS.test_file
# testDataset = load_data.read_data_set(testfname, dtype=dtypes.float16, seed = 1)
# initialize a map generator variable with the configuration of the config file
conf = config.load_config()
# Create a session for running Ops on the Graph.
sess = tf.Session()
# Generate placeholders for the maps and labels.
maps_placeholder, ground_truth_placeholder, is_training = placeholder_inputs(None)
# Build a Graph that computes predictions from the inference model.
n_retype = 15
logits, _, _, weightsLinear2 = model.scoringModel(n_retype, maps_placeholder, is_training, batch_norm=True,
validation='elu', final_activation='tanh')
loss = model.loss(logits, ground_truth_placeholder)
train_op = model.training(loss, FLAGS.learning_rate)
variable_summaries(logits)
variable_summaries(ground_truth_placeholder)
with tf.name_scope('summaries'):
tf.summary.scalar('loss', tf.reduce_mean(1000 * loss))
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(conf.TENSORBOARD_PATH)
writer.add_graph(sess.graph)
# Create a saver for writing training checkpoints.
print("Create saver ...")
saver = tf.train.Saver(tf.global_variables())
# And then after everything is built:
# Run the Op to initialize the variables.
if (os.path.isdir(FLAGS.restore)):
print('Restore existing model: %s' % (FLAGS.restore))
print('Latest checkpoint: %s' % (tf.train.latest_checkpoint(FLAGS.restore)))
saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(FLAGS.restore) + '.meta')
saver.restore(sess, tf.train.latest_checkpoint(FLAGS.restore))
print("Model restored!")
else:
# Add the variable initializer Op.
print("Initializing...")
init = tf.global_variables_initializer()
sess.run(init)
print("Initialized!")
trainDataset = None
slidingloss = 0
decayLoss = 0.999
dln = 1
# counts the number of updates of the weights
nb_updates = 0
lim = 0
percentile_range = 1
init = True
for step in xrange(FLAGS.max_steps):
# choose a file randomly
for i_ in range(nb_prot_batch):
pdbfile = None
native = False
while (pdbfile == None):
caspNumber, casp_choice = choose_casp()
# file in models
if strategy == 'active':
# progressively widens the pool of proteins, starting with the ones whose mean score lies in the top percentile of the dataset
if init or lim > 500:
if init:
if os.path.exists(conf.TEMP_META_PATH + FLAGS.data_variability + '_means.npy'):
print('Loading CASP stats...')
f = open(conf.TEMP_META_PATH + FLAGS.data_variability + '_means.npy', 'rb')
means_casp = numpy.load(f)
f.close()
print('Done !')
else:
print('Computing CASP stats...')
means_casp = stat_casp.stat_full(conf.DATA_FILE_PATH, casp_choice)
f = open(conf.TEMP_META_PATH + FLAGS.data_variability + '_means.npy', 'wb')
print('Saving CASP stats...')
numpy.save(f, means_casp)
f.close()
print('Done !')
init = False
a = numpy.percentile(means_casp, 100 - percentile_range)
# b = numpy.percentile(means_casp, 50 + percentile_range)
# print('Range : ', percentile_range)
# print(a)
# print(b)
if percentile_range * 1.09 < 60:
percentile_range = percentile_range * 1.09
else:
percentile_range = 60
lim = 0
mean_file = 0
nb_file_ignored = -1
loop = True
start_time = time.time()
while loop:
nb_file_ignored += 1
model_dir = random.choice(os.listdir(conf.DATA_FILE_PATH + caspNumber + "/MODELS"))
model_file = random.choice(os.listdir(conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir))
if model_file[-4:] == ".sco":
model_file = model_file[:-4]
if os.path.isfile(conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir + "/" + model_file + '.sco'):
mean_file = numpy.mean(stat_casp.stat_file(
conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir + "/" + model_file + '.sco'))
else:
mean_file = 0
if a <= mean_file:
loop = False
print(mean_file)
duration = time.time() - start_time
print("Time to find file : %.3f - files ignored : " % duration, nb_file_ignored)
print('Range : ', percentile_range / 1.09)
else:
model_dir = random.choice(os.listdir(conf.DATA_FILE_PATH + caspNumber + "/MODELS"))
model_file = random.choice(os.listdir(conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir))
if model_file[-4:] == ".sco":
model_file = model_file[:-4]
score_file = Path(conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir + "/" + model_file + ".sco")
if score_file.is_file():
pdbfile = conf.DATA_FILE_PATH + caspNumber + "/MODELS/" + model_dir + "/" + model_file
native = False
print(pdbfile)
# create a density map of it
start_time = time.time()
if native:
subprocess.call(
[conf.ROTAMERS_EXE_PATH, "--mode", "map", "-i", pdbfile, "--native", "-m", "24", "-v", "0.8", "-o",
conf.TEMP_PATH + '_' + str(i_) + '.bin'])
else:
subprocess.call([conf.ROTAMERS_EXE_PATH, "--mode", "map", "-i", pdbfile, "-m", "24", "-v", "0.8", "-o",
conf.TEMP_PATH + '_' + str(i_) + '.bin'])
duration = time.time() - start_time
print("Mapping duration : %.3f" % duration)
start_time = time.time()
trainDataset = load_data.read_data_set(conf.TEMP_PATH + '_' + str(i_) + '.bin', shuffle=True, prop=1)
duration = time.time() - start_time
print("Loading duration : %.3f" % duration)
if trainDataset is None:
continue
if i_ == 0:
trainDataset_full = trainDataset
print('New data set created with extracted maps')
else:
trainDataset_full.append(trainDataset)
print('Extracted maps added to current data set')
if selec_res == -1:
for i in xrange(1 + trainDataset_full.num_res // FLAGS.batch_size):
# Fill a feed dictionary with the actual set of maps and labels
# for this particular training step.
feed_dict = fill_feed_dict(trainDataset_full,
maps_placeholder,
ground_truth_placeholder,
is_training, FLAGS.dropout, batch_size_=FLAGS.batch_size) # 0.5
# Run one step of the model. The return values are the activations
# from the `train_op` (which is discarded) and the `loss` Op. To
# inspect the values of your Ops or variables, you may include them
# in the list passed to sess.run() and the value tensors will be
# returned in the tuple from the call.
start_time = time.time()
if numpy.sum(feed_dict[ground_truth_placeholder]) > 1:
dln = dln * decayLoss
_, loss_value, res, s = sess.run([train_op, loss, logits, merged],
feed_dict=feed_dict)
nb_updates += 1
lim += 1
slidingloss = decayLoss * slidingloss + (1 - decayLoss) * numpy.mean(loss_value)
print('Res - mean : %.4f - std : %.6f' % (numpy.mean(res), numpy.std(res)))
print('Tru - mean : %.4f - std : %.6f' % (
numpy.mean(feed_dict[ground_truth_placeholder]), numpy.std(feed_dict[ground_truth_placeholder])))
writer.add_summary(s, nb_updates)
# print(loss_value)
# print(numpy.sum(loss_value))
duration = time.time() - start_time
print('Step %d: loss = %.4f (%.3f sec)' % (step, 1000 * slidingloss / (1 - dln), duration))
# print(numpy.corrcoef(res, feed_dict[ground_truth_placeholder] ))
else:
epoch_before_batch = trainDataset_full._epochs_completed
while epoch_before_batch == trainDataset_full._epochs_completed:
# Fill a feed dictionary with the actual set of maps and labels
# for this particular training step.
feed_dict = fill_feed_dict(trainDataset_full,
maps_placeholder,
ground_truth_placeholder,
is_training, FLAGS.dropout, batch_size_=FLAGS.batch_size, selec_res = selec_res) # 0.5
# Run one step of the model. The return values are the activations
# from the `train_op` (which is discarded) and the `loss` Op. To
# inspect the values of your Ops or variables, you may include them
# in the list passed to sess.run() and the value tensors will be
# returned in the tuple from the call.
start_time = time.time()
if numpy.sum(feed_dict[ground_truth_placeholder]) > 1:
dln = dln * decayLoss
_, loss_value, res, s = sess.run([train_op, loss, logits, merged],
feed_dict=feed_dict)
nb_updates += 1
lim += 1
slidingloss = decayLoss * slidingloss + (1 - decayLoss) * numpy.mean(loss_value)
print('Res - mean : %.4f - std : %.6f' % (numpy.mean(res), numpy.std(res)))
print('Tru - mean : %.4f - std : %.6f' % (
numpy.mean(feed_dict[ground_truth_placeholder]), numpy.std(feed_dict[ground_truth_placeholder])))
writer.add_summary(s, nb_updates)
# print(loss_value)
# print(numpy.sum(loss_value))
duration = time.time() - start_time
print('Step %d: loss = %.4f (%.3f sec)' % (step, 1000 * slidingloss / (1 - dln), duration))
# print(numpy.corrcoef(res, feed_dict[ground_truth_placeholder] ))
# Write the summaries and print an overview fairly often.
if step % 10 == 0:
save_path = saver.save(sess, conf.SAVE_PATH)
print("Model saved in path: %s" % save_path)
print('\n')
# print('Done')
# var = sess.run(weightsLinear2)
# print(var)
def variable_summaries(var):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope('summaries'):
mean, std = tf.nn.moments(var, axes=[0])
tf.summary.scalar('mean', mean)
tf.summary.scalar('std', std)
def choose_casp():
# choose a CASP directory randomly, depending on the different options available in the data directory
if FLAGS.data_variability == 'file_one':
casp_choice = ["dummy1"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'files_same':
casp_choice = ["dummy4"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'files_different':
casp_choice = ["dummy2"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'protein':
casp_choice = ["dummy3"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'protein_few':
casp_choice = ["dummy5"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'all':
casp_choice = ["CASP7", "CASP8", "CASP9", "CASP10", "CASP11"]
return random.choice(casp_choice), casp_choice
elif FLAGS.data_variability == 'casp':
casp_choice = ["CASP11"]
return random.choice(casp_choice), casp_choice
def main(_):
run_training()
# conf = config.load_config()
# means_casp12 = stat_casp.stat_casp('/home/benoitch/data/CASP12/MODELS/')
# eval_model.run_eval('/home/benoitch/Temp/Model', '/home/benoitch/data/', numpy.percentile(means_casp12, 40), FLAGS.learning_rate, batch_s = 40)
# eval_model.prepare_plot('/home/benoitch/save/nn_3/Model/', '/home/benoitch/data/')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--learning_rate',
type=float,
default=0.0001,
help='Initial learning rate.'
)
parser.add_argument(
'--dropout',
type=float,
default=0.5,
help='Dropout rate.'
)
parser.add_argument(
'--max_steps',
type=int,
default=25000,
help='Number of steps to run trainer.'
)
parser.add_argument(
'--conv_layers_size',
type=int,
nargs='*',
dest='conv_size_list', # store in 'list'.
default=[],
help='Number of output in each convolutional layer.'
)
parser.add_argument(
'--full_layers_size',
type=int,
nargs='*',
dest='full_size_list', # store in 'list'.
default=[],
help='Number of units in each hidden layer.'
)
parser.add_argument(
'--groupSize',
type=int,
default=12,
help='Size of the group representing the same data.'
)
parser.add_argument(
'--elementSize',
type=int,
default=1,
help='Number of group of density map representing the same structure.'
)
parser.add_argument(
'--batch_size',
type=int,
default=10,
help='Batch size. Must divide evenly into the dataset sizes.'
)
parser.add_argument(
'--log_dir',
type=str,
default='/tmp/tensorflow/mnist/logs/fully_connected_feed2',
help='Directory to put the log data.'
)
parser.add_argument(
'--log_file',
type=str,
default='tmp.log',
help='File to put human readable logs.'
)
parser.add_argument(
'--restore',
type=str,
default='', # '/tmp/tensorflow/mnist/logs/fully_connected_feed_',
help='path to restore model from'
)
parser.add_argument(
'--evaluate',
default=False,
help='train or evaluate the data'
)
parser.add_argument(
'--training_data_path',
default='',
help='path to the training data directory'
)
parser.add_argument(
'--test_file',
default='',
help='path to the test data file'
)
parser.add_argument(
'--data_variability',
default='all',
help='Define the variability of the data: only one version of a protein (file_one), a few versions of the same protein (files_same), different proteins (files_different), one protein (protein), a few proteins (protein_few), a CASP directory (casp), or all the data (all)'
)
FLAGS, unparsed = parser.parse_known_args()
log_file = open(FLAGS.log_file, "w")
print(FLAGS)
print(unparsed)
print(FLAGS, file=log_file)
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
Namespace(batch_size=10, conv_size_list=[], data_variability='all', dropout=0.5, elementSize=1, evaluate=False, full_size_list=[], groupSize=12, learning_rate=0.0001, log_dir='/tmp/tensorflow/mnist/logs/fully_connected_feed2', log_file='tmp.log', max_steps=25000, restore='', test_file='', training_data_path='')
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import random
import os
import subprocess
import numpy as np
import argparse
import pickle
import config
import model_v2_routed as model_v2
import load_data
import stat_casp
"""
General notes:
# I suggest naming the tests following this convention: start with '_' then the id of the test (an integer), e.g. '_28'.
If you stop the training and want to start it again, just append '_' and an integer equal to the number of times you have
restarted the training.
eg. Initial training : test = '_28'
restarted : test = '_28_1'
restarted again: test = '_28_2' etc...
To plot the loss curves, just move the loss files into a specific directory, change the compare_losses.py variable
so that it points to that directory, and then call:
python3 compare_losses.py -t [list of ids that you want to plot, eg: 0 12 28]
"""
conf = config.load_config()
FLAGS = None
log_file = None
class Strategy(object):
"""
A strategy is an object that is used to provide the model with data during the training phase.
Its role is to choose files from the database according to a training strategy (for instance: choose the file
among the top 1% in terms of ground-truth CAD score, and every 200 steps widen this range by a factor of 1.05)
and then hand the chosen file to the mapper.
"""
def __init__(self, casp_possibilities, range_init = 1, update_every = 200, update_speed = 1.09, range_max = 65, use_ls_file = None):
"""
:param casp_possibilities: String that is given to the choose_casp function and that determines the directories
from where the data is taken (eg. 'all' -> CASP7 to CASP10, see choose_casp())
:param range_init: Initial range, used by Active_1
:param update_every: Number of batches to wait before updating the range
:param update_speed: The range is multiplied by this factor at each update
:param range_max: Range updates stop after reaching this level
:param use_ls_file: For feeding_order_strategy: path to the feeding order file.
"""
# loads the distribution of the ground-truth scores for the data set, if it already exists
if os.path.exists(conf.TEMP_META_PATH + casp_possibilities + '_means.npy'):
print('Loading CASP stats...')
f = open(conf.TEMP_META_PATH + casp_possibilities + '_means.npy', 'rb')
self.means_casp = np.load(f)
f.close()
print('Done !')
# if it does not, computes it with stat_casp and then saves it
else:
print('Computing CASP stats...')
_, casp_list = self.choose_casp(casp_possibilities)
self.means_casp = stat_casp.stat_full(conf.DATA_FILE_PATH, casp_list)
f = open(conf.TEMP_META_PATH + casp_possibilities + '_means.npy', 'wb')
print('Saving CASP stats...')
np.save(f, self.means_casp)
f.close()
print('Done !')
self.casp_choice = casp_possibilities
self.range = range_init
self.update = update_every
self.count = 0
self.update_factor = update_speed
self.lim_inf = range_max
self.use_ls_file = use_ls_file
# index of the current line in the feeding-order file
self.file_i = 0
# For feeding_order_strategy, loads the file and reads its lines: each line is the path to a protein
if use_ls_file is None:
self.ls_file = []
else:
f = open(use_ls_file,'r')
self.ls_file = f.readlines()
for i,l in enumerate(self.ls_file):
self.ls_file[i] = l[:-1]
print(self.ls_file)
f.close()
def update_range(self):
"""
Updates the range variable every self.update times it is called, up to self.lim_inf
:return:
"""
self.count += 1
if self.count > self.update:
self.count = 0
if self.range*self.update_factor < self.lim_inf:
self.range *= self.update_factor
else:
self.range = self.lim_inf
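# Example of the schedule (illustrative): with range_init=1, update_every=200 and
# update_speed=1.09, the range stays at 1% for the first 200 batches, then grows
# geometrically (1 -> 1.09 -> 1.19 -> ...) every 200 batches until it reaches range_max.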
def choose_file(self):
"""
Chooses a file randomly and checks whether it suits the strategy. Once it finds one that does, returns it
:return: path to the chosen file
"""
pdb = None
nb_file_ignored = -1
while pdb is None:
nb_file_ignored += 1
caspNumber, casp_choice = self.choose_casp(self.casp_choice)
prot_path = conf.DATA_FILE_PATH + caspNumber + '/MODELS/'
prot_dir = random.choice(os.listdir(prot_path))
prot_file = random.choice(os.listdir(prot_path + prot_dir))
if prot_file[-4:] == ".sco":
prot_file = prot_file[:-4]
prot_dir_path = prot_path + prot_dir + '/'
prot_file_path = prot_dir_path + prot_file
score_file = prot_file_path + ".sco"
if os.path.exists(score_file):
pdb, sco = self.check_file(prot_file_path)
else:
continue
if pdb is None:
continue
print('Chose file %s according to strategy.' % pdb)
print('Nb files ignored : ', nb_file_ignored)
print('File quality : ', sco)
return pdb
def check_file(self, model_file_path):
"""
Check whether one file corresponds to the object's strategy
:param model_file_path: path to a file (pdb format)
:return: (path, mean score) if the file fits the strategy, (None, None) otherwise.
"""
# returns None if the chosen file doesn't match the training strategy at that point in time
# if it does, returns the file path itself (and its mean score)
sco_file = np.mean(stat_casp.stat_file(model_file_path + '.sco'))
if self.is_ok(sco_file):
return model_file_path, sco_file
print('None')
return None, None
def choose_casp(self, var):
"""
Randomly chooses a CASP directory according to the variability keyword
This directory must have one MODELS/ subdir, with a subdir for every protein (eg. MODELS/T0854/) in which
the protein files are saved in pdb format
:param var: Keyword
:return: Name of the chosen directory and the list of possible directories
"""
# choose a CASP directory randomly, depending on the different options available in the data directory
if var == 'file_one':
casp_choice = ["dummy1"]
return random.choice(casp_choice), casp_choice
elif var == 'files_same':
casp_choice = ["dummy4"]
return random.choice(casp_choice), casp_choice
elif var == 'files_different':
casp_choice = ["dummy2"]
return random.choice(casp_choice), casp_choice
elif var == 'protein':
casp_choice = ["dummy3"]
return random.choice(casp_choice), casp_choice
elif var == 'protein_few':
casp_choice = ["dummy5"]
return random.choice(casp_choice), casp_choice
elif var == 'all':
# casp_choice = ["CASP7", "CASP8", "CASP9", "CASP10", "CASP11"]
casp_choice = ["CASP7", "CASP8", "CASP9", "CASP10"]
return random.choice(casp_choice), casp_choice
elif var == 'one_casp':
casp_choice = ["CASP11"]
return random.choice(casp_choice), casp_choice
class Passive(Strategy):
"""
No discrimination between the files.
"""
def is_ok(self, sco_file):
return True
class Active_1(Strategy):
"""
Selects the file among the top self.range % (ground truth cad score) of the data set.
Every self.update batches, updates the range by a factor of self.update_factor, up to self.lim_inf
"""
def is_ok(self, sco_file):
a = np.percentile(self.means_casp, 100 - self.range)
if sco_file > a:
return True
return False
class Active_2(Strategy):
"""
Selects the file among the top self.lim_inf %. Constant range.
"""
def is_ok(self, sco_file):
a = np.percentile(self.means_casp, 100 - self.lim_inf)
if sco_file > a:
return True
return False
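# Example (illustrative): with self.range = 1, np.percentile(self.means_casp, 99) gives the score
# above which only the best 1% of files lie, so Active_1 initially accepts only those files; as the
# range grows toward range_max the accepted pool widens. Active_2 instead keeps the threshold fixed
# at the top self.lim_inf % for the whole training.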
class feeding_order_strategy(Strategy):
"""
Selects the file following the order specified in the feeding_order file specified at the creation of the object
"""
def choose_file(self):
model_file_path = self.ls_file[self.file_i]
print(model_file_path)
self.file_i += 1
if self.file_i == len(self.ls_file):
print('Run out of files')
return 'out_of_file'
return model_file_path
def is_ok(self, sco_file):
return True
def placeholder_inputs(batch_size, coarse=False):
"""
Creates placeholders for the input data used during the training of the model
:param batch_size: integer
:param coarse: True if the mapping is coarse-grained
:return: Placeholders
"""
nbDimTypes = FLAGS.types
if coarse :
nbDimTypes *=6
maps_placeholder = tf.placeholder(tf.float32, shape=(batch_size, FLAGS.voxels * FLAGS.voxels * FLAGS.voxels * nbDimTypes),
name='main_input')
ground_truth = tf.placeholder(tf.float32, shape=batch_size, name = 'gt_pl')
is_training = tf.placeholder(tf.bool, name='is_training')
meta_placeholder = tf.placeholder(tf.float32, shape=(batch_size,16), name='meta_pl')
return maps_placeholder, ground_truth, is_training, meta_placeholder
def variable_summaries(var):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope('summaries'):
mean, std = tf.nn.moments(var, axes=[0])
tf.summary.scalar('mean', mean)
tf.summary.scalar('std', std)
def fill_feed_dict(data_set, maps_pl, meta_pl, gt_pl, is_training, training=True, batch_size_=50):
"""
:param data_set: |
:param maps_pl: |
:param meta_pl: | Data placeholders
:param gt_pl: |
:param is_training: |
:param training: True if the model is training
:param batch_size_: integer
:return: dictionary to give to sess.run(..., feed_dict=...)
"""
maps_feed, gt_feed = data_set.next_batch(batch_size_, shuffle=True)
feed_dict = {
maps_pl: maps_feed,
meta_pl: gt_feed,
gt_pl: (gt_feed[:, 2] / 1000000),
is_training: training
}
return feed_dict
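# Note (inferred from the code above, exact encoding is defined by the mapper / load_data):
# column 2 of the metadata returned by next_batch() appears to hold the ground-truth score stored
# as an integer scaled by 1e6, hence the division by 1000000, which brings it back to the
# [0, 1] range predicted by the network.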
def train_model(feeding_strategy, restore=None, conv_param=None, max_step=20000, batch_size=10, learning_rate=0.001, nb_branches=1, nb_voxel=24, size_voxel=0.8, router='base', test='', coarse='', useMeta = 0):
"""
Trains the model
:param feeding_strategy: Strategy object
:param restore: Path to the model to restore
:param conv_param: conv_params object from model_v2_routed
:param max_step: Number of steps to train the model
:param batch_size: integer
:param learning_rate: float
:param nb_branches: Number of branches for the model
:param nb_voxel: Number of voxels for mapping
:param size_voxel: Size in angstroms of one voxel for mapping
:param router: Type of router (eg. 'advanced', 'routed_res')
:param test: string, test identifier, used as a suffix for the files created by the script
:param coarse: 'coarse' for coarse-grained mapping, empty string otherwise
:param useMeta: Integer flag controlling the use of meta information (see the --useMeta argument)
:return: None
"""
with tf.Graph().as_default():
sess = tf.Session()
# if the model is not restored, it is built using model_v2_routed
if restore is None:
print('Model needs to be built...')
coarseParam = ( coarse == 'coarse' )
scoring_nn = model_v2.ScoringModel(conv_param, num_retype=15, GRID_SIZE=nb_voxel, activation='elu', final_activation='tanh', nb_branches=nb_branches, router=router, coarse = coarseParam, types = FLAGS.types, useMeta = useMeta)
maps_placeholder, ground_truth_placeholder, is_training, meta_placeholder = placeholder_inputs(None, coarse=coarseParam)
logits = scoring_nn.get_pred(maps_placeholder, meta_placeholder, is_training)
loss = scoring_nn.compute_loss(logits, ground_truth_placeholder)
train_op = scoring_nn.train(loss, learning_rate)
# adding all the summaries
variable_summaries(logits)
variable_summaries(ground_truth_placeholder)
with tf.name_scope('summaries'):
tf.summary.scalar('loss', tf.reduce_mean(1000 * loss))
merged = tf.summary.merge_all()
print(merged.name)
writer = tf.summary.FileWriter(conf.TENSORBOARD_PATH)
writer.add_graph(sess.graph)
# Create a saver for writing training checkpoints.
saver = tf.train.Saver(tf.global_variables())
# initializing variables
init = tf.global_variables_initializer()
sess.run(init)
else:
print('Restore existing model: %s' % (restore))
print('Latest checkpoint: %s' % (tf.train.latest_checkpoint(restore)))
saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(restore) + '.meta')
saver.restore(sess, tf.train.latest_checkpoint(restore))
graph = tf.get_default_graph()
meta_placeholder = graph.get_tensor_by_name('meta_pl:0')
maps_placeholder = graph.get_tensor_by_name('main_input:0')
ground_truth_placeholder = graph.get_tensor_by_name('gt_pl:0')
is_training = graph.get_tensor_by_name('is_training:0')
logits = graph.get_tensor_by_name("main_output:0")
loss = graph.get_tensor_by_name("loss:0")
train_op = graph.get_tensor_by_name('train_op:0')
merged = graph.get_tensor_by_name("Merge/MergeSummary:0")
writer = tf.summary.FileWriter(conf.TENSORBOARD_PATH)
if not router == 'routed_res':
graph = tf.get_default_graph()
router = graph.get_tensor_by_name("out_router_1:0")
updates = 0
slidingloss = 0
decayLoss = 0.999
dln = 1
losses = []
while updates < max_step:
trainDataset = None
while trainDataset is None:
# Choosing a file
pdb_file = feeding_strategy.choose_file()
if pdb_file == 'out_of_file':
# this means we've reached the end of the feeding_order file
updates = max_step + 10
trainDataset = 'out_of_file'
continue
print('Mapping pdb file...')
mapcommand = [conf.ROTAMERS_EXE_PATH, "--mode", "map", "-i", pdb_file, "-m", str(nb_voxel), "-t", str(FLAGS.types), "-v",
str(size_voxel), "-o", # v 0.8
conf.TEMP_PATH + str(test) + '.bin']
if FLAGS.orient == 0:
mapcommand.append("--orient")
mapcommand.append("0")
if FLAGS.skipNeighb != 0:
mapcommand.append("--skip_neighb")
mapcommand.append(str(FLAGS.skipNeighb))
if coarse == 'coarse':
mapcommand.append("--coarse")
subprocess.call(mapcommand)
print('Loading file...')
trainDataset = load_data.read_data_set(conf.TEMP_PATH + test + '.bin', shuffle=True)
if trainDataset is None:
print('Mapping or loading went wrong ; choosing another file...')
continue
if pdb_file == 'out_of_file':
# we've reached the end of the feeding_order file; updates was set above max_step, so the outer while loop will exit
continue
# If we're not already following a feeding_order file: append the chosen file to the feeding_order list
if feeding_strategy.use_ls_file is None:
f = open(conf.LS_TRAINING_FILE + 'feeding_order', 'a')
f.write(pdb_file + '\n')
f.close()
# Split the protein into different batches
for i in range(1 + trainDataset.num_res // batch_size):
try:
feed_dict = fill_feed_dict(trainDataset,
maps_placeholder,
meta_placeholder,
ground_truth_placeholder,
is_training, batch_size_= batch_size)
if not np.sum(feed_dict[ground_truth_placeholder]) > 1:
print('Not relevant batch : ignore')
continue
dln = dln * decayLoss
# print(np.array(sess.run([tocheck], feed_dict=feed_dict)).shape)
print(maps_placeholder.shape)
_, loss_value, result, s = sess.run([train_op, loss, logits, merged], feed_dict=feed_dict)
except ValueError as e:
print(e)
print(ValueError)
print("Error appened in this batch")
continue
if updates % 100 == 0:
if router == 'routed_res':
a = sess.run(meta_placeholder, feed_dict=feed_dict)
print(a[:,1])
else:
r = sess.run(router, feed_dict=feed_dict)
print(r)
updates += 1
feeding_strategy.update_range()
slidingloss = decayLoss * slidingloss + (1 - decayLoss) * np.mean(loss_value)
print('Res - mean : %.4f - std : %.6f' % (np.mean(result), np.std(result)))
print('Tru - mean : %.4f - std : %.6f' % (np.mean(feed_dict[ground_truth_placeholder]), np.std(feed_dict[ground_truth_placeholder])))
writer.add_summary(s, updates)
print('Step %d: loss = %.4f' % (updates, 1000 * slidingloss / (1 - dln)))
losses.append(1000 * slidingloss / (1 - dln))
if updates % 100 == 0:
if not os.path.exists(conf.SAVE_DIR + test):
os.makedirs(conf.SAVE_DIR + test)
save_path = saver.save(sess, conf.SAVE_DIR + test + '/model.ckpt')
print("Model saved in path: %s" % save_path)
f__ = open(conf.LS_TRAINING_FILE + 'losses'+ test + '.npy', 'wb')
np.save(f__,losses)
f__.close()
if updates % 5000 == 0:
if not os.path.exists(conf.SAVE_DIR + test +'_5000' ):
os.makedirs(conf.SAVE_DIR + test + '_5000' )
save_path = saver.save(sess, conf.SAVE_DIR + test + '_5000' + '/model_'+ str(updates)+'.ckpt')
print("Model saved in path: %s" % save_path)
print('\n\n')
def main():
# use_ls_file = conf.LS_TRAINING_FILE
with open(FLAGS.conv, 'rb') as input:
conv_params = pickle.load(input)
strategy = Passive('all',range_init = 3, update_every = 200, update_speed = 1.1, range_max = 90, use_ls_file=FLAGS.feeding_order)
if not FLAGS.feeding_order is None:
strategy = feeding_order_strategy('all', range_init=3, update_every=200, update_speed=1.1, range_max=90,
use_ls_file=FLAGS.feeding_order)
train_model(strategy, restore=FLAGS.restore,conv_param=conv_params, max_step=FLAGS.steps, batch_size=FLAGS.batch_size,
learning_rate=FLAGS.learning_rate, nb_branches=FLAGS.branches, nb_voxel=FLAGS.voxels,
size_voxel=FLAGS.size, router=FLAGS.router, test = FLAGS.test, coarse=FLAGS.map, useMeta = FLAGS.useMeta)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'-s',
'--steps',
type=int,
default=102000,
help='Number of steps of training'
)
parser.add_argument(
'-b',
'--batch_size',
type=int,
default=40,
help='Size of one batch'
)
parser.add_argument(
'-l',
'--learning_rate',
type=float,
default=0.0001,
help='learning rate'
)
parser.add_argument(
'--branches',
type=int,
default=22,
help='number of branches'
)
parser.add_argument(
'--types',
type=int,
default=167,
help='number of atom types'
)
parser.add_argument(
'-v',
'--voxels',
type=int,
default=24,
help='number of voxel in the map'
)
parser.add_argument(
'--size',
type=float,
default=0.8,
help='Size of a voxel in the map (in angstroms)'
)
parser.add_argument(
'-t',
'--test',
type=str,
default='',
help='id of the test'
)
parser.add_argument(
'-c',
'--conv',
type=str,
help='path to the file where the conv setting are saved'
)
parser.add_argument(
'-f',
'--feeding_order',
default=None,
type=str,
help='path to the file where the feeding order is saved'
)
parser.add_argument(
'--restore',
default=None,
type=str,
help='path to model'
)
parser.add_argument(
'--orient',
default=1,
type=int,
help='orientation 0 or 1 (1 enables orientation)'
)
parser.add_argument(
'--skipNeighb',
default=0,
type=int,
help='number of neighbours to skip (default is zero)'
)
parser.add_argument(
'--useMeta',
default=0,
type=int,
help='useMeta information such as secondary structure and surface area (0 is no information, 1 is secondary structure, 2 is surface area, 3 is both)'
)
parser.add_argument(
'--router',
default='base',
type=str,
help='Define the kind of router you want (base, routed_res, advanced)'
)
parser.add_argument(
'--map',
default='',
type=str,
help='Set to coarse to have a coarse map'
)
FLAGS = parser.parse_args()
main()