Added evcouplings to the image for MSA generation.

16b622fa · Mustafa Tekpinar · 6c95c457 · 16b622fa · 16b622fa · 16b622fa
Commit 16b622fa authored Jun 06, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 440 additions and 0 deletions

.gitignore .gitignore +2 -0

Dockerfile Dockerfile +12 -0

evc_config_monomer_template.txt data/evc_config_monomer_template.txt +426 -0

No files found.
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,6 @@
 example/.RData
 gemmeAnal.pyc
 __pycache__/*
+esgemme/__pycache__/
 esgemme.egg-info/*
 .RData
\ No newline at end of file
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,6 +36,7 @@ apt-get install -y nano && \
 apt-get install -y less && \
 apt-get install -y wget && \
 apt-get install csh && \
+apt-get install -y hmmer && \
 apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 ###################################################################
 RUN wget https://github.com/soedinglab/hh-suite/releases/download/v3.3.0/hhsuite-3.3.0-AVX2-Linux.tar.gz
@@ -55,6 +56,17 @@ RUN pip3 install -e ./demust/
 RUN Rscript -e 'install.packages("seqinr", repos="http://cran.us.r-project.org", dependencies=TRUE)'
+#PLMC and EVcouplings installation for obtaining sequences.
+RUN wget https://github.com/debbiemarkslab/plmc/archive/refs/heads/master.zip
+RUN unzip master.zip
+WORKDIR /home/tekpinar/research/lcqb/plmc-master
+RUN make all-openmp32
+RUN cp bin/plmc /usr/local/bin/
+WORKDIR /home/tekpinar/research/lcqb
+RUN rm -r master.zip plmc-master
+RUN pip3 install install https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip
 # Build command
 # sudo docker build -t tekpinar/esgemme-docker:v1.3.0 .
 # sudo docker login

--- a/data/evc_config_monomer_template.txt
+++ b/data/evc_config_monomer_template.txt
+# Sample configuration file for evcouplings monomer protein prediction pipeline.
+# This file determines all aspects of the computation:
+# - which compute environment to use
+# - which stages of the pipeline to run
+# - what the settings for each of the stages are
+# Minimal settings required before this configuration can be executed:
+# - set your environment, paths to tools and databases (at the end of this file)
+# - under "global", set prefix and sequence_id
+# - run it! :)
+# Configuration rules:
+# 1) Global settings override settings for stages
+# 2) Outputs of a stage are merged into "global" and fed into the input of subsequent stages
+#    (e.g., the alignment_file output of align will be used by the alignment_file input of couplings)
+# 3) All settings are explicitly specified here. No hidden defaults in code.
+# 4) Each stage is also passed the parameters in the "databases" and "tools" sections
+pipeline: protein_monomer
+# which stages of workflow to run. Uncomment downstream stages using # (however, no stage can be run before the previous
+# stage has been run)
+stages:
+    - align
+#    - couplings
+#    - compare
+#    - mutate
+#    - fold
+# Global job settings (which protein, region). These will override settings of the same name in each of the stages.
+# These are typically the settings you want to modify for each of your jobs, together with some settings in the align stage.
+global:
+    # mandatory output prefix of the job (e.g. output/HRAS will store outputs in folder "output", using files prefixed with "HRAS")
+    prefix: 
+    # mandatory sequence identifier (mandatory, even if sequence_file is given)
+    sequence_id: 
+    # optional FASTA file with target sequence (if blank, will fetch try to fetch sequence_id from databases.sequence_download_url)
+    # if sequence_file is set, sequence_id must be defined, but can be arbitrary identifier (does not have to match ID in file)
+    sequence_file: 
+    # cut to subregion of sequence (specify as list, e.g. [24, 286], leave blank for full sequence)
+    region:
+    # Clustering threshold for downweighting redudant sequences (Meff computation). E.g. 0.8 will cluster sequences
+    # at a 80% sequence identity cutoff
+    theta: 0.8
+    # number of cores to use. If running through evcouplings application, will be overriden by environment.cores
+    cpu:
+# Specify multiple batch jobs (if empty, only a single job will be run). Each entry (e.g. b_0.75) will be appended to
+# global.prefix to uniquely identify the subjob. Parameters for individual stages that should be overridden for each
+# subjob have to be specified, for all other parameters jobs share the same values.
+batch:
+#    _b0.75:
+#       align: {domain_threshold: 0.75, sequence_threshold: 0.75}
+#    _b0.3:
+#       align: {domain_threshold: 0.3, sequence_threshold: 0.3}
+# Sequence alignment generation/processing.
+align:
+    # standard: iterative sequence search and postprocessing using jackhmmer.
+    protocol: standard
+    # The following fields usually do not need to be set, since "global" defines them.
+    # prefix:
+    # sequence_id:
+    # sequence_file:
+    # region:
+    # theta:
+    # index of first residue in sequence_id / sequence_file. This can be used to renumber sequences that already have
+    # been cut to a subsequence
+    first_index: 1
+    # Use bitscore threshold instead of E-value threshold for sequence search
+    use_bitscores: True
+    # jackhmmer domain- and sequence-level inclusion thresholds.
+    # if use_bitscores is True:
+    # - floating point number will be interpreted as a relative bitscore threshold (bits/residue)
+    # - integer will be interpreted as an absolute bitscore threshold
+    # if use_bitscore is False:
+    # - mantissa-exponent string or float will be interpreted literally
+    # - integer will be interpreted as negative of the exponent (10 -> 1E-10)
+    domain_threshold: 0.5
+    sequence_threshold: 0.5
+    # number of jackhmmer iterations
+    iterations: 5
+    # sequence database (specify possible databases and paths in "databases" section below)
+    database: uniref100
+    # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage.
+    # To save compute time, this computation is normally carried out in the couplings stage
+    compute_num_effective_seqs: False
+    # Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in
+    # the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence
+    # already present in the alignment). If blank, no filtering. If filtering, HHfilter must be installed.
+    seqid_filter: 98
+    # Only keep sequences that align to at least x% of the target sequence (i.e. remove fragments)
+    minimum_sequence_coverage: 50
+    # Only include alignment columns with at least x% residues (rather than gaps) during model inference
+    minimum_column_coverage: 70
+    # Create a file with extracted annotation from UniRef/UniProt sequence FASTA headers
+    extract_annotation: True
+    cpu:
+    # set to True to turn of jackhmmer bias correction
+    nobias: False
+    # if align stage has been run previously, reuse the generated raw sequence alignment coming out of jackhmmer
+    reuse_alignment: True
+    # create checkpoint files of HMM and aligment after each iteration
+    checkpoints_hmm: False
+    checkpoints_ali: False
+# Alternative protocol: reuse existing alignment and apply postprocessing to generate alignment that is consistent
+# with pipeline requirements. Uncomment, and comment all values in align section above to enable the "existing" protocol
+#    protocol: existing
+#    prefix:
+#    # Path of input alignment. Alignment needs to contain region in form SEQID/start-end, or first_index must be set
+#    input_alignment:
+#    sequence_id:
+#    first_index:
+#    compute_num_effective_seqs: False
+#    theta:
+#    seqid_filter:
+#    minimum_sequence_coverage: 50
+#    minimum_column_coverage: 70
+#    extract_annotation: True
+# Inference of evolutionary couplings from sequence alignment
+couplings:
+    # current options: 
+    # - standard (model inference using plmc)
+    # - mean_field (mean field direct coupling analysis, see below)
+    protocol: standard
+    # number of plmc iterations
+    iterations: 100
+    # specify custom alphabet as a string. Gap symbol must be first character
+    alphabet:
+    # Treat gaps as missing data during model inference
+    ignore_gaps: True
+    # strength of regularization on coupling parameters J
+    lambda_J: 0.01
+    # adjust for larger number of coupling parameters relative to number of fields h (multiply by model length and
+    # number of states)
+    lambda_J_times_Lq: True
+    # strength of regularization on fields h
+    lambda_h: 0.01
+    lambda_group:
+    scale_clusters:
+    # reuse ECs and model parameters, if this stage has been run before
+    reuse_ecs: True
+    # Sequence separation filter for generation of CouplingScores_longrange.csv table (i.e. to take out short-range
+    # ECs from table, only pairs with abs(i-j)>=min_sequence_distance will be kept.
+    min_sequence_distance: 6
+    # Post-inference scoring of ECs to derive probabilities
+    # Options are: skewnormal, normal, logistic_regression
+    scoring_model: logistic_regression
+# Alternative protocol: compute couplings with mean field direct coupling analysis
+# Uncomment, and comment all values in align section above to enable the mean_field protocol
+    # protocol: mean_field
+    # Options: cn, di, mi_apc, mi
+    # ec_score_type: cn
+    # Post-inference scoring of ECs to derive probabilities - only available if used_score == "cn"!
+    # Options are: skewnormal, normal, logistic_regression
+    # scoring_model: logistic_regression
+    # pseudo_count: 0.5
+    # alphabet:
+    # ignore_gaps: False
+    # reuse_ecs: True
+    # min_sequence_distance: 6
+    # Following input parameters will usually be overriden by "global" and outputs of "align" stage
+    # prefix:
+    # alignment_file:
+    # focus_sequence:
+    # focus_mode: True
+    # segments:
+    # cpu:
+    # theta:
+# Compare ECs to known 3D structures
+compare:
+    # Current options: standard
+    protocol: standard
+    # Following parameters will be usually overriden by global settings / output of previous stage
+    prefix:
+    sequence_id:
+    ec_file:
+    target_sequence_file:
+    # If True, find structures by sequence alignment against the PDB, otherwise identify structures using
+    # sequence_id and SIFTS database (sequence_id must be UniProt AC/ID in this case)
+    by_alignment: True
+    # Leave this parameter empty to use all PDB structures for given sequence_id, otherwise
+    # will be limited to the given IDs (single value or list). Important: note that this acts only as a filter on the
+    # structures found by alignment or in the SIFTS table (!)
+    pdb_ids:
+    # Limit number of structures and chains for comparison
+    max_num_structures: 10
+    max_num_hits: 25
+    # compare to multimer contacts (if multiple chains of the same sequence or its homologs are present in a structure)
+    compare_multimer: True
+    # settings for sequence alignment against PDB sequences using jackhmmer
+    # (additional settings like iterations possible, compare to align stage)
+    sequence_file:
+    first_index:
+    region:
+    alignment_min_overlap: 20
+    use_bitscores: True
+    domain_threshold: 0.1
+    sequence_threshold: 0.1
+    # Comparison and plotting settings
+    # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be
+    # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any
+    # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances,
+    # CB will give C_beta distances, etc.)
+    atom_filter:
+    # Distance cutoff (Angstrom) for a true positive pair
+    distance_cutoff: 5
+    # Only long-range pairs with abs(i-j)>= min_sequence_distance will be used for CouplingScoresCompared_longrange.csv file
+    min_sequence_distance: 6
+    # Plot contact maps with ECs above these mixture model probability cutoffs
+    plot_probability_cutoffs: [0.90, 0.99]
+    # Plot fixed numbers of ECS. Integers will be interpreted as absolute numbers, floats as fractions of L (model length)
+    plot_lowest_count: 0.05
+    plot_highest_count: 1.0
+    plot_increase: 0.05
+    # Axis boundaries of contact map plot depending on range of ECs and structure.
+    # Options: union, intersection, ecs, structure, [start, end] (e.g. [100, 200])
+    boundaries: union
+    # scale sizes of EC dots in scatter plot based on strength of EC score
+    scale_sizes: True
+    # draw secondary structure on contact map plots
+    draw_secondary_structure: True
+    # draw structure and alignment/EC coverage information on contact map plots
+    draw_coverage: True
+    # print information about used PDB structures on contact map plots
+    print_pdb_information: True
+    # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch
+    # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
+    # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). 
+    # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
+    pdb_alignment_method: jackhmmer
+# Settings for Mutation effect predictions
+mutate:
+    # Options: standard
+    protocol: standard
+    # predict the following dataset file (.csv file, mutants like A102V or A102V,K199W in column "mutant")
+    mutation_dataset_file:
+    # Inputs set by global stage and output of previous stages
+    # prefix:
+    # model_file:
+# Settings for 3D structure prediction
+fold:
+    # Options: standard
+    protocol: standard
+    # Options: cns_dgsa
+    engine: cns_dgsa
+    # Config file. If blank, default configuration (restraints.yml) in package will be used
+    folding_config_file:
+    # If True, limit 3D modeling only to that region of sequence that actually has sequence alignment coverage)
+    cut_to_alignment_region: True
+    # Method for secondary structure prediction (options: psipred, requires PSIPRED installation). Will be used
+    # to generate distance and dihedral angle restraints for local geometry.
+    sec_struct_method: psipred
+    # If secondary structure was already predicted in previous execution of configuration, reuse the file
+    reuse_sec_struct: True
+    # Instead of predicting secondary structure in pipeline, can specify a secondary structure file:
+    # Must be csv file with columns i (position), A_i (residue) and sec_struct_3state (secondary structure, H, E or C
+    # for helix, sheet or coil)
+    sec_struct_file:
+    # Do not use EC pairs as distance restraints that are geometrically incompatible with predicted secondary structure
+    filter_sec_struct_clashes: True
+    # Only use ECs with sequence distance abs(i-j) >= min_sequence_distance for folding
+    min_sequence_distance: 6
+    # Predict structures using all ECs above these probability cutoffs according to mixture model
+    fold_probability_cutoffs: [0.90, 0.99]
+    # Predict structures with selected number of ECs.
+    # Integers will be interpreted as absolute number of ECs, floats as a fraction of L (model length)
+    fold_lowest_count: 0.5
+    fold_highest_count: 1.3
+    fold_increase: 0.05
+    # number of trial structure to generate for each EC set
+    num_models: 10
+    # remove intermediate files created during folding and keep only final models
+    cleanup: True
+    # Inputs defined by "global" or previous stages
+    # prefix:
+    # ec_file:
+    # target_sequence_file:
+    # segments:
+    # remapped_pdb_files:
+    # cpu:
+# These settings allow job status tracking using a database, and result collection in an archive
+management:
+    # URI of database
+    database_uri:
+    # unique job identifier
+    job_name:
+    # add the following output files to results archive
+    archive: [target_sequence_file, statistics_file, alignment_file, frequencies_file, ec_file, ec_longrange_file,
+              model_file, enrichment_file, evzoom_file, enrichment_pml_files, ec_lines_pml_file, contact_map_files,
+              ec_compared_all_file, ec_compared_longrange_file, remapped_pdb_files, mutations_epistatic_pml_files,
+              mutation_matrix_file, mutation_matrix_plot_files, secondary_structure_pml_file, folding_ec_file,
+              folded_structure_files, folding_ranking_file, folding_comparison_file, folding_individual_comparison_files,
+              ec_lines_compared_pml_file, pdb_structure_hits_file, sec_struct_file]
+    # Delete the following output files after running the job if you don't need them, to save disk space.
+    # Note that this may jeopardize your ability to rerun parts of the job if intermediate files are missing.
+    # The following, deactivated default deletes the biggest output files.
+    # delete: [raw_alignment_file, model_file]
+# Computational environment for batch jobs (using evcouplings command line application)
+environment:
+    # current options for engine: lsf, local, slurm (for local, only set cores and leave all other fields blank)
+    # If your batch engine of choice (e.g. SGE, Torque) is not available yet, please consider contributing by
+    # implementing it and submitting a pull request!
+    # Note that "cores" will override the "cpu" parameter for "global"
+    engine: 
+    queue: 
+    cores: 6
+    memory: 
+    time: 
+    # Special setting for "local" engine to define number of workers running in parallel
+    # (note that "cores" has to be defined above to make sure each job only uses a defined
+    # number of cores). If not defined or None, will default to number of cores / cores per job;
+    # otherwise specify integer to limit number of workers (1 for serial execution of subjobs)
+    # parallel_workers: 1
+    # command that will be executed before running actual computation (can be used to set up environment)
+    configuration:
+# Paths to databases used by evcouplings.
+databases:
+    # Sequence databases (only download the ones you want to use). You can also specify arbitrary databases in FASTA format
+    # using a database name of your choice here)
+    uniprot: /n/groups/marks/databases/jackhmmer/uniprot/uniprot_current.o2.fasta
+    uniref100: /home/tekpinar/databases/uniref100.fasta
+    uniref90: /home/tekpinar/databases/uniref90.fasta
+    # URL do download sequences if sequence_file is not given. {} will be replaced by sequence_id.
+    sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta
+    # Directory with PDB MMTF structures (leave blank to fetch structures from web)
+    pdb_mmtf_dir:
+    # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be
+    # automatically generated and saved at the given file path (this may take a while).
+    # Periodically delete these files to more recent versions of SIFTS are used.
+    sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv
+    sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta
+# Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
+tools:
+    jackhmmer: /usr/bin/jackhmmer
+    plmc: /usr/local/bin/plmc
+    hmmbuild: /usr/bin/hmmbuild
+    hmmsearch: /usr/bin/hmmsearch
+    hhfilter: /home/tekpinar/research/lcqb/hhsuite/bin/hhfilter
+    psipred: /n/groups/marks/software/runpsipred_o2
+    cns: /n/groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns
+    maxcluster: /n/groups/marks/pipelines/evcouplings/software/maxcluster64bit