Commit 1f09d480 by Edoardo SARTI

table1 script updated

parent 530735fc
FBA
GAPDH
PGK
PRK
RPE
RPI
RubiscoLS
RubiscoSS
TPI
TRK
FBPase
SBPase
FBA
GAPDH
PGK
PRK
RPE
RPI
TPI
TRK
FBPase
SBPase
FBPase_PF00316
FBPase_PF18913
GAPDH_PF00044
GAPDH_PF02800
RubiscoLS_PF00016
RubiscoLS_PF02788
SBPase_PF00316
SBPase_PF18913
TRK_PF00456
TRK_PF02779
TRK_PF02780
SBPase_PF00316
SBPase_PF18913
import sys
import ete3
tree_fn = sys.argv[1]
node_name = sys.argv[2]
t = ete3.Tree(tree_fn)
if node_name == "root":
subtree_root = t
else:
subtree_root = t.search_nodes(name=node_name)[0]
for l in [x for x in subtree_root]:
n = l.name.split()[0]
up = l.name.split("|")[1]
print(n)
for CBP in `cat CB_proteins.txt` INPUT_DIR="PV_trees/input_sequences_for_trees"
TREES_DIR="PV_trees/PV_annotated_trees"
SCRIPTS_DIR="scripts"
ANNEXES_DIR="scripts/annexes"
TMP_DIR="tmp/"
GROUND_TRUTH_DIR="PV_trees/PV_annotated_trees/ground_truth"
for CBP in `cat ${ANNEXES_DIR}/CB_proteins_an.txt`
do do
grep ${CBP} PF_domains > ${CBP}_domains.txt cat ${ANNEXES_DIR}/${CBP}_domains.txt > ${TMP_DIR}/x.tmp.txt
if [[ "${CBP}" == "GAPDH" ]] if [[ "${CBP}" == "PGK" ]]
then
echo "PGK_PGAbinding" >> ${TMP_DIR}/x.tmp.txt
elif [[ "${CBP}" == "TRK" ]]
then then
for x in "GAPDH_PF00044" "GAPDH_PF02800" echo "TRK_multi" >> ${TMP_DIR}/x.tmp.txt
do elif [[ "${CBP}" == "RPI" ]]
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt then
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'` echo "RPI_isomerase" >> ${TMP_DIR}/x.tmp.txt
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'` echo "RPI_dimerase" >> ${TMP_DIR}/x.tmp.txt
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'` echo "RPI_isomerase_dimerase" >> ${TMP_DIR}/x.tmp.txt
fi
for x in `cat ${TMP_DIR}/x.tmp.txt`
do
grep -A1 "CHLRE\|VOLCA\|ARATH" ${INPUT_DIR}/${CBP}/${CBP}.fa | awk 'length($0)>2' | sort | uniq > ${TMP_DIR}/f0.txt
P_CHLRE=`grep "CHLRE" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview # Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt python3 ${SCRIPTS_DIR}/get_all_up_subtree.py ${TREES_DIR}/${x}/${x}.nhx root > ${TMP_DIR}/f1.txt; grep -A1 -f ${TMP_DIR}/f1.txt ${INPUT_DIR}/${CBP}/${CBP}.fa > ${TMP_DIR}/f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" ${TMP_DIR}/f2.txt | awk 'length($0)>2{print $1}' | sort | uniq > ${TMP_DIR}/f0.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus2.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'` PR_CHLRE=`grep "CHLRE" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'` PR_VOLCA=`grep "VOLCA" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'` PR_ARATH=`grep "ARATH" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
# Count all predicted paralogs having functional annotations # Count all predicted paralogs having functional annotations
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt grep -f ${GROUND_TRUTH_DIR}/${CBP}_class_all_primaryacc_names.txt ${TMP_DIR}/f2.txt | awk 'length($0)>2{print $1}' | sort | uniq > ${TMP_DIR}/f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'` F_CHLRE=`grep "CHLRE" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'` F_VOLCA=`grep "VOLCA" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'` F_ARATH=`grep "ARATH" ${TMP_DIR}/f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}" echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done done
else
for x in `cat ${CBP}_domains.txt`
do
# Count all paralogs in UniProtKB sequences
#grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep "CHLRE\|VOLCA\|ARATH" f1.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all predicted paralogs having functional annotations
# grep -A1 -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' > f0.txt
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done
if [[ "${CBP}" == "PGK" ]]
then
x="PGK_PGAbinding"
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/PGK_PGAbinding.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
fi
if [[ "${CBP}" == "TRK" ]]
then
x="TRK_multi"
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/TRK_multi.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
fi
if [[ "${CBP}" == "RPI" ]]
then
for x in "RPI_isomerase" "RPI_dimerase" "RPI_isomerase_dimerase"
do
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/${x}.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done
fi
fi
done done
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment