Commit 530735fc by Edoardo SARTI

initial commit

parent 197db284
# PV_build files
Put here the fasta files for the generation of ProfileView libraries
# PV_libraries
Put here the libraries created with profileview-build
# PV_trees
Put here all the files produced with profileview-tree
import sys
from ete3 import Tree
seqs_fn = sys.argv[1]
oxlin_fn = sys.argv[2]
nhx_fn = sys.argv[3]
ox2class = {}
with open(oxlin_fn) as f:
for line in f:
fields = line.split(",")
ox = fields[0]
if "Rhodophyta" in fields:
ox2class[ox] = "Rhodophyta"
elif "Chlorophyta" in fields:
ox2class[ox] = "Seedless plants"
elif "Liliopsida" in fields:
ox2class[ox] = "Liliopsida"
elif "rosids" in fields:
ox2class[ox] = "Rosids"
else:
ox2class[ox] = "Other seed plants"
id2ox = {}
id2class = {}
with open(seqs_fn) as f:
for line in f:
if line.startswith(">"):
fields = line.split()
for field in fields:
if field[:3] == "OX=":
ox = field[3:]
break
ffields = line.split("|")
id2ox[ffields[1]] = ox
id2class[ffields[1]] = ox2class[ox]
class2color = {
"Rhodophyta" : "#9fc5e8",
"Seedless plants" : "#b4a7d6",
"Liliopsida" : "#d5a6bd",
"Rosids" : "#f6b26b",
"Other seed plants" : "#ffd966"
}
t = Tree(nhx_fn)
print("DATASET_COLORSTRIP\nSEPARATOR COMMA\nDATASET_LABEL,strip_color\nCOLOR,#ffffff\nLEGEND_TITLE,Dataset_strip\nLEGEND_SHAPES,1,1,1,1,1\nLEGEND_COLORS,#cfe2f3,#d9d2e9,#d5a6bd,#f9cb9c,#ffe390\nLEGEND_LABELS,Rhodophyta,Seedless plants,Liliopsida,Rosids,Other seed plants\nDATA")
for l in [x for x in t]:
n = l.name.split()[0]
up = l.name.split("|")[1]
print(f"{n},{class2color[id2class[up]]}")
import sys
from ete3 import Tree
tree_fn = sys.argv[1]
t = Tree(tree_fn)
nonCBBC = set()
CBBC = set()
with open(sys.argv[2]) as f:
for line in f:
fields = line.split()
if len(fields) < 4:
continue
if "rgba(255, 0" in line:
nonCBBC.add(fields[0])
elif "rgba(173, 255" in line or "rgba(0, 255, 0" in line:
CBBC.add(fields[0])
else:
continue
#print(CBBC, nonCBBC)
nonCBBC_lines = set()
CBBC_lines = set()
for n in CBBC:
try:
node = t&n
except:
continue
CBBC_lines.add(node)
while node != t:
node = node.up
CBBC_lines.add(node)
for n in nonCBBC:
try:
node = t&n
except:
continue
nonCBBC_lines.add(node)
while node != t:
node = node.up
nonCBBC_lines.add(node)
CBBC_treeroots = set()
nonCBBC_treeroots = set()
for n in CBBC:
try:
node = t&n
except:
continue
while node != t:
newnode = node.up
if newnode in nonCBBC_lines:
CBBC_treeroots.add(node)
break
node = newnode
for n in nonCBBC:
try:
node = t&n
except:
continue
while node != t:
newnode = node.up
if newnode in CBBC_lines:
nonCBBC_treeroots.add(node)
break
node = newnode
#print("CBBC roots")
for x in CBBC_treeroots:
print(f"{x.name}\trange\t#d9ead3\tpredicted CBBC")
#print("nonCBBC roots")
for x in nonCBBC_treeroots:
print(f"{x.name}\trange\t#f4cccc\tpredicted non-CBBC")
for CBP in `cat CB_proteins.txt`
do
grep ${CBP} PF_domains > ${CBP}_domains.txt
if [[ "${CBP}" == "GAPDH" ]]
then
for x in "GAPDH_PF00044" "GAPDH_PF02800"
do
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus2.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all predicted paralogs having functional annotations
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done
else
for x in `cat ${CBP}_domains.txt`
do
# Count all paralogs in UniProtKB sequences
#grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep "CHLRE\|VOLCA\|ARATH" f1.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all predicted paralogs having functional annotations
# grep -A1 -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' > f0.txt
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done
if [[ "${CBP}" == "PGK" ]]
then
x="PGK_PGAbinding"
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/PGK_PGAbinding.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
fi
if [[ "${CBP}" == "TRK" ]]
then
x="TRK_multi"
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/TRK_multi.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
fi
if [[ "${CBP}" == "RPI" ]]
then
for x in "RPI_isomerase" "RPI_dimerase" "RPI_isomerase_dimerase"
do
grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | sort | uniq > f0.txt
P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
# Count all paralogs predicted by Profileview
grep ${CBP} PF_domains > ${CBP}_domains.txt
python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/${x}.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH" f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
done
fi
fi
done
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment