initial commit

530735fc · Edoardo SARTI · 197db284 · 530735fc · 530735fc · 530735fc
Commit 530735fc authored Dec 01, 2024 by Edoardo SARTI
6 changed files
--- a/PV_build/README.md
+++ b/PV_build/README.md
+# PV_build files
+Put here the fasta files for the generation of ProfileView libraries
--- a/PV_libraries/README.md
+++ b/PV_libraries/README.md
+# PV_libraries
+Put here the libraries created with profileview-build
--- a/PV_trees/README.md
+++ b/PV_trees/README.md
+# PV_trees
+Put here all the files produced with profileview-tree
--- a/scripts/classify.py
+++ b/scripts/classify.py
+import sys
+from ete3 import Tree
+
+seqs_fn = sys.argv[1]
+oxlin_fn = sys.argv[2]
+nhx_fn = sys.argv[3]
+
+ox2class = {}
+with open(oxlin_fn) as f:
+    for line in f:
+        fields = line.split(",")
+        ox = fields[0]
+        if "Rhodophyta" in fields:
+            ox2class[ox] = "Rhodophyta"
+        elif "Chlorophyta" in fields:
+            ox2class[ox] = "Seedless plants"
+        elif "Liliopsida" in fields:
+            ox2class[ox] = "Liliopsida"
+        elif "rosids" in fields:
+            ox2class[ox] = "Rosids"
+        else:
+            ox2class[ox] = "Other seed plants"
+
+id2ox = {}
+id2class = {}
+with open(seqs_fn) as f:
+    for line in f:
+        if line.startswith(">"):
+            fields = line.split()
+            for field in fields:
+                if field[:3] == "OX=":
+                    ox = field[3:]
+                    break
+            ffields = line.split("|")
+            id2ox[ffields[1]] = ox
+            id2class[ffields[1]] = ox2class[ox]
+
+class2color = {
+        "Rhodophyta" : "#9fc5e8",
+        "Seedless plants" : "#b4a7d6",
+        "Liliopsida" : "#d5a6bd",
+        "Rosids" : "#f6b26b",
+        "Other seed plants" : "#ffd966"
+        }
+
+t = Tree(nhx_fn)
+print("DATASET_COLORSTRIP\nSEPARATOR COMMA\nDATASET_LABEL,strip_color\nCOLOR,#ffffff\nLEGEND_TITLE,Dataset_strip\nLEGEND_SHAPES,1,1,1,1,1\nLEGEND_COLORS,#cfe2f3,#d9d2e9,#d5a6bd,#f9cb9c,#ffe390\nLEGEND_LABELS,Rhodophyta,Seedless plants,Liliopsida,Rosids,Other seed plants\nDATA")
+for l in [x for x in t]:
+    n = l.name.split()[0]
+    up = l.name.split("|")[1]
+    
+    print(f"{n},{class2color[id2class[up]]}")
+
--- a/scripts/define_CBBC_subtrees.py
+++ b/scripts/define_CBBC_subtrees.py
+import sys
+from ete3 import Tree
+
+tree_fn = sys.argv[1]
+t = Tree(tree_fn)
+
+nonCBBC = set()
+CBBC = set()
+with open(sys.argv[2]) as f:
+    for line in f:
+        fields = line.split()
+        if len(fields) < 4:
+            continue
+        if "rgba(255, 0" in line:
+            nonCBBC.add(fields[0])
+        elif "rgba(173, 255" in line or "rgba(0, 255, 0" in line:
+            CBBC.add(fields[0])
+        else:
+            continue
+
+#print(CBBC, nonCBBC)
+
+nonCBBC_lines = set()
+CBBC_lines = set()
+for n in CBBC:
+    try:
+        node = t&n
+    except:
+        continue
+    CBBC_lines.add(node)
+    while node != t:
+        node = node.up
+        CBBC_lines.add(node)
+for n in nonCBBC:
+    try:
+        node = t&n
+    except:
+        continue
+    nonCBBC_lines.add(node)
+    while node != t:
+        node = node.up
+        nonCBBC_lines.add(node)
+
+CBBC_treeroots = set()
+nonCBBC_treeroots = set()
+for n in CBBC:
+    try:
+        node = t&n
+    except:
+        continue
+    while node != t:
+        newnode = node.up
+        if newnode in nonCBBC_lines:
+            CBBC_treeroots.add(node)
+            break
+        node = newnode
+for n in nonCBBC:
+    try:
+        node = t&n
+    except:
+        continue
+    while node != t:
+        newnode = node.up
+        if newnode in CBBC_lines:
+            nonCBBC_treeroots.add(node)
+            break
+        node = newnode
+
+#print("CBBC roots")
+for x in CBBC_treeroots:
+    print(f"{x.name}\trange\t#d9ead3\tpredicted CBBC")
+#print("nonCBBC roots")
+for x in nonCBBC_treeroots:
+    print(f"{x.name}\trange\t#f4cccc\tpredicted non-CBBC")
--- a/scripts/table_S1.sh
+++ b/scripts/table_S1.sh
+for CBP in `cat CB_proteins.txt`
+do 	
+	grep ${CBP} PF_domains > ${CBP}_domains.txt
+	if [[ "${CBP}" == "GAPDH" ]]
+	then
+		for x in "GAPDH_PF00044" "GAPDH_PF02800"
+		do
+			grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa  | awk 'length($0)>2' | sort | uniq > f0.txt
+	                P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+        	        P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+                	P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+
+
+	                # Count all paralogs predicted by Profileview
+        	        grep ${CBP} PF_domains > ${CBP}_domains.txt
+	                python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus2.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus2_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+
+                	PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+
+        	        # Count all predicted paralogs having functional annotations
+	                grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+	                F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+
+	                echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
+
+		done
+	else
+	        for x in `cat ${CBP}_domains.txt`
+		do
+			# Count all paralogs in UniProtKB sequences
+			#grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa  | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
+			grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa  | awk 'length($0)>2' | sort | uniq > f0.txt
+			P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+			P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+			P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+		
+			# Count all paralogs predicted by Profileview
+			grep ${CBP} PF_domains > ${CBP}_domains.txt
+			#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' | sort | uniq > f0.txt
+			python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+			#python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_monodomain/${x}/${x}_plus.nhx root > f1.txt; grep "CHLRE\|VOLCA\|ARATH"  f1.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+	
+			PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+			# Count all predicted paralogs having functional annotations
+			# grep -A1 -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa | awk 'length($0)>2' | awk 'substr($0,1,1)==">"{h=$0} substr($0,1,1)!=">"{a[$1]=h} END{for (i in a) {print a[i]}}' > f0.txt
+			grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+			F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+			F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+			echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
+		done
+	
+		if [[ "${CBP}" == "PGK" ]]
+		then
+			x="PGK_PGAbinding"
+			grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa  | awk 'length($0)>2' | sort | uniq > f0.txt
+	                P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	
+	                # Count all paralogs predicted by Profileview
+	                grep ${CBP} PF_domains > ${CBP}_domains.txt
+	                python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/PGK_PGAbinding.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+			                PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+	                F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	                echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
+		fi
+	
+		if [[ "${CBP}" == "TRK" ]]
+	        then
+	                x="TRK_multi"
+	                grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa  | awk 'length($0)>2' | sort | uniq > f0.txt
+	                P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	
+	                # Count all paralogs predicted by Profileview
+	                grep ${CBP} PF_domains > ${CBP}_domains.txt
+	                python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/TRK_multi.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+	                                PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+	                F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	                F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	                echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
+	        fi
+	
+	        if [[ "${CBP}" == "RPI" ]]
+	        then
+			for x in "RPI_isomerase" "RPI_dimerase" "RPI_isomerase_dimerase"
+			do
+		                grep -A1 "CHLRE\|VOLCA\|ARATH" /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa  | awk 'length($0)>2' | sort | uniq > f0.txt
+		                P_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+		                P_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+		                P_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+	
+		                # Count all paralogs predicted by Profileview
+		                grep ${CBP} PF_domains > ${CBP}_domains.txt
+		                python3 get_all_up_subtree.py /workspaces/2024_PV_on_CB/data/PV_trees/from_mixed/${x}.nhx root > f1.txt; grep -A1 -f f1.txt /workspaces/2024_PV_on_CB/data/CB_sequences/${CBP}/${CBP}_plus_exp.fa > f2.txt; grep -A1 "CHLRE\|VOLCA\|ARATH"  f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+		                PR_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+		                PR_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+		                PR_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+				grep -f /workspaces/2024_PV_on_CB/data/experimental_evidence/${CBP}_class_all_primaryacc_names.txt f2.txt | awk 'length($0)>2' | sort | uniq > f0.txt
+		                F_CHLRE=`grep "CHLRE" f0.txt | wc -l | awk '{print $1}'`
+	       		        F_VOLCA=`grep "VOLCA" f0.txt | wc -l | awk '{print $1}'`
+	                	F_ARATH=`grep "ARATH" f0.txt | wc -l | awk '{print $1}'`
+	
+		                echo "${x} | ${P_CHLRE} ${PR_CHLRE} ${F_CHLRE} | ${P_VOLCA} ${PR_VOLCA} ${F_VOLCA} | ${P_ARATH} ${PR_ARATH} ${F_ARATH}"
+			done
+	        fi
+	fi
+done
+
+