Commit f769587a by LeliaP

Initial commit

parents
File added
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
\ No newline at end of file
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Correction UPGMA / Neighbor Joining\n",
"- Ultrametric / additive\n",
"- UPGMA\n",
"- Neighbor Joining"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from Bio import Phylo\n",
"from io import StringIO\n",
"from scipy.cluster import hierarchy\n",
"from IPython.display import display"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\"]\n",
"\n",
"# Lower-triangular distance matrix: the value at (row, column) is the distance\n",
"# between the two taxa when row > column; the diagonal and upper triangle are 0.\n",
"distance_matrix = np.array([[0,0,0,0,0,0,0],\n",
"                            [19,0,0,0,0,0,0],\n",
"                            [27,31,0,0,0,0,0],\n",
"                            [8,18,26,0,0,0,0],\n",
"                            [33,36,41,31,0,0,0],\n",
"                            [18,1,32,17,35,0,0],\n",
"                            [13,13,29,14,28,12,0]])\n",
"\n",
"# Build the labelled DataFrame first: the loop below needs `dm` to exist.\n",
"# Float dtype so that the +inf sentinel fits without changing the column dtype.\n",
"dm = pd.DataFrame(distance_matrix, columns=columns, index=columns, dtype=float)\n",
"\n",
"# Replace the unused entries (diagonal and upper triangle, stored as 0) with +inf\n",
"# since only the lower triangle of the matrix is used\n",
"for col in dm.columns:\n",
"    dm.loc[dm[col] == 0, col] = float(\"+inf\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entry of the distance matrix for taxon A against itself (row A, column A)\n",
"dm.loc[\"A\", \"A\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ultrametricity"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def is_ultrametric(matrix):\n",
"    \"\"\"Tell whether a distance matrix satisfies the ultrametric (three-point) condition.\n",
"\n",
"    The condition is d(i, j) <= max(d(i, k), d(k, j)) for every triple of\n",
"    distinct taxa (i, j, k).\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    matrix : pandas.DataFrame\n",
"        Square distance matrix with the same labels on rows and columns.\n",
"        It can be symmetric or triangular; in a triangular matrix the unused\n",
"        half may hold 0 or +inf.\n",
"\n",
"    Returns\n",
"    -------\n",
"    bool\n",
"        False as soon as one triple violates the condition, True otherwise.\n",
"    \"\"\"\n",
"    infinity = float(\"inf\")\n",
"\n",
"    def distance(a, b):\n",
"        # Look at both halves of the matrix: +inf marks an unused entry and is\n",
"        # dropped, and a 0 placeholder loses against the real distance in max().\n",
"        known = [value for value in (matrix.loc[a, b], matrix.loc[b, a]) if value != infinity]\n",
"        return max(known) if known else infinity\n",
"\n",
"    labels = list(matrix.columns)\n",
"    for i in labels:\n",
"        for j in labels:\n",
"            if i == j:\n",
"                continue\n",
"            distance_ij = distance(i, j)\n",
"            for k in labels:\n",
"                if k == i or k == j:\n",
"                    continue\n",
"                if distance_ij > max(distance(i, k), distance(k, j)):\n",
"                    return False\n",
"    return True"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Better version with itertools\n",
"def is_ultrametric2(matrix):\n",
"    \"\"\"Tell whether a distance matrix is ultrametric, visiting each triple of taxa once.\n",
"\n",
"    For three taxa the condition d(x, y) <= max(d(x, z), d(z, y)) holds for\n",
"    every ordering exactly when the two largest of the three pairwise\n",
"    distances are equal, so one test per unordered triple is enough.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    matrix : pandas.DataFrame\n",
"        Square distance matrix with the same labels on rows and columns.\n",
"        It can be symmetric or triangular; in a triangular matrix the unused\n",
"        half may hold 0 or +inf.\n",
"\n",
"    Returns\n",
"    -------\n",
"    bool\n",
"        False as soon as one triple violates the condition, True otherwise.\n",
"    \"\"\"\n",
"    infinity = float(\"inf\")\n",
"\n",
"    def distance(a, b):\n",
"        # Look at both halves of the matrix: +inf marks an unused entry and is\n",
"        # dropped, and a 0 placeholder loses against the real distance in max().\n",
"        known = [value for value in (matrix.loc[a, b], matrix.loc[b, a]) if value != infinity]\n",
"        return max(known) if known else infinity\n",
"\n",
"    # get column names\n",
"    labels = list(matrix.columns)\n",
"    for a, b, c in itertools.combinations(labels, 3):\n",
"        _, middle, largest = sorted((distance(a, b), distance(a, c), distance(b, c)))\n",
"        if largest > middle:\n",
"            return False\n",
"    return True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Additivity"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def is_additive(matrix):\n",
"    \"\"\"Tell whether a distance matrix is additive (four-point condition).\n",
"\n",
"    The condition is, for every four distinct taxa (i, j, k, l):\n",
"    d(i, j) + d(k, l) <= max(d(i, k) + d(j, l), d(i, l) + d(j, k)).\n",
"    Sums are compared exactly, without any floating-point tolerance.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    matrix : pandas.DataFrame\n",
"        Square distance matrix with the same labels on rows and columns.\n",
"        It can be symmetric or triangular; in a triangular matrix the unused\n",
"        half may hold 0 or +inf.\n",
"\n",
"    Returns\n",
"    -------\n",
"    bool\n",
"        False as soon as one quadruple violates the condition, True otherwise\n",
"        (including when there are fewer than four taxa).\n",
"    \"\"\"\n",
"    infinity = float(\"inf\")\n",
"\n",
"    def distance(a, b):\n",
"        # Look at both halves of the matrix: +inf marks an unused entry and is\n",
"        # dropped, and a 0 placeholder loses against the real distance in max().\n",
"        known = [value for value in (matrix.loc[a, b], matrix.loc[b, a]) if value != infinity]\n",
"        return max(known) if known else infinity\n",
"\n",
"    labels = list(matrix.columns)\n",
"    for i in labels:\n",
"        for j in labels:\n",
"            if i == j:\n",
"                continue\n",
"            for k in labels:\n",
"                if k in (i, j):\n",
"                    continue\n",
"                for l in labels:\n",
"                    if l in (i, j, k):\n",
"                        continue\n",
"                    pairing_ij_kl = distance(i, j) + distance(k, l)\n",
"                    pairing_ik_jl = distance(i, k) + distance(j, l)\n",
"                    pairing_il_jk = distance(i, l) + distance(j, k)\n",
"                    if pairing_ij_kl > max(pairing_ik_jl, pairing_il_jk):\n",
"                        return False\n",
"    return True"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def is_additive2(matrix):\n",
"    \"\"\"Tell whether a distance matrix is additive, visiting each quadruple of taxa once.\n",
"\n",
"    Four taxa can be split into two pairs in three ways. The four-point\n",
"    condition holds for the quadruple exactly when the two largest of the\n",
"    three corresponding sums of distances are equal.\n",
"    Sums are compared exactly, without any floating-point tolerance.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    matrix : pandas.DataFrame\n",
"        Square distance matrix with the same labels on rows and columns.\n",
"        It can be symmetric or triangular; in a triangular matrix the unused\n",
"        half may hold 0 or +inf.\n",
"\n",
"    Returns\n",
"    -------\n",
"    bool\n",
"        False as soon as one quadruple violates the condition, True otherwise\n",
"        (including when there are fewer than four taxa).\n",
"    \"\"\"\n",
"    infinity = float(\"inf\")\n",
"\n",
"    def distance(a, b):\n",
"        # Look at both halves of the matrix: +inf marks an unused entry and is\n",
"        # dropped, and a 0 placeholder loses against the real distance in max().\n",
"        known = [value for value in (matrix.loc[a, b], matrix.loc[b, a]) if value != infinity]\n",
"        return max(known) if known else infinity\n",
"\n",
"    # Better version with itertools\n",
"    # get column names\n",
"    labels = list(matrix.columns)\n",
"    for a, b, c, d in itertools.combinations(labels, 4):\n",
"        sums = sorted((\n",
"            distance(a, b) + distance(c, d),\n",
"            distance(a, c) + distance(b, d),\n",
"            distance(a, d) + distance(b, c),\n",
"        ))\n",
"        if sums[2] > sums[1]:\n",
"            return False\n",
"    return True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tree class definition"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class Tree(object):\n",
"    \"\"\"Node of a rooted tree; a whole tree is referred to by its root node.\n",
"\n",
"    Attributes are plain fields: `childrens` (list of child nodes, in insertion\n",
"    order), `parent` (parent node, None while the node has no parent), `name`\n",
"    (label of the node) and `height` (numeric value attached to the node).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, name=\"\", height=0):\n",
"        self.childrens = []\n",
"        self.parent = None\n",
"        self.name = name\n",
"        self.height = height\n",
"\n",
"    def add_children(self, children):\n",
"        \"\"\"Append one node to the list of children (the parent link is not touched).\"\"\"\n",
"        self.childrens.append(children)\n",
"\n",
"    def set_parent(self, parent):\n",
"        \"\"\"Attach this node under `parent`, updating both sides of the link.\"\"\"\n",
"        parent.add_children(self)\n",
"        self.parent = parent\n",
"\n",
"    def get_root(self):\n",
"        \"\"\"Return the topmost ancestor of this node (the node itself if it has no parent).\"\"\"\n",
"        node = self\n",
"        # Walk up the parent links until a node without parent is reached\n",
"        while node.parent is not None:\n",
"            node = node.parent\n",
"        return node\n",
"\n",
"    def __str__(self):\n",
"        return self.name + \" : \" + str(self.height)\n",
"\n",
"    def __repr__(self):\n",
"        return self.name + \" : \" + str(self.height)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tree to Newick format\n",
"\n",
"Recursive function that takes the root of the tree as parameter and returns the tree as a string in Newick format."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tree_to_newick(root):\n",
"    \"\"\"Serialise the tree below `root` as a Newick string (without the final ';').\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    root : node exposing `name`, `height` and `childrens`\n",
"        Root of the (sub)tree to write. The `height` of each child is written\n",
"        as the length of the branch leading to it; the `height` of `root`\n",
"        itself is not written.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The bare name for a leaf, otherwise \"(child:length,child:length,...)\"\n",
"        with one entry per child, whatever the number of children.\n",
"    \"\"\"\n",
"    # A leaf is written as its name only\n",
"    if not root.childrens:\n",
"        return root.name\n",
"\n",
"    # One \"subtree:branch_length\" entry per child, in child order\n",
"    entries = [\"{}:{}\".format(tree_to_newick(child), child.height) for child in root.childrens]\n",
"\n",
"    return \"(\" + \",\".join(entries) + \")\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# UPGMA\n",
"\n",
"## Distance matrix"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>19.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>27.000000</td>\n",
" <td>31.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>8.000000</td>\n",
" <td>18.000000</td>\n",
" <td>26.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>33.000000</td>\n",
" <td>36.000000</td>\n",
" <td>41.000000</td>\n",
" <td>31.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>18.000000</td>\n",
" <td>1.000000</td>\n",
" <td>32.000000</td>\n",
" <td>17.000000</td>\n",
" <td>35.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>G</th>\n",
" <td>13.000000</td>\n",
" <td>13.000000</td>\n",
" <td>29.000000</td>\n",
" <td>14.000000</td>\n",
" <td>28.000000</td>\n",
" <td>12.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D E F G\n",
"A inf inf inf inf inf inf inf\n",
"B 19.000000 inf inf inf inf inf inf\n",
"C 27.000000 31.000000 inf inf inf inf inf\n",
"D 8.000000 18.000000 26.000000 inf inf inf inf\n",
"E 33.000000 36.000000 41.000000 31.000000 inf inf inf\n",
"F 18.000000 1.000000 32.000000 17.000000 35.000000 inf inf\n",
"G 13.000000 13.000000 29.000000 14.000000 28.000000 12.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"columns = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\"]\n",
"\n",
"# Lower-triangular distance matrix: the value at (row, column) is the distance\n",
"# between the two taxa when row > column; the diagonal and upper triangle are 0.\n",
"distance_matrix = np.array([[0,0,0,0,0,0,0],\n",
"                            [19,0,0,0,0,0,0],\n",
"                            [27,31,0,0,0,0,0],\n",
"                            [8,18,26,0,0,0,0],\n",
"                            [33,36,41,31,0,0,0],\n",
"                            [18,1,32,17,35,0,0],\n",
"                            [13,13,29,14,28,12,0]])\n",
"\n",
"# Float dtype so that the +inf sentinel fits without changing the column dtype\n",
"dm = pd.DataFrame(distance_matrix, columns=columns, index=columns, dtype=float)\n",
"\n",
"# Replace the unused entries (diagonal and upper triangle, stored as 0) with +inf\n",
"# since only the lower triangle of the matrix is used. This is done before the\n",
"# checks so that they see the real distances and never the 0 placeholders.\n",
"for col in dm.columns:\n",
"    dm.loc[dm[col] == 0, col] = float(\"+inf\")\n",
"\n",
"# Check if dm is ultrametric\n",
"print(is_ultrametric(dm))\n",
"\n",
"# Check if dm is additive\n",
"print(is_additive(dm))\n",
"\n",
"display(dm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## UPGMA functions"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def compute_distance_upgma(dm, cluster_1, cluster_2, new):\n",
"    \"\"\"Distance between the cluster obtained by merging a pair and another cluster.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    dm : pandas.DataFrame\n",
"        Distance matrix before the merge; for each pair the smaller of the two\n",
"        mirrored entries is used.\n",
"    cluster_1 : tuple\n",
"        The two labels being merged.\n",
"    cluster_2 : str\n",
"        Label of the cluster the distance is computed to.\n",
"    new : str\n",
"        Label of the merged cluster (not used by the computation).\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        +inf when `cluster_2` is the merged cluster itself, otherwise the\n",
"        average of the two distances weighted by the length of each label.\n",
"    \"\"\"\n",
"    # Distance of the merged cluster to itself: reported as +inf\n",
"    merged_characters = list(\"\".join(cluster_1))\n",
"    if merged_characters == list(cluster_2):\n",
"        return float(\"+inf\")\n",
"\n",
"    first, second = cluster_1[0], cluster_1[1]\n",
"\n",
"    # The weight of each side is the number of characters of its label\n",
"    weight_first = len(first)\n",
"    weight_second = len(second)\n",
"\n",
"    to_first = min(dm.loc[first, cluster_2], dm.loc[cluster_2, first])\n",
"    to_second = min(dm.loc[second, cluster_2], dm.loc[cluster_2, second])\n",
"\n",
"    weighted_sum = (weight_first * to_first) + (weight_second * to_second)\n",
"    return weighted_sum / (weight_first + weight_second)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def normalize_tree(root):\n",
"    \"\"\"Turn the values stored in `height` into branch lengths, in place.\n",
"\n",
"    For every node that has children, the largest `height` found among its\n",
"    children is subtracted from its own `height`. A node is updated before its\n",
"    children are visited, so each subtraction uses the children's values as\n",
"    they were before the call. Nodes without children are left untouched.\n",
"\n",
"    Returns the same `root` object.\n",
"    \"\"\"\n",
"    if root.childrens:\n",
"        root.height = root.height - max(child.height for child in root.childrens)\n",
"        for child in root.childrens:\n",
"            normalize_tree(child)\n",
"\n",
"    return root\n",
"\n",
"\n",
"def upgma(dm):\n",
"    \"\"\"Cluster the taxa of a distance matrix with UPGMA.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    dm : pandas.DataFrame\n",
"        Square distance matrix with the same labels on rows and columns and at\n",
"        least two taxa. Distances are read from the lower triangle; the other\n",
"        entries are expected to be +inf. The caller's DataFrame is not modified.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        `(matrices, root)`: the list of distance matrices, one per step (the\n",
"        input first, the two-cluster matrix last), and the root `Tree` node.\n",
"        On each non-root node, `height` holds half the distance at which that\n",
"        node was merged into its parent.\n",
"    \"\"\"\n",
"    matrices = []\n",
"\n",
"    nodes = {column: Tree(name=column) for column in list(dm)}\n",
"\n",
"    while len(list(dm)) > 2:\n",
"\n",
"        # Save the current matrix\n",
"        matrices.append(dm)\n",
"\n",
"        # Find the minimum distance: first the row holding it, then its column\n",
"        # inside that row, so that both labels always designate the same cell\n",
"        # even when the minimum value appears several times in the matrix.\n",
"        closest_row = dm.min(axis=1).idxmin()\n",
"        min_coords = (closest_row, dm.loc[closest_row].idxmin())\n",
"\n",
"        # Label of the new cluster: concatenation of the two merged labels\n",
"        merged = ''.join(min_coords)\n",
"\n",
"        # Drop the row and column for the minimum distance species\n",
"        new_dm = dm.drop(list(min_coords), axis=1).drop(list(min_coords), axis=0)\n",
"\n",
"        # Insert a new column : new cluster\n",
"        new_dm.insert(len(dm.columns) - 2, merged, float(\"+inf\"))\n",
"\n",
"        # Compute new distance from all species to new cluster\n",
"        new_record = [compute_distance_upgma(dm, min_coords, column, merged) for column in new_dm.columns]\n",
"\n",
"        # Add the newly computed distances as the last row\n",
"        # (pd.concat, since DataFrame.append no longer exists in pandas 2)\n",
"        added = pd.DataFrame([new_record], columns=new_dm.columns)\n",
"        new_dm = pd.concat([new_dm, added])\n",
"        new_dm.index = new_dm.columns\n",
"\n",
"        # Setup height of the branch\n",
"        height = min(dm.loc[min_coords[0], min_coords[1]], dm.loc[min_coords[1], min_coords[0]]) / 2\n",
"\n",
"        # Create a new node in the tree\n",
"        nodes[merged] = Tree(name=merged)\n",
"\n",
"        # Set parent / children relations for the newly created node and its childrens\n",
"        for node in min_coords:\n",
"            nodes[node].set_parent(nodes[merged])\n",
"            nodes[node].height = height\n",
"\n",
"        # Set the distances to +inf for all the symetric part of the matrix\n",
"        for col in new_dm.columns:\n",
"            new_dm.loc[new_dm[col] == 0, col] = float(\"+inf\")\n",
"\n",
"        # Update dataframe\n",
"        dm = new_dm\n",
"\n",
"    # Final case : save the dataframe\n",
"    matrices.append(dm)\n",
"\n",
"    # Compute the height of the root node\n",
"    first, second = list(dm)[0], list(dm)[1]\n",
"    height = min(dm.loc[first, second], dm.loc[second, first]) / 2\n",
"\n",
"    # Create the root node, named after the two remaining clusters\n",
"    root = Tree(name=\"\".join(list(dm.columns)))\n",
"\n",
"    # Set parent / children relations for the newly created node and its childrens\n",
"    for node in dm.columns:\n",
"        nodes[node].set_parent(root)\n",
"        nodes[node].height = height\n",
"\n",
"    return matrices, root"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tree(rooted=False, weight=1.0)\n",
" Clade()\n",
" Clade(branch_length=17.0, name='E')\n",
" Clade(branch_length=2.5)\n",
" Clade(branch_length=6.25)\n",
" Clade(branch_length=2.0)\n",
" Clade(branch_length=5.75)\n",
" Clade(branch_length=0.5, name='F')\n",
" Clade(branch_length=0.5, name='B')\n",
" Clade(branch_length=6.25, name='G')\n",
" Clade(branch_length=4.25)\n",
" Clade(branch_length=4.0, name='D')\n",
" Clade(branch_length=4.0, name='A')\n",
" Clade(branch_length=14.5, name='C')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFwpJREFUeJzt3X+U3XV95/HnmwFBJWgwlCICk4RK\nj4AMMKYKmDNZOaywNOGHdeNxd2kVoig9lbM9xV0Ee3Q9hfZsW+2pJREt2oYURAHXgyhuM7XaY3CQ\nGwKJGBOGhfIjEVZC1lIR3vvH/Q4dhpnJnTDf+d6Zz/NxTs7cud/PvZ93vvfe13zu537v5xuZiSRp\n7tun6QIkSTPDwJekQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVYt+mCxhtwYIF2dvb\n23QZkjRr3HXXXT/NzEM6adtVgd/b28vQ0FDTZUjSrBERD3ba1ikdSSqEgS9JhTDwJakQBr4kFcLA\nl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBXCwJekQhj4klQIA1+SCmHgS1IhDHxJ\nKkRXnfFKkl6u3t5e5s2bR09PDwBLly7lM5/5TMNVdYfaAj8ivgCcDezIzOPq6keSxlq/fj0LFixo\nuoyuU+eUznXAO2u8f0nSFNQ2ws/M70REb133DzAwMFDn3UuaBQYHB19y3bJly16Y0rngggu49NJL\nZ7iq7tT4HH5ErAJWARx55JEd325gYIBWq0VfX19dpUmapZzSGV/jgZ+Za4A1AP39/TmV2/b19Y37\n112S9FIelilJhWh8hC9J0230HP6b3/xmvvSlLzVcUXeo87DMdcAAsCAiHgY+npmfr6s/SQIYHh5u\nuoSuVedROu+p674lSVPnHL4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBXC\nwJekQhj4klQIA1+SCmHgS1IhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEJ4EnNJc0pvby/z5s174STm\nn/3sZznllFMarqo71HkS8yOALwG/CjwPrMnMT9fVnySNWL9+PQsWLGi6jK5T5wj/l8B/zcwfRsQ8\n4K6IuCMzN9fYpyRpArUFfmY+CjxaXX46IrYAhwMGvqRaLVu2jJ6eHvbff382bNjQdDldY0bm8COi\nFzgRcM9LmpKBgYFJtw8ODr7kOqd0xlf7UToRcSDwFeAjmblrnO2rImIoIoZ27txZdzmSZpGBgQFa\nrVbTZcwZtY7wI2I/2mG/NjO/Ol6bzFwDrAHo7+/POuuRNPv09fWNO4rX1NU2wo+IAD4PbMnMP62r\nH0lSZ+oc4Z8K/GdgU0SMvCf775l5W419Sirc8PBw0yV0rTqP0vkuEHXdvyRpalxaQZIKYeBLUiEM\nfEkqhIEvSYUw8CWpEAa+JBXCwJekQhj4klQIz3glTYM9reiovdNqtejr62u6jDnDEb70Mrmio2YL\nR/jSNHBFx3r4zml6OcKXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFcLAlzSn9Pb2\ncvzxx9PX18fxxx/Prbfe2nRJXaO2b9pGxAHAd4D9q35uysyP19WfJI1Yv349CxYs4P777+eMM85g\nxYoVTZfUFepcWuFfgX+XmbsjYj/guxHxjcz8fo19StILdu3axfz585suo2vUFviZmcDu6tf9qn9Z\nV3+S5qZWqzXpmjrjrWG0bNkyMpPt27dz44031lfcLFPrHH5E9EREC9gB3JGZG8ZpsyoihiJiaOfO\nnXWWI2mWGRwc3KvlkdevX8+9997Lpk2buOSSS9i9e/eeb1SAWlfLzMzngL6IeC1wc0Qcl5n3jmmz\nBlgD0N/f7zsASS/yclYhXbx4MYceeiibN29m
yZIl01fULDUjR+lk5s+AQeCdM9GfJAHs2LGDBx54\ngKOOOqrpUrpCnUfpHAI8m5k/i4hXAqcDV9fVnySNWLZsGT09PTz77LNcddVVHHrooU2X1BXqnNI5\nDPhiRPTQfidxY2Z+vcb+JInh4eGmS+hadR6lcw9wYl33L0maGr9pK0mF8Jy2mlOaOAdqq9Xaq0MH\npZnmCF9zxsDAAK1Wq+kypK7lCF9zSl9f38s6bntvNPGuQtobjvAlqRAGviQVwsCXpEIY+JJUCANf\nkgph4EtSIQx8SSqEgS9JhTDwJakQBr4kFaLjpRUi4leAA0Z+z8z/U0tFkqRa7HGEHxHLI2Ir8ADw\nD8Aw8I2a65IkTbNOpnQ+CbwV+HFmLgTeAXyv1qokSdOuk8B/NjOfAPaJiH0ycz3g4t+SNMt0Evg/\ni4gDge8AayPi08Av6y1LkvbO7t27ufjii1m8eDEnnngiJ598Mp/73OeaLqsrdBL4K4B/AS4Fbge2\nAb/ZaQcR0RMRd0eEJzCXVLsLL7yQ+fPns3XrVu6++25uv/12nnzyyabL6gqdHKVzVGZuri5/ESAi\nBoDBDvv4PWALcNBUi5Okqdi2bRt33nkn119/Pfvs0x7PHnLIIVx22WUNV9YdOhnh3xgRl0XbKyPi\nL4A/6uTOI+INwH8Arn05RUpSJ+677z5OOOGEF8JeL9bJCP83gKuBfwLmAWuBUzu8/z8H/qC6nTRn\ntVotT3XYkMlOafmpT32KL3/5y+zYsYNHHnlk5orqUh0dpUN7Dv+VtL949UBmPr+nG0XE2cCOzLxr\nD+1WRcRQRAzt3Lmzk5qlrjI4OEhfnweudYM3velNbNy4keefb0fU5ZdfTqvVYteuXQ1X1h06GeH/\nALgVeAvwOmB1RLwrM9+1h9udCiyPiLNo/6E4KCL+NjP/0+hGmbkGWAPQ39+fU/0PSN1gpk+crvEd\nffTR9Pf387GPfYxPfvKT9PT08Mwzz5BptEBnI/z3Z+aVmflsZj6WmSto/wGYVGb+t8x8Q2b2AiuB\nvx8b9pI03a699lqeeOIJjj76aE4++WROP/10rr766qbL6gp7HOFn5hC8ZC2df6izKEnaWwcddBCr\nV69uuoyu1MlaOr/5ctfSyczBzDx7ryqUJE2LTqZ0/geupSNJs55r6UhSITo5SmfsWjo7aB+qKUma\nRToJ/I3Az2mvpfNe4DXAgXUWJUmafp0E/rLqi1bP829r6dxTa1WSpGk3YeBHxMXAh4DFYwJ+Hn5o\nOym/Yt+MVqvlN16lSUw2wr+e9uGXfwR8dNT1T2ema41OYGBgwOCR1JUmDPzMfAp4CnjPzJUzN/T1\n9flV+wb4zkqanGuISlIhDHxJKoSBL0mFMPAlqRAGviQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqE\ngS9JhehkeWRJmjV6e3uZN28eAM899xznnXceV1xxBfvvv3/DlTWv1hF+RAxHxKaIaEXEUJ19SdKI\n9evXs2nTJu688062b9/OqlWrmi6pK8zECH9ZZv50BvqRpBc58MADueaaazjiiCN48sknOfjgg5su\nqVHO4Uua0w466CAWLlzI1q1bmy6lcXWP8BP4VkQksDoz19Tcn1S0Es8J0Mm5JzKz/kJmgbpH+Kdm\n5knAmcCHI2Lp2AYRsSoihiJiaOfOnTWXI81dI2db04s9/fTTDA8P88Y3vrHpUhpX6wg/Mx+pfu6I\niJuBJcB3xrRZA6wB6O/v98+w9DJ4trUX2717Nx/60Ic455xzmD9/ftPlNK62EX5EvDoi5o1cBs4A\n7q2rP0kasWzZMo477jiWLFnCkUceyerVq5suqSvUOcI/FLg5Ikb6uT4zb6+xP0lieHi46RK6Vm2B\nn5nbgRPqun9J0tR4WKYkFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEJ7xSnNKq9Uq\ncsVIaP/f
+/r6mi5DXcwRvuaMwcFBA0+ahCN8zSklrxRZ6jsbdc4RviQVwsCXpELM6imdbvyAzg/O\nJHWrWTvC9wM6SZqaWT3C78YP6LrtHYckjZi1I3xJ0tQY+JJUCANfkgph4Euac+6++24igm9+85tN\nl9JVag38iHhtRNwUET+KiC0R8bY6+5MkgHXr1nHaaaexbt26pkvpKnUfpfNp4PbMfFdEvAJ4Vc39\nSSpcZnLTTTdxxx138Pa3v51nnnmGAw44oOmyukJtgR8RBwFLgd8GyMxfAL+oqz9J3fllxLqNPTz7\ne9/7HgsXLmTx4sUMDAxw2223cd555zVTXJepc0pnEbAT+OuIuDsiro2IV49tFBGrImIoIoZ27txZ\nYznS3OaXEdvWrVvHypUrAVi5cqXTOqNEZtZzxxH9wPeBUzNzQ0R8GtiVmVdMdJv+/v4cGhqqpZ6Z\nMjK66sYvhUlz3XPPPcfhhx/OfvvtR09PD5nJE088waOPPsq8efOaLq8WEXFXZvZ30rbOEf7DwMOZ\nuaH6/SbgpBr7k1S4b3/725xwwgk89NBDDA8P8+CDD3L++edzyy23NF1aV6gt8DPzMeChiDimuuod\nwOa6+pOkdevWce65577ouvPPP5/rr7++oYq6S91H6fwusLY6Qmc78Ds19yepYNddd91Lrlu+fDnL\nly+f+WK6UK2Bn5ktoKO5JUlSvfymrSQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDwJakQ\nBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBXCwJekQhj4kuaU3bt384EP\nfIDFixdz7LHHsnTpUjZs2NB0WV2htnPaRsQxwA2jrloEXJmZf15Xn5J04YUXsnDhQrZu3co+++zD\n9u3b2bJlS9NldYXaAj8z7wf6ACKiB/hn4Oa6+pOkbdu2sWHDBtauXcs++7QnMBYtWsSiRYsarqw7\nzNSUzjuAbZn54Az1J6lA9913H319ffT09DRdSleqbYQ/xkpg3Qz11bhWq8XAwEDTZUhFGBwcbLqE\nWaP2EX5EvAJYDnx5gu2rImIoIoZ27txZdzm1GxwcpK+vr+kypCIde+yxbNy4keeff77pUrpSZGa9\nHUSsAD6cmWfsqW1/f38ODQ3VWo+kue3d7343xxxzDJ/4xCeICLZu3crmzZtZsWJF06XVIiLuysz+\nTtrOxBz+eyhoOkdSs6699loee+wxjj76aI4//nguuugiXv/61zddVleodYQfEa8CHgIWZeZTe2rv\nCF+SpmYqI/xaP7TNzJ8Dr6uzD0lSZ/ymrSQVwsCXpEIY+JJUCANfkgph4EtSIQx8SSqEgS9JhTDw\nJakQBr4kFcLAl6RCGPiSVAgDX5IKYeBLUiEMfEkqhIEvSYUw8CWpEAa+JBXCwJekQtR6Ttupioid\nwINTuMkC4Kc1lfNyWNfUWNfUWNfUdGNd01nTUZl5SCcNuyrwpyoihjo9ee9Msq6psa6psa6p6ca6\nmqrJKR1JKoSBL0mFmO2Bv6bpAiZgXVNjXVNjXVPTjXU1UtOsnsOXJHVuto/wJUkdmhWBHxHvjIj7\nI+InEfHRcbbvHxE3VNs3RETvDNR0RESsj4gtEXFfRPzeOG0GIuKpiGhV/66su66q3+GI2FT1OTTO\n9oiIz1T7656IOGkGajpm1H5oRcSuiPjImDYzsr8i4gsRsSMi7h113cERcUdEbK1+zp/gthdUbbZG\nxAUzUNefRMSPqsfp5oh47QS3nfQxr6GuP4yIfx71WJ01wW0nfe3WUNcNo2oajojWBLetZX9NlAvd\n8PwCIDO7+h/QA2wDFgGvADYCbxrT5kPANdXllcANM1DXYcBJ1eV5wI/HqWsA+HoD+2wYWDDJ9rOA\nbwABvBXY0MBj+hjt44dnfH8BS4GTgHtHXffHwEeryx8Frh7ndgcD26uf86
vL82uu6wxg3+ry1ePV\n1cljXkNdfwj8fgeP86Sv3emua8z2/wlcOZP7a6Jc6IbnV2bOihH+EuAnmbk9M38B/B2wYkybFcAX\nq8s3Ae+IiKizqMx8NDN/WF1+GtgCHF5nn9NoBfClbPs+8NqIOGwG+38HsC0zp/Ilu2mTmd8Bnhxz\n9ejn0BeBc8a56b8H7sjMJzPz/wJ3AO+ss67M/FZm/rL69fvAG6arv5dTV4c6ee3WUlf1+n83sG66\n+uuwpolyofHnF8yOKZ3DgYdG/f4wLw3WF9pUL46ngNfNSHVANYV0IrBhnM1vi4iNEfGNiDh2hkpK\n4FsRcVdErBpneyf7tE4rmfiF2MT+Ajg0Mx+F9osW+JVx2jS9395H+53ZePb0mNfhkmqq6QsTTFE0\nub/eDjyemVsn2F77/hqTC13x/JoNgT/eSH3soUWdtKlFRBwIfAX4SGbuGrP5h7SnLU4A/gK4ZSZq\nAk7NzJOAM4EPR8TSMdub3F+vAJYDXx5nc1P7q1NN7rfLgV8CaydosqfHfLr9FbAY6AMepT19MlZj\n+wt4D5OP7mvdX3vIhQlvNs5107q/ZkPgPwwcMer3NwCPTNQmIvYFXsPevQWdkojYj/aDujYzvzp2\ne2buyszd1eXbgP0iYkHddWXmI9XPHcDNtN9aj9bJPq3LmcAPM/PxsRua2l+Vx0emtaqfO8Zp08h+\nqz68Oxt4b1aTvWN18JhPq8x8PDOfy8zngc9N0F9T+2tf4Dzghona1Lm/JsiFrnh+zYbA/wHwaxGx\nsBodrgS+NqbN14CRT7TfBfz9RC+M6VLNEX4e2JKZfzpBm18d+SwhIpbQ3t9P1FzXqyNi3shl2h/6\n3Tum2deA/xJtbwWeGnm7OQMmHHk1sb9GGf0cugC4dZw23wTOiIj51RTGGdV1tYmIdwKXAcsz8+cT\ntOnkMZ/uukZ/5nPuBP118tqtw+nAjzLz4fE21rm/JsmF7nh+Tfen1HX8o31UyY9pf+J/eXXdJ2i/\nCAAOoD1F8BPgTmDRDNR0Gu23W/cArerfWcAHgQ9WbS4B7qN9dML3gVNmoK5FVX8bq75H9tfougL4\ny2p/bgL6Z+hxfBXtAH/NqOtmfH/R/oPzKPAs7VHV+2l/5vO/ga3Vz4Ortv3AtaNu+77qefYT4Hdm\noK6f0J7XHXmOjRyN9nrgtske85rr+pvquXMP7TA7bGxd1e8vee3WWVd1/XUjz6lRbWdkf02SC40/\nvzLTb9pKUilmw5SOJGkaGPiSVAgDX5IKYeBLUiEMfEkqhIGvrhURvaNXQpyhPgcjYtJzjdZVV7RX\nCz1l1O/XRcS7prsflcvA16wXET1N1zBNBoBT9tRI2lsGvrrdvhHxxWqRrpsi4lXwwnrmV0bEd4Hf\nioiLIuIH1cJrXxnV7rpor/3/TxGxffSIOSL+oFoTfWNEXDWqz9+KiDsj4scR8fbJiouInmivWf+D\nqsYPVNcPVO8Wbor2evZrR32L+Kzquu9WtX29Wmjrg8Cl0V6jfaTfpePVLu0NA1/d7hhgTWa+GdhF\n+9wHI57JzNMy8++Ar2bmW7K98NoW2t8GHXEY7W9Ang1cBRARZ9JeovY3qtv88aj2+2bmEuAjwMf3\nUN/7aS9N8RbgLcBFEbGw2nZidR9vov3tzlMj4gBgNXBmZp4GHAKQmcPANcCfZWZfZv7jRLVLe8vA\nV7d7KDO/V13+W9rhN2L04ljHRcQ/RsQm4L3A6KWVb8nM5zNzM3Bodd3pwF9ntT5NZo5ebG9kwau7\ngN491HcG7XWJWrSXwX0d8GvVtjsz8+FsLzDWqu7r14HtmflA1WZP67WPV7u0V/ZtugBpD8au/TH6\n9/836vJ1wDmZuTEifpv2fPiIfx11OUb9nGhdkZH2z7Hn10gAv5uZL1rkKiIGxvQ7cl9TPTHPeLVL\ne8URvrrdkRHxturye4DvTtBuHvBotT
Ttezu4328B7xs113/wXtb3TeDiql8i4o3VCowT+RGwKP7t\nvMv/cdS2p2n/P6RaGPjqdluACyLiHtrn+vyrCdpdQXtK5Q7aoTqpzLyd9iqPQ9V0zO/vZX3XApuB\nH1aHaq5mkncFmfkvtD+HuL36wPlx2mdoA/hfwLljPrSVpo2rZUozLCIOzMzd1VE7fwlszcw/a7ou\nzX2O8KWZd1H1ruI+2mdnW91wPSqEI3xJKoQjfEkqhIEvSYUw8CWpEAa+JBXCwJekQhj4klSI/w8X\n7F24uLuSOAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f529608cb38>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run UPGMA on the dataframe: returns the matrix of every step and the root node\n",
"matrices, root = upgma(dm)\n",
"\n",
"# Turn the heights stored on the nodes into branch lengths (the tree is modified in place)\n",
"root = normalize_tree(root)\n",
"\n",
"# Convert the tree to a Newick string\n",
"tree = tree_to_newick(root)\n",
"\n",
"# Parse the Newick string into a Bio.Phylo tree (`tree` is rebound from str to that tree)\n",
"tree = Phylo.read(StringIO(tree), \"newick\")\n",
"\n",
"print(tree)\n",
"\n",
"# Display the tree\n",
"Phylo.draw(tree)\n",
"\n",
"# Save the tree to upgma.nw file; the value returned by Phylo.write is shown as the cell output\n",
"Phylo.write(tree, 'upgma.nw', 'newick')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Display UPGMA results"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" <th>G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>19.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>27.000000</td>\n",
" <td>31.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>8.000000</td>\n",
" <td>18.000000</td>\n",
" <td>26.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>33.000000</td>\n",
" <td>36.000000</td>\n",
" <td>41.000000</td>\n",
" <td>31.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>18.000000</td>\n",
" <td>1.000000</td>\n",
" <td>32.000000</td>\n",
" <td>17.000000</td>\n",
" <td>35.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>G</th>\n",
" <td>13.000000</td>\n",
" <td>13.000000</td>\n",
" <td>29.000000</td>\n",
" <td>14.000000</td>\n",
" <td>28.000000</td>\n",
" <td>12.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D E F G\n",
"A inf inf inf inf inf inf inf\n",
"B 19.000000 inf inf inf inf inf inf\n",
"C 27.000000 31.000000 inf inf inf inf inf\n",
"D 8.000000 18.000000 26.000000 inf inf inf inf\n",
"E 33.000000 36.000000 41.000000 31.000000 inf inf inf\n",
"F 18.000000 1.000000 32.000000 17.000000 35.000000 inf inf\n",
"G 13.000000 13.000000 29.000000 14.000000 28.000000 12.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>G</th>\n",
" <th>FB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>27.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>8.000000</td>\n",
" <td>26.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>33.000000</td>\n",
" <td>41.000000</td>\n",
" <td>31.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>G</th>\n",
" <td>13.000000</td>\n",
" <td>29.000000</td>\n",
" <td>14.000000</td>\n",
" <td>28.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FB</th>\n",
" <td>18.500000</td>\n",
" <td>31.500000</td>\n",
" <td>17.500000</td>\n",
" <td>35.500000</td>\n",
" <td>12.500000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A C D E G FB\n",
"A inf inf inf inf inf inf\n",
"C 27.000000 inf inf inf inf inf\n",
"D 8.000000 26.000000 inf inf inf inf\n",
"E 33.000000 41.000000 31.000000 inf inf inf\n",
"G 13.000000 29.000000 14.000000 28.000000 inf inf\n",
"FB 18.500000 31.500000 17.500000 35.500000 12.500000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>C</th>\n",
" <th>E</th>\n",
" <th>G</th>\n",
" <th>FB</th>\n",
" <th>DA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>41.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>G</th>\n",
" <td>29.000000</td>\n",
" <td>28.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FB</th>\n",
" <td>31.500000</td>\n",
" <td>35.500000</td>\n",
" <td>12.500000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DA</th>\n",
" <td>26.500000</td>\n",
" <td>32.000000</td>\n",
" <td>13.500000</td>\n",
" <td>18.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" C E G FB DA\n",
"C inf inf inf inf inf\n",
"E 41.000000 inf inf inf inf\n",
"G 29.000000 28.000000 inf inf inf\n",
"FB 31.500000 35.500000 12.500000 inf inf\n",
"DA 26.500000 32.000000 13.500000 18.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>C</th>\n",
" <th>E</th>\n",
" <th>DA</th>\n",
" <th>FBG</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>41.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DA</th>\n",
" <td>26.500000</td>\n",
" <td>32.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FBG</th>\n",
" <td>30.666667</td>\n",
" <td>33.000000</td>\n",
" <td>16.500000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" C E DA FBG\n",
"C inf inf inf inf\n",
"E 41.000000 inf inf inf\n",
"DA 26.500000 32.000000 inf inf\n",
"FBG 30.666667 33.000000 16.500000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>C</th>\n",
" <th>E</th>\n",
" <th>FBGDA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>41.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FBGDA</th>\n",
" <td>29.000000</td>\n",
" <td>32.600000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" C E FBGDA\n",
"C inf inf inf\n",
"E 41.000000 inf inf\n",
"FBGDA 29.000000 32.600000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>E</th>\n",
" <th>FBGDAC</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FBGDAC</th>\n",
" <td>34.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" E FBGDAC\n",
"E inf inf\n",
"FBGDAC 34.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for matrice in matrices:\n",
" display(matrice)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NEIGHBOUR JOINING"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>4.000000</td>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>7.000000</td>\n",
" <td>10.000000</td>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>6.000000</td>\n",
" <td>9.000000</td>\n",
" <td>6.000000</td>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>8.000000</td>\n",
" <td>11.000000</td>\n",
" <td>8.000000</td>\n",
" <td>9.000000</td>\n",
" <td>8.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D E F\n",
"A inf inf inf inf inf inf\n",
"B 5.000000 inf inf inf inf inf\n",
"C 4.000000 7.000000 inf inf inf inf\n",
"D 7.000000 10.000000 7.000000 inf inf inf\n",
"E 6.000000 9.000000 6.000000 5.000000 inf inf\n",
"F 8.000000 11.000000 8.000000 9.000000 8.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"columns = [\"A\",\"B\",\"C\",\"D\",\"E\",\"F\",\"G\"]\n",
"\n",
"distance_matrix = np.array([[0,0,0,0,0,0,0],\n",
" [19,0,0,0,0,0,0],\n",
" [27,31,0,0,0,0,0],\n",
" [8,18,26,0,0,0,0],\n",
" [33,36,41,31,0,0,0],\n",
" [18,1,32,17,35,0,0],\n",
" [13,13,29,14,28,12,0]])\n",
"\n",
"columns = [\"A\",\"B\",\"C\",\"D\",\"E\", \"F\"]\n",
"distance_matrix = np.array([[0,0,0,0,0,0],\n",
" [5,0,0,0,0, 0],\n",
" [4,7,0,0,0, 0],\n",
" [7,10,7,0,0, 0],\n",
" [6,9,6,5,0, 0],\n",
" [8,11,8,9,8, 0]])\n",
"\n",
"dm = pd.DataFrame(distance_matrix, columns=columns, index=columns)\n",
"\n",
"\n",
"for col in dm.columns:\n",
" dm.loc[dm[col] == 0, col] = float(\"+inf\")\n",
"\n",
"display(dm)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def compute_distance_nj(dm, cluster_1, cluster_2, new):\n",
" \n",
" # cluster_1 : tuple\n",
" # cluster_2 : single\n",
" \n",
" if list(\"\".join(list(cluster_1))) == list(cluster_2):\n",
" return float(\"+inf\")\n",
" \n",
" distance_ik = min(dm.loc[cluster_1[0], cluster_2], dm.loc[cluster_2, cluster_1[0]])\n",
" distance_jk = min(dm.loc[cluster_1[1], cluster_2], dm.loc[cluster_2, cluster_1[1]])\n",
" distance_ij = min(dm.loc[cluster_1[0], cluster_1[1]], dm.loc[cluster_1[1], cluster_1[0]])\n",
" \n",
" return (distance_ik + distance_jk - distance_ij) / 2"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def nj(dm):\n",
" \n",
" uis = []\n",
" matrices = []\n",
" nodes = {column: Tree(name=column) for column in list(dm)}\n",
" root = None\n",
" \n",
" while len(list(dm)) > 2:\n",
" \n",
" # Init new ui dictionary\n",
" ui = dict()\n",
" \n",
" # Compute ui for all species\n",
" for column_a in dm.columns:\n",
" ui[column_a] = sum([min(dm.loc[column_a, column_b], dm.loc[column_b, column_a]) for column_b in dm.columns if column_a != column_b]) / (len(dm.columns) - 2)\n",
" \n",
" # Save dataframe and ui for the current iteration\n",
" matrices.append(dm)\n",
" uis.append(ui)\n",
" \n",
" min_distance = float(\"inf\") \n",
" min_ij = (None, None)\n",
" \n",
" # Find the (i, j) with minimum D(i, j) - ui - uj\n",
" for (i, j) in itertools.combinations(dm.columns, 2):\n",
" distance = min(dm.loc[i, j], dm.loc[j, i]) - ui[i] - ui[j]\n",
" if distance < min_distance:\n",
" min_distance = distance\n",
" min_ij = (i, j)\n",
"\n",
" # Drop the two closest species from the datafrane\n",
" new_dm = dm.drop([min_ij[0], min_ij[1]], axis=1).drop([min_ij[1], min_ij[0]], axis=0)\n",
" new_dm.insert(len(dm.columns) - 2, ''.join(list(min_ij)), float(\"+inf\"))\n",
" \n",
" # Compute new distances from all species to the newly created cluster\n",
" new_record = [compute_distance_nj(dm, min_ij, column, ''.join(list(min_ij))) for column in new_dm.columns]\n",
"\n",
" added = pd.DataFrame([new_record], columns=new_dm.columns)\n",
" new_dm = new_dm.append(added)\n",
" new_dm.index = new_dm.columns\n",
" \n",
" # Compute the iij and jij height for the childrens nodes\n",
" height_iij = 0.5 * (min(dm.loc[min_ij[0], min_ij[1]], dm.loc[min_ij[1], min_ij[0]]) + ui[min_ij[0]] - ui[min_ij[1]])\n",
" height_jij = 0.5 * (min(dm.loc[min_ij[0], min_ij[1]], dm.loc[min_ij[1], min_ij[0]]) + ui[min_ij[1]] - ui[min_ij[0]]) \n",
" \n",
" # Create new node in the tree joining the closest species\n",
" nodes[''.join(list(min_ij))] = Tree(name=''.join(list(min_ij)))\n",
" root = nodes[''.join(list(min_ij))]\n",
" \n",
" # Set the parent / children relationships\n",
" for node in min_ij:\n",
" nodes[node].set_parent(nodes[''.join(list(min_ij))])\n",
" \n",
" # Set the heigh of both childrens accordingly\n",
" root.childrens[0].height = height_iij\n",
" root.childrens[1].height = height_jij\n",
" \n",
" for col in new_dm.columns:\n",
" new_dm.loc[new_dm[col] == 0, col] = float(\"+inf\")\n",
" \n",
" # Update dataframe\n",
" dm = new_dm\n",
"\n",
" # Final case\n",
" \n",
" ui = dict()\n",
" \n",
" # Compute uis\n",
" for column_a in dm.columns:\n",
" ui[column_a] = sum([min(dm.loc[column_a, column_b], dm.loc[column_b, column_a]) for column_b in dm.columns if column_a != column_b]) / (len(dm.columns))\n",
" \n",
" # Save dataframe\n",
" matrices.append(dm)\n",
"\n",
" # Get minimum distance : Trivial case since we only have 2 clusters\n",
" min_ij = tuple(list(dm))\n",
" \n",
" # Compute the height of the childrens to the newly created cluster\n",
" height_iij = 0.5 * (min(dm.loc[min_ij[0], min_ij[1]], dm.loc[min_ij[1], min_ij[0]]) + ui[min_ij[0]] - ui[min_ij[1]])\n",
" height_jij = 0.5 * (min(dm.loc[min_ij[0], min_ij[1]], dm.loc[min_ij[1], min_ij[0]]) + ui[min_ij[1]] - ui[min_ij[0]]) \n",
" \n",
" # Build the root node of the tree\n",
" root = Tree(name=\"\".join(list(dm.columns)))\n",
" \n",
" # Set the parent / children relationships\n",
" for node in dm.columns:\n",
" nodes[node].set_parent(root)\n",
" \n",
" # Set the height of both childrens\n",
" root.childrens[0].height = height_iij\n",
" root.childrens[1].height = height_jij\n",
" \n",
" return matrices, uis, root"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Display Neighbour joining results"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B</th>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>4.000000</td>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>7.000000</td>\n",
" <td>10.000000</td>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>6.000000</td>\n",
" <td>9.000000</td>\n",
" <td>6.000000</td>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>8.000000</td>\n",
" <td>11.000000</td>\n",
" <td>8.000000</td>\n",
" <td>9.000000</td>\n",
" <td>8.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D E F\n",
"A inf inf inf inf inf inf\n",
"B 5.000000 inf inf inf inf inf\n",
"C 4.000000 7.000000 inf inf inf inf\n",
"D 7.000000 10.000000 7.000000 inf inf inf\n",
"E 6.000000 9.000000 6.000000 5.000000 inf inf\n",
"F 8.000000 11.000000 8.000000 9.000000 8.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" <th>AB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>6.000000</td>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>8.000000</td>\n",
" <td>9.000000</td>\n",
" <td>8.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AB</th>\n",
" <td>3.000000</td>\n",
" <td>6.000000</td>\n",
" <td>5.000000</td>\n",
" <td>7.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" C D E F AB\n",
"C inf inf inf inf inf\n",
"D 7.000000 inf inf inf inf\n",
"E 6.000000 5.000000 inf inf inf\n",
"F 8.000000 9.000000 8.000000 inf inf\n",
"AB 3.000000 6.000000 5.000000 7.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>D</th>\n",
" <th>E</th>\n",
" <th>F</th>\n",
" <th>CAB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>D</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>E</th>\n",
" <td>5.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>9.000000</td>\n",
" <td>8.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CAB</th>\n",
" <td>5.000000</td>\n",
" <td>4.000000</td>\n",
" <td>6.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" D E F CAB\n",
"D inf inf inf inf\n",
"E 5.000000 inf inf inf\n",
"F 9.000000 8.000000 inf inf\n",
"CAB 5.000000 4.000000 6.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>F</th>\n",
" <th>CAB</th>\n",
" <th>DE</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>F</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CAB</th>\n",
" <td>6.000000</td>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DE</th>\n",
" <td>6.000000</td>\n",
" <td>2.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" F CAB DE\n",
"F inf inf inf\n",
"CAB 6.000000 inf inf\n",
"DE 6.000000 2.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DE</th>\n",
" <th>FCAB</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>DE</th>\n",
" <td>inf</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FCAB</th>\n",
" <td>1.000000</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DE FCAB\n",
"DE inf inf\n",
"FCAB 1.000000 inf"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"matrices, uis, root = nj(dm)\n",
"\n",
"for matrice in matrices:\n",
" display(matrice)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tree(rooted=False, weight=1.0)\n",
" Clade()\n",
" Clade(branch_length=0.5)\n",
" Clade(branch_length=3.0, name='D')\n",
" Clade(branch_length=2.0, name='E')\n",
" Clade(branch_length=0.5)\n",
" Clade(branch_length=5.0, name='F')\n",
" Clade(branch_length=1.0)\n",
" Clade(branch_length=2.0, name='C')\n",
" Clade(branch_length=1.0)\n",
" Clade(branch_length=1.0, name='A')\n",
" Clade(branch_length=4.0, name='B')\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEKCAYAAAARnO4WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFJRJREFUeJzt3X+QXeV93/H3VyvMTykmFVH54bKS\nHLsTLHPxrEkKWLMbXMZOiMSPZCLGcZvERjTUjO224zqZoUzdzoRpOyGmE8eRhQ2MhVwq2zjxEDCe\naGNDGsEKLggkYwWxGBmI1mZsUGLHIH37xx7J1+sVurvas+dePe/XjEZ37x6d5wOj/ei5zz33OZGZ\nSJKOfQuaDiBJmh8WviQVwsKXpEJY+JJUCAtfkgph4UtSISx8SSqEhS9JhbDwJakQC5sO0GnJkiU5\nODjYdAxJ6hvbtm37Tmae1s2xPVX4g4ODjI2NNR1DkvpGRDzT7bEu6UhSISx8SSqEhS9JhbDwJakQ\nFr4kFcLCl6RCWPiSVAgLX5IKYeFLUiEsfEkqhIUvSYWw8CWpEBa+JBXCwpekQlj4klQIC1+SCmHh\nS1IhLHxJKoSFL0mFsPAlqRA9dRNzab4MDg6yaNEiAPbv388VV1zB9ddfz/HHH99wMqk+tc3wI+LT\nEbE3Ih6vawzpaGzZsoXt27fz4IMPsnv3btatW9d0JKlWdS7p3Aq8q8bzS3PilFNO4ZOf/CR33XUX\nL774YtNxpNrUVviZ+TXAnx71hcWLF7Ns2TJ27drVdBSpNq7hdxgeHm46gmoyOjp6xGMys/4gUoMa\nv0onItZFxFhEjE1MTDSWY3h4mHa73dj4atbLL7/M+Pg4b3rTm5qOItWm8Rl+Zq4H1gMMDQ01OsVq\ntVpdzQR1bNm3bx/XXnstl112GaeeemrTcaTaNF74UlNGRkbITA4cOMDll1/O9ddf33QkqVa1FX5E\nbAKGgSURsQe4ITNvqWs8aSbGx8ebjiDNu9oKPzOvquvckqSZa/xNW0nS/LDwJakQFr4kFcLCl6RC\nWPiSVAgLX5IKYeFLUiEsfEkqhIUvSYWw8CWpEH29edpc7l/fbrdptVpzdj5J6jV9O8N3/3pJmpm+\nnuHP5f713u1K0rGub2f4kqSZsfAlqRAWviQVwsKXpEJY+JJUCAtfkgph4UtSISx8SSqEhS9JhbDw\nJakQfb21gnrb4OAgixYtYmBgAIBVq1Zx8803N5xKKldthR8RbwBuB/45cABYn5kfr2s89aYtW7aw\nZMmSpmNIot4Z/qvAf8zMhyNiEbAtIu7LzB01jilJOoza1vAz8/nMfLh6/DKwEzizrvHUm0ZGRmi1\nWrRaLW666aam40hFm5c1/IgYBM4Dts7HeLPVbrfdJvkoTLdVtUs6Uu+o/SqdiDgF+Dzwocx8aZrv\nr4uIsYgYm5iYqDvOYY2OjnrHK0nHtFpn+BFxHJNlvzEzvzDdMZm5HlgPMDQ0lHXmOZK5upmKJPWi\nOq/SCeAWYGdm/lFd46i3jYyMHLos861vfSu33357w4mkctU5w78QeC+wPSIO3nz2DzLz7hrHVA8Z\nHx9vOoKkDrUVfmbeD0Rd55ckzYxbK0hSISx8SSqEhS9JhbDwJakQFr4kFcLCl6RCWPiSVAgLX5IK\nYeFLUiEsfEkqhIUvSYWw8CWpEBa+JBXCwpekQlj4klQIC1+SCmHhS1IhLHxJKoSFL0mFsPAlqRAW\nviQVwsKXpEJY+JJUiIVNB5DUXwYHB1m0aBEDAwMAfOITn+CCCy5oOJW6UVvhR8QJwNeA46txNmfm\nDXWNJ2n+bNmyhSVLljQdQzNU5wz/n4Bfzsx9EXEccH9E/GVm/m2NY0qSDqO2ws/MBPZVXx5X/cq6\nxpM0f0ZGRhgYGOD4449n69atTcdRl2pdw4+I
AWAb8EbgTzKzp/9mDA8PNx1B6jmjo6M/9ZxLOv2p\n1qt0MnN/ZraAs4DzI+ItU4+JiHURMRYRYxMTE3XGeU3Dw8O02+3Gxpekus3LVTqZ+b2IGAXeBTw+\n5XvrgfUAQ0NDjS75tFqtaWczknQsqG2GHxGnRcTrq8cnAu8EvlHXeJKk11bnDP904LZqHX8BcGdm\nfrnG8STNg/Hx8aYjaJbqvErnMeC8us4vSZoZt1aQpEJY+JJUiK6XdCLi54ATDn6dmd+qJZEkqRZH\nnOFHxOqI2AU8Dfw1MA78Zc25JElzrJslnf8G/BLwzcxcBlwMPFBrKknSnOum8F/JzO8CCyJiQWZu\nAVo155IkzbFu1vC/FxGnMLnV8caI2Au8Wm8sSdJc62aGvwb4AfBh4B7gKeDX6gwlSZp73RT+2dUm\naK9m5m2ZeTOwsu5gkqS51U3h3xkR/zkmnRgR/xv4w7qDSZLmVjeF/4vAG4C/AR4CngMurDOUJGnu\ndXWVDpNr+Ccy+cGrpzPzQK2pJElzrpvCf4jJwn87cBFwVURsrjWVJGnOdXNZ5vsyc6x6/AKwJiLe\nW2MmSVINjlj4B8t+yl46f11nKEnS3OtmL51fcy8dSep/3azh/3fcS0eS+p576UhSIWa7l84r9caS\nJM21bgr/UeAfmdxL5z3AzwCn1BlKkjT3uin8keqDVgeA2wAi4rFaU0mS5txhCz8ifg+4FlgxpeAX\n4Zu2ktR3XmuGfweTl1/+IfDRjudfzswXa03VpXa7zfDw8Jydq9XyvWhJx67DXqWTmd/PzPHMvCoz\nn+n41RNlPzo6akFr1vbt28c111zDihUrOOecc1i1ahVbt25tOpZUq27W8I9KRAwAY8C3M/PSuTz3\n6OjonJ1rrl4pqD+8//3vZ9myZezatYsFCxawe/dudu7c2XQsqVa1Fz7wQWAnsHgexpKO6KmnnmLr\n1q1s3LiRBQsmX+QuX76c5cuXN5xMqlc3H7yatYg4C/hVYEOd40gz8cQTT9BqtRgYGGg6ijSv6p7h\n/zHwESav7NFRctlp9uZy+U/qV7XN8CPiUmBvZm47wnHrImIsIsYmJibqitP3hoeHabfbTcc4Jpxz\nzjk8+uijHDjgfXxUljpn+BcCqyPiV5jcVnlxRHw2M3+r86DMXA+sBxgaGsoa8/S9VqvlTHUOrFix\ngqGhIW644QY+9rGPERHs2rWLHTt2sGbNmqbjSbWpbYafmb+fmWdl5iCwFvirqWUvNWXDhg288MIL\nvPGNb2TlypVcffXVnHHGGU3Hkmo1H1fpSD1n8eLFfOpTn2o6hjSv5qXwM3MUGJ2PsSRJ06v1skxJ\nUu+w8CWpEBa+JBXCwpekQlj4klQIC1+SCmHhS1IhLHxJKoSFL0mFsPAlqRAWviQVwsKXpEJY+JJU\nCAtfkgph4UtSISx8SSqEhS9JhbDwJakQ3tO2Q7vdZnh4uOkY02q327RaraZjSOpjzvAro6OjFqqk\nY5oz/A6jo6NNRzisXn3lIal/OMOXpEJY+JJUCAtfxXrkkUeICO69996mo0jzotbCj4jxiNgeEe2I\nGKtzLGmmNm3axEUXXcSmTZuajiLNi/l403YkM78zD+NIXctMNm/ezH333cc73vEOfvjDH3LCCSc0\nHUuqlVfpaM708pVEU6/AeuCBB1i2bBkrVqxgeHiYu+++myuuuKKZcNI8qXsNP4GvRMS2iFg33QER\nsS4ixiJibGJiouY4qsvw8DDtdrvpGF3btGkTa9euBWDt2rUu66gIkZn1nTzijMx8LiJ+DrgPuC4z\nv3a444eGhnJszKX+6RycPffqZwV6PV+n/fv3c+aZZ3LccccxMDBAZvLd736X559/nkWLFjUdT5qR\niNiWmUPdHFvrDD8zn6t+3wt8ETi/zvGkbnz1q1/l3HPP5dlnn2V8fJxnnnmGK6+8krvuuqvpaFKt\naiv8iDg5
IhYdfAxcAjxe13hStzZt2sTll1/+E89deeWV3HHHHQ0lkuZHnW/aLgW+GBEHx7kjM++p\ncTypK7feeutPPbd69WpWr149/2GkeVRb4WfmbuDcus4vSZoZP2krSYWw8CWpEBa+JBXCwpekQlj4\nklQIC1+SCmHhS1IhLHxJKoSFL0mFsPAlqRDeAKWPtNvtnr3JSLvdptVqNR1D0mtwht8nRkdHLVRJ\nR8UZfh/p5ZuL9OorD0k/5gxfkgph4UtSISx8SSqEhS9JhbDwJakQFr4kFcLCl6RCWPiSVAgLX5IK\nYeFLUiEsfEnHlMHBQVauXEmr1WLlypV86UtfajpSz6h1L52IeD2wAXgLkMDvZub/q3NMSdqyZQtL\nlizhySef5JJLLmHNmjVNR+oJdW+e9nHgnsz89Yh4HXBSzeNJ0iEvvfQSp556atMxekZthR8Ri4FV\nwG8DZOaPgB/VNZ6a18v79evYNd0usiMjI2Qmu3fv5s4775z/UD2qzjX85cAE8JmIeCQiNkTEyVMP\nioh1ETEWEWMTExM1xlGd3K9fvWTLli08/vjjbN++nQ984APs27ev6Ug9oc4lnYXA24DrMnNrRHwc\n+ChwfedBmbkeWA8wNDSUNeZRzXp5v36VacWKFSxdupQdO3Zw/vnnNx2ncXXO8PcAezJza/X1Zib/\nAZCkebF3716efvppzj777Kaj9ITaZviZ+UJEPBsRb87MJ4GLgR11jSdJB42MjDAwMMArr7zCjTfe\nyNKlS5uO1BPqvkrnOmBjdYXObuB3ah5PUuHGx8ebjtCzai38zGwDQ3WOIUnqjp+0laRCWPiSVAgL\nX5IKYeFLUiEsfEkqhIUvSYWw8CWpEBa+JBXCwpekQlj4klQIC1+SCmHhS1IhLHxJKoSFL0mFsPAl\nqRAWviQVwsKXpEJY+JJUCAtfkgph4UtSISx8SSpEZGbTGQ6JiAngmRn8kSXAd2qKMxfMN3u9nA3M\ndzR6ORv0dr7psp2dmad184d7qvBnKiLGMnOo6RyHY77Z6+VsYL6j0cvZoLfzHW02l3QkqRAWviQV\not8Lf33TAY7AfLPXy9nAfEejl7NBb+c7qmx9vYYvSepev8/wJUld6tvCj4h3RcSTEfF3EfHRpvN0\niohPR8TeiHi86SxTRcQbImJLROyMiCci4oNNZ+oUESdExIMR8WiV7782nWmqiBiIiEci4stNZ5kq\nIsYjYntEtCNirOk8U0XE6yNic0R8o/o7+K+azgQQEW+u/p8d/PVSRHyo6VydIuLD1c/E4xGxKSJO\nmPE5+nFJJyIGgG8C/xrYAzwEXJWZOxoNVomIVcA+4PbMfEvTeTpFxOnA6Zn5cEQsArYBl/XQ/7sA\nTs7MfRFxHHA/8MHM/NuGox0SEf8BGAIWZ+alTefpFBHjwFBm9uR15BFxG/D1zNwQEa8DTsrM7zWd\nq1PVL98GfjEzZ/K5oNpExJlM/iz8Qmb+ICLuBO7OzFtncp5+neGfD/xdZu7OzB8BnwPWNJzpkMz8\nGvBi0zmmk5nPZ+bD1eOXgZ3Amc2m+rGctK/68rjqV8/MSiLiLOBXgQ1NZ+k3EbEYWAXcApCZP+q1\nsq9cDDzVK2XfYSFwYkQsBE4CnpvpCfq18M8Enu34eg89VFr9IiIGgfOArc0m+UnVkkkb2Avcl5m9\nlO+PgY8AB5oOchgJfCUitkXEuqbDTLEcmAA+Uy2JbYiIk5sONY21wKamQ3TKzG8D/wv4FvA88P3M\n/MpMz9OvhR/TPNczs8B+EBGnAJ8HPpSZLzWdp1Nm7s/MFnAWcH5E9MSyWERcCuzNzG1NZ3kNF2bm\n24B3A/++Wl7sFQuBtwF/mpnnAf8A9Nr7b68DVgP/t+ksnSLiVCZXMZYBZwAnR8RvzfQ8/Vr4e4A3\ndHx9FrN4eVOqam3888DGzPxC03kOp3q5Pwq8q+EoB10IrK7WyT8H/HJEfL
bZSD8pM5+rft8LfJHJ\n5c9esQfY0/GKbTOT/wD0kncDD2fm3zcdZIp3Ak9n5kRmvgJ8Abhgpifp18J/CPj5iFhW/Yu8Fvjz\nhjP1hepN0VuAnZn5R03nmSoiTouI11ePT2TyL/o3mk01KTN/PzPPysxBJv/O/VVmzniWVZeIOLl6\nI55qqeQSoGeuFMvMF4BnI+LN1VMXAz1xsUCHq+ix5ZzKt4BfioiTqp/hi5l8/21GFs55rHmQma9G\nxAeAe4EB4NOZ+UTDsQ6JiE3AMLAkIvYAN2TmLc2mOuRC4L3A9mqdHOAPMvPuBjN1Oh24rbpSYgFw\nZ2b23OWPPWop8MXJPmAhcEdm3tNspJ9yHbCxmqjtBn6n4TyHRMRJTF75d03TWabKzK0RsRl4GHgV\neIRZfOq2Ly/LlCTNXL8u6UiSZsjCl6RCWPiSVAgLX5IKYeFLUiEsfPWsiBic7x1HI2I0Il7znqF1\n5YqI4Yi4oOPrWyPi1+d6HJXLwlffq67ZPxYMM4tPT0rdsvDV6xZGxG0R8Vi1j/pJcGjf9/8SEfcD\nvxERV0fEQ9U++p/vOO7WiLg5Iv4mInZ3zpgj4iPV3vGPRsSNHWP+RrUn/zcj4h2vFa7a6O1/VmM/\nFhHXVM8PV68WDu79vrH6hCQR8SvVc/dX2b5cbWT374APV/uxHxx31XTZpdmw8NXr3gysz8y3Ai8B\n13Z874eZeVFmfg74Qma+PTPPZfIj5+/rOO504CLgUuBGgIh4N3AZk3uenwv8j47jF2bm+cCHgBuO\nkO99TO5c+Hbg7cDVEbGs+t551Tl+gcmdIi+sblrxZ8C7M/Mi4DSAzBwHPgnclJmtzPz64bJLs2Xh\nq9c9m5kPVI8/y2T5HfR/Oh6/JSK+HhHbgfcA53R8767MPFDd5GVp9dw7gc9k5j8CZGbn/QsObii3\nDRg8Qr5LgH9TbVOxFfhnwM9X33swM/dk5gGgXZ3rXwK7M/Pp6pgj7dsyXXZpVvpyLx0VZereH51f\n/0PH41uZvHPXoxHx20yuhx/0Tx2Po+P3w+0rcvD4/Rz5ZySA6zLz3p94MmJ4yrgHzzXd1t6vZbrs\n0qw4w1ev+xfx4/ueXsXkbd6mswh4vtr6+T1dnPcrwO92rPX/7Czz3Qv8XjUuEfGmI9zU4xvA8mrN\nHuA3O773MpP/HVItLHz1up3Av42Ix4CfBf70MMddz+SSyn10sZ1ytYvknwNj1XLMf5plvg1MbvH7\ncHWp5p/xGq8KMvMHTL4PcU/1hvPfA9+vvv0XwOVT3rSV5oy7ZUrzLCJOqW7SHsCfALsy86amc+nY\n5wxfmn9XV68qngB+hslXBVLtnOFLUiGc4UtSISx8SSqEhS9JhbDwJakQFr4kFcLCl6RC/H9s0aLy\nx6pPFwAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f528e1484e0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert the tree to newick string format\n",
"tree = tree_to_newick(root)\n",
"\n",
"# Convert the newick string format to Phylo Python Tree\n",
"tree = Phylo.read(StringIO(tree), \"newick\")\n",
"\n",
"print(tree)\n",
"\n",
"# Display the tree\n",
"Phylo.draw(tree)\n",
"\n",
"# Save the tree to newick format nj.nw\n",
"Phylo.write(tree, 'nj.nw', 'newick')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import itertools as it\n",
"import pandas as pd \n",
"#from ete3 import Tree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" A B C D\n",
"A 0 8 7 12\n",
"B 8 0 9 14\n",
"C 7 9 0 11\n",
"D 12 14 11 0\n",
"A\n",
"27\n",
"B\n",
"31\n",
"C\n",
"27\n",
"D\n",
"37\n"
]
}
],
"source": [
"#Build matrix\n",
"\n",
"M1 = [[0,8,7,12], [8,0,9,14], [7,9,0,11], [12,14,11,0]]\n",
"M2 = [[0,2,3,8,14,18],[2,0,3,8,14,18],\n",
" [3,3,0,8,14,18],[8,8,8,0,14,18],\n",
" [14,14,14,14,0,18],[18,18,18,18,18,0]]\n",
"#UPGMA\n",
"M3 = [[0,19,27,8,33,18,13],[19,0,31,18,36,1,13],\n",
" [27,31,0,26,41,32,29],[8,18,26,0,31,17,14],\n",
" [33,36,41,31,0,35,28],[18,1,32,17,35,0,12],\n",
" [13,13,29,14,28,12,0]]\n",
"#Neighbor Joining\n",
"M4 = [[0,2,4,6,6,8],[2,0,4,6,6,8],\n",
" [4,4,0,6,6,8],[6,6,6,0,4,8],\n",
" [6,6,6,4,0,8],[8,8,8,8,8,0]]\n",
"\n",
"#creation de DataFrame \n",
"df1 = pd.DataFrame(M1, index = ['A', 'B', 'C', 'D'], columns = ['A', 'B', 'C', 'D'])\n",
"df2 = pd.DataFrame(M2, index = ['A', 'B', 'C', 'D','E', 'F'], columns = ['A', 'B', 'C', 'D','E', 'F'])\n",
"df3 = pd.DataFrame(M3, index = ['A', 'B', 'C', 'D','E', 'F', 'G'], columns = ['A', 'B', 'C', 'D','E', 'F', 'G'])\n",
"df4 = pd.DataFrame(M4, index = ['A', 'B', 'C', 'D','E', 'F'], columns = ['A', 'B', 'C', 'D','E', 'F'])\n",
"\n",
"print(df1)\n",
"for cluster in df1:\n",
" print(cluster)\n",
" print(sum(df1[cluster]))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dm' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-9b689930fd24>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 55\u001b[0;31m \u001b[0mis_additive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-11-9b689930fd24>\u001b[0m in \u001b[0;36mis_additive\u001b[0;34m(matrix)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmatrix\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0mdistance_ij\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmatrix\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'dm' is not defined"
]
}
],
"source": [
"#1. Introduction \n",
"\n",
"#Question1\n",
"#Version with loops\n",
"def is_ultrametric(matrix):\n",
" for i in matrix:\n",
" for j in matrix:\n",
" if i != j:\n",
" distance_ij = min(dm.loc[i, j], dm.loc[j, i])\n",
" for k in matrix:\n",
" if k != i and k != j:\n",
" distance_ik = min(dm.loc[i, k], dm.loc[k, i])\n",
" distance_kj = min(dm.loc[k, j], dm.loc[j, k])\n",
" if distance_ij > max(distance_ik, distance_kj):\n",
" return False\n",
" return True\n",
"\n",
"#Version with itertools\n",
"def is_ultrametric2(matrix):\n",
" # get column names\n",
" columns = list(matrix.columns)\n",
" combos = itertools.combinations(columns, 3)\n",
" for triplet in combos:\n",
" if matrix.loc[triplet[0]][triplet[2]] > max(matrix.loc[triplet[0]][triplet[1]], matrix.loc[triplet[1]][triplet[2]]):\n",
" return False\n",
" return True \n",
"\n",
"\n",
"def is_additive(matrix):\n",
" for i in matrix:\n",
" for j in matrix:\n",
" if i != j:\n",
" distance_ij = min(dm.loc[i, j], dm.loc[j, i])\n",
" for k in matrix:\n",
" if k != i and k != j:\n",
" for l in matrix:\n",
" if l != k and l != i and l != j:\n",
" distance_ik = min(dm.loc[i, k], dm.loc[k, i])\n",
" distance_kl = min(dm.loc[k, l], dm.loc[l, k])\n",
" distance_lj = min(dm.loc[l, j], dm.loc[j, l])\n",
" if distance_ij > max(distance_ik, distance_kl, distance_lj):\n",
" return False\n",
" return True\n",
"\n",
"def is_additive2(matrix):\n",
" # Better version with itertools\n",
" # get column names\n",
" columns = list(matrix.columns)\n",
" combos = itertools.combinations(columns, 4)\n",
" for triplet in combos:\n",
" if matrix.loc[triplet[0]][triplet[3]] > max(matrix.loc[triplet[0]][triplet[1]], matrix.loc[triplet[1]][triplet[2]],matrix.loc[triplet[2]][triplet[3]]):\n",
" return False\n",
" return True \n",
"\n",
"is_additive(df1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#Question 2\n",
"\n",
"def sum_distance(cluster, mat):\n",
" return(sum(mat[cluster]))\n",
"\n",
"def sum_distance_all(mat):\n",
" sum_allElements=[]\n",
" for cluster in mat:\n",
" sum_allElements.append(sum(mat[cluster]))\n",
" return(sum_allElements)\n",
"\n",
"#sum_distance_all(df1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"la\n",
"(2, 0)\n",
"3.5\n",
"(2, 0)\n",
"3.5\n",
"(2, 0)\n",
"3.5\n",
"(2, 0)\n",
"3.5\n"
]
},
{
"data": {
"text/plain": [
"3.5"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#2. UPGMA\n",
"\n",
"\"\"\"\n",
"Le format Newick permet de decrire les relations phylogeniques entre les especes en partant d'un arbre.\n",
"Ce format met des groupes freres ensemble entre parantheses, en les séparant par une virgule.\n",
"ex : (B,(A,C,E),D);\n",
"Si on a les données sur les longueurs des branches, on met cette information apres l'espece\n",
"ou le groupe frere avec ':'\n",
"ex :(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);\n",
"\"\"\"\n",
"\n",
"def findMin(matDistance):\n",
" mini_value=matDistance.iat[1,0]\n",
" x=1\n",
" y=0\n",
" #Parcours sur les colonnes\n",
" for j in range(len(matDistance)):\n",
" #Parcours sur les lignes\n",
" for i in range(j+1,len(matDistance)):\n",
" if matDistance.iat[i,j]<mini_value:\n",
" mini_value=matDistance.iat[i,j]\n",
" x, y=i, j\n",
" return x,y\n",
"\n",
"def UPGMA(matDistance):\n",
" dico_especes={}\n",
" branch_length = np.zeros(len(matD))#sert à savoir la longueur des branches des sous-arbres\n",
" for cluster in matDistance:\n",
" dico_especes[cluster]=\"\"\n",
" for i in range(len(matDistance)):\n",
" #while (len(matDistance)>1):\n",
" #Trouver la valeur minimale dans la matrice de distance\n",
" ind_min=findMin(matDistance)\n",
" #Calcul de la longueur des branches\n",
" branch_len=matDistance.iat[ind_min[0],ind_min[1]]/2\n",
" d_c = round(branch_len - branch_length[ind_min[0]], 2)\n",
" d_l = round(branch_len - branch_length[ind_min[1]], 2)\n",
" branch_length[ind_min[0]]+=d_c\n",
" return branch_len\n",
"\n",
"UPGMA(df1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
# Example distance matrices, one row per line.
M1 = [
    [0, 8, 7, 12],
    [8, 0, 9, 14],
    [7, 9, 0, 11],
    [12, 14, 11, 0],
]
M2 = [
    [0, 2, 3, 8, 14, 18],
    [2, 0, 3, 8, 14, 18],
    [3, 3, 0, 8, 14, 18],
    [8, 8, 8, 0, 14, 18],
    [14, 14, 14, 14, 0, 18],
    [18, 18, 18, 18, 18, 0],
]
# UPGMA
M3 = [
    [0, 19, 27, 8, 33, 18, 13],
    [19, 0, 31, 18, 36, 1, 13],
    [27, 31, 0, 26, 41, 32, 29],
    [8, 18, 26, 0, 31, 17, 14],
    [33, 36, 41, 31, 0, 35, 28],
    [18, 1, 32, 17, 35, 0, 12],
    [13, 13, 29, 14, 28, 12, 0],
]
# Neighbor Joining
M4 = [
    [0, 2, 4, 6, 6, 8],
    [2, 0, 4, 6, 6, 8],
    [4, 4, 0, 6, 6, 8],
    [6, 6, 6, 0, 4, 8],
    [6, 6, 6, 4, 0, 8],
    [8, 8, 8, 8, 8, 0],
]
StepMatrix = [
    [0, 3, 4, 9],
    [3, 0, 2, 4],
    [4, 2, 0, 4],
    [9, 4, 4, 0],
]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment