# Commit ac780633 by righetti2
#
# Upload New File
#
# parent 40cffd97
# ---------------------------------------------------------
# HUMAN PROTEIN ATLAS INFO EXTRACTOR
# This R-executable script extracts information from the two downloadable Human Protein Atlas expression datasets at https://www.proteinatlas.org/about/download,
# for RNA (https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip) and protein (https://www.proteinatlas.org/download/normal_tissue.tsv.zip), respectively.
# For each gene, the 3 organs (or tissue groups) with the highest expression in the RNA dataset and in the protein dataset are identified. In the case of ties,
# a frequent occurrence in the discrete categories of the protein dataset, more than 3 organs are taken.
# These data can then be cross-referenced with a list of genes of interest, in our case with the list of genes with CNVs at the population level.
# Before running the script, change the file paths to your own reference paths.
# ---------------------------------------------------------
# Load the required library for data manipulation
library(dplyr)
# ---------------------------------------------------------
# SECTION 1: GROUP DEFINITION
# ---------------------------------------------------------
# Tissue groups used to collapse individual tissue names into broader
# organ-level categories. Membership is tested with %in% against the `Tissue`
# column, so the spelling here has to match the dataset values exactly.
brain <- c(
  "hippocampal formation", "amygdala", "basal ganglia", "midbrain",
  "spinal cord", "cerebral cortex", "cerebellum", "hypothalamus",
  "choroid plexus"
)
eye <- c("retina")
endocrine_tissues <- c(
  "thyroid gland", "parathyroid gland", "adrenal gland", "pituitary gland"
)
respiratory_system <- c("nasopharynx", "bronchus", "lung")
proximal_digestive_tract <- c(
  "oral mucosa", "salivary gland", "esophagus", "tongue"
)
liver_and_gallbladder <- c("liver", "gallbladder")
gastrointestinal_tract <- c(
  "stomach", "small intestine", "colon", "rectum", "duodenum"
)
pancreas <- c("pancreas")
kidney_and_urinary_bladder <- c("kidney", "urinary bladder")
male_tissues <- c("testis", "epididymis", "prostate", "seminal vesicle")
female_tissues <- c(
  "endometrium", "vagina", "breast", "cervix", "fallopian tube", "ovary",
  "placenta"
)
muscle_tissues <- c("heart muscle", "skeletal muscle", "smooth muscle")
connective_and_soft_tissues <- c("soft tissues", "adipose tissue")
skin <- c("skin")
bone_marrow_and_lymphoid_tissues <- c(
  "bone marrow", "appendix", "thymus", "spleen", "lymph node", "tonsil"
)
# Map one tissue name to the label of the tissue group that contains it.
# `tissue` is a single value (the function is applied element by element).
# Groups are tried in the order listed below and the first match wins; a
# tissue that belongs to none of them is labelled "Other".
map_tissue_to_group <- function(tissue) {
  # Output label -> member tissues, in lookup order
  members_by_label <- list(
    Brain = brain,
    Eye = eye,
    Endocrine_tissues = endocrine_tissues,
    Respiratory_system = respiratory_system,
    Proximal_digestive_tract = proximal_digestive_tract,
    Liver_and_gallbladder = liver_and_gallbladder,
    Gastrointestinal_tract = gastrointestinal_tract,
    Pancreas = pancreas,
    Kidney_and_urinary_bladder = kidney_and_urinary_bladder,
    Male_tissues = male_tissues,
    Female_tissues = female_tissues,
    Muscle_tissues = muscle_tissues,
    Connective_and_soft_tissues = connective_and_soft_tissues,
    Skin = skin,
    Bone_marrow_and_lymphoid_tissues = bone_marrow_and_lymphoid_tissues
  )
  for (label in names(members_by_label)) {
    if (tissue %in% members_by_label[[label]]) {
      return(label)
    }
  }
  "Other"
}
# ---------------------------------------------------------
# SECTION 2: RNA DATA PROCESSING
# ---------------------------------------------------------
# RNA dataset input: Replace the path with your own RNA dataset path.
# The file is read as tab-separated text; the code in this section uses its
# `Tissue`, `Gene`, `Gene.name` and `nTPM` columns.
rna_expression_data <- read.csv(file = "path/to/rna_tissue_consensus.tsv", sep = "\t")
# Apply tissue group mapping to RNA data.
# Each distinct tissue name is mapped once and the labels are then spread back
# over all rows with match(). vapply() guarantees an unnamed character vector,
# including for a table with no rows, where sapply() would return an empty list.
rna_expression_data$Group <- local({
  tissue_names <- as.character(rna_expression_data$Tissue)
  distinct_tissues <- unique(tissue_names)
  distinct_groups <- vapply(
    distinct_tissues, map_tissue_to_group, character(1),
    USE.NAMES = FALSE
  )
  distinct_groups[match(tissue_names, distinct_tissues)]
})
# Return the row of `data` that holds the largest `nTPM` value.
# which.max() selects the first row when several share the maximum and skips
# NA values; if there is nothing to select, a zero-row result comes back.
# Note: the RNA pipeline in this section does not call this helper, and
# Section 3 defines another function under the same name.
extract_highest_level_tissue <- function(data) {
  data[which.max(data$nTPM), ]
}
# Step 1: for every gene, keep exactly one representative tissue per tissue
# group -- the tissue with the highest nTPM in that group.
# slice(which.max(nTPM)) is used rather than filter(nTPM == max(nTPM)) because
# it returns a single row per (gene, group): the first row at the maximum,
# ignoring NA. Keeping every tied tissue would let one group occupy several of
# the three slots selected in step 2, and an NA value would make the comparison
# NA and silently drop the whole group.
all_genes_organs_rna <- rna_expression_data %>%
  group_by(Gene, Gene.name, Group) %>%
  slice(which.max(nTPM)) %>%
  ungroup() %>%
  as.data.frame()
# Step 2: per gene, select the 3 tissue groups with the highest nTPM.
# Ties at the cut-off are kept on purpose (with_ties = TRUE), so a gene can
# end up with more than 3 rows.
all_genes_3organs_rna <- all_genes_organs_rna %>%
  group_by(Gene) %>%
  slice_max(order_by = nTPM, n = 3, with_ties = TRUE) %>%
  ungroup() %>%
  as.data.frame()
# Remove rows where nTPM is equal to 0 (a gene with fewer than 3 expressing
# groups would otherwise be padded with non-expressing ones)
all_genes_3organs_rna <- all_genes_3organs_rna[all_genes_3organs_rna$nTPM != 0, ]
# Export processed RNA data: Replace with your desired output path
write.table(
  all_genes_3organs_rna,
  file = "path/to/output/all_genes_3organs_rna.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# ---------------------------------------------------------
# SECTION 3: PROTEIN DATA PROCESSING
# ---------------------------------------------------------
# Protein dataset input: Replace the path with your own protein dataset path.
# The file is read as tab-separated text; the code in this section uses its
# `Tissue`, `Gene`, `Gene.name`, `Level` and `Reliability` columns.
protein_expression_data <- read.csv(file = "path/to/normal_tissue.tsv", sep = "\t")
# Apply tissue group mapping to protein data.
# Each distinct tissue name is mapped once and the labels are then spread back
# over all rows with match(). vapply() guarantees an unnamed character vector,
# including for a table with no rows, where sapply() would return an empty list.
protein_expression_data$Group <- local({
  tissue_names <- as.character(protein_expression_data$Tissue)
  distinct_tissues <- unique(tissue_names)
  distinct_groups <- vapply(
    distinct_tissues, map_tissue_to_group, character(1),
    USE.NAMES = FALSE
  )
  distinct_groups[match(tissue_names, distinct_tissues)]
})
# Remove rows where the reliability of protein expression is uncertain.
# which() turns the comparison into row positions, so a missing Reliability
# drops the row; a bare logical index would insert an all-NA row instead.
protein_expression_data <- protein_expression_data[
  which(protein_expression_data$Reliability != "Uncertain"),
]
# Return the row of `data` whose protein `Level` ranks highest.
# Levels are ranked by their position in `ranked_levels` (lowest first).
# which.max() picks the first row when several share the top rank and skips
# levels that are not in the list; if none can be ranked, the result has
# zero rows.
extract_highest_level_tissue <- function(data) {
  ranked_levels <- c(
    "N/A", "Ascending", "Descending", "Not representative",
    "Not detected", "Low", "Medium", "High"
  )
  level_rank <- match(data$Level, ranked_levels)
  data[which.max(level_rank), ]
}
# For every gene and tissue group, keep the single tissue with the
# highest-ranking protein level, as chosen by extract_highest_level_tissue().
# NOTE(review): this table is not referenced again in the visible part of the
# script -- confirm whether the final selection was meant to start from it.
all_genes_organs_protein <- ungroup(
  do(
    group_by(protein_expression_data, Gene, Gene.name, Group),
    extract_highest_level_tissue(.)
  )
)
# Return every row of `data` whose protein `Level` is the highest one present.
#
# Args:
#   data: data frame with a `Level` column.
# Returns:
#   All rows at the top-ranked level (several rows when tied). An empty
#   data.frame() when no row has a rankable level, e.g. when every row is
#   "Not detected" or `data` has no rows.
extract_highest_levels <- function(data) {
  # Levels from lowest to highest rank. "Not detected" is left out on purpose:
  # match() then gives it NA, exactly like a missing or unrecognised level, and
  # all of these are excluded from the result.
  level_order <- c(
    "N/A", "Ascending", "Descending", "Not representative",
    "Low", "Medium", "High"
  )
  level_rank <- match(data$Level, level_order)
  if (all(is.na(level_rank))) {
    return(data.frame()) # If no data remains, return an empty dataframe
  }
  # which() ignores NA ranks, so unrankable rows can neither poison max() nor
  # come back as all-NA rows; drop = FALSE keeps a data frame in every case.
  top_rank <- max(level_rank, na.rm = TRUE)
  data[which(level_rank == top_rank), , drop = FALSE]
}
# For each gene, keep every tissue that sits at that gene's highest protein
# level, as returned by extract_highest_levels(); tied tissues are all kept.
# NOTE(review): despite the "3organs" name, this step keeps only the single
# top level per gene rather than ranking three organs, and it reads
# protein_expression_data directly -- confirm that this is the intended
# selection.
all_genes_3organs_protein <- as.data.frame(
  ungroup(
    do(group_by(protein_expression_data, Gene), extract_highest_levels(.))
  )
)
# Export processed protein data: Replace with your desired output path
write.table(
  all_genes_3organs_protein,
  file = "path/to/output/all_genes_3organs_protein.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# ---------------------------------------------------------
# END OF SCRIPT
# ---------------------------------------------------------
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment