Commit d324c775 by righetti2

Upload New File

parent ac780633
#!/bin/bash
#SBATCH --job-name=Mo12_2014
#SBATCH --output=/shared/home/righettin/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.out
#SBATCH --error=/shared/home/righettin/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.error
#SBATCH --cpus-per-task=48
#SBATCH --nodes=1
# Variables to be set manually, same for the slurms job and outputs
genome="Mo12_2014" # Declaring the genome ID variable
fr_ln="150" # Declaring the fragment length used for sequencing
autosome_sum_length="2937639396" # Declaring the sum of the length of all autosomes in the reference, third column of the Data/References/hg38/hg38.ungapped.lengths file
hg38_genome_txt="/shared/home/righettin/Data/References/hg38/hg38.genome.txt"
hg38_genes_protein_coding_bed="/shared/home/righettin/Data/References/hg38/hg38_genes_protein_coding_tabdelimited.bed"
echo "Performing the pipeline for $genome sequenced with fragment lenght equals to $fr_ln bp and mapped to the reference hg38.
All necessary packages and programs are installed in the conda environmente named "CNVconda", all scripts are in /shared/home/righettin/Scripts directory."
# SECTION A: READS MAPPING
echo "
-. .-. .-. .-. .-. .-. .
||\|||\ /|||\|||\ /|||\|||\ /|
|/ \|||\|||/ \|||\|||/ \|||\||
~ \`-~ \`-\` \`-~ \`-\` \`-~ \`-
SECTION A: READS MAPPING
In this first section the reads of $genome are mapped to the human reference hg38 through bwa aln.
The output is then processed through some filters."
# 1.a) Alignment to the Reference Genome
echo "1.a) Mapping of $genome on hg38:"
bwa aln -t 48 -l 16500 -n 0.01 -o 2 /shared/home/righettin/Data/References/hg38/bwa_2021/hg38 /shared/home/righettin/Data/FASTQ_Files/$genome/${genome}.fastq > /shared/home/righettin/Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai
echo "Mapping completed."
# 1.b) Format Conversion
echo "1.b) Format convertion:"/shared/home/righettin/Data/References/hg38/hg38.genome.txt
bwa samse /shared/home/righettin/Data/References/hg38/bwa_2021/hg38 /shared/home/righettin/Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai /shared/home/righettin/Data/FASTQ_Files/$genome/${genome}.fastq \
| samtools view -F 4 -h -Su \
| samtools sort -o /shared/home/righettin/Analyses/mapping/$genome/${genome}.bam
echo "Format convertion completed."
# 2) Filter Out PCR Duplicates
echo "2) Filtering out PCR duplicates:"
java -jar /shared/home/righettin/Programs/picard.jar MarkDuplicates --INPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}.bam \
--OUTPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam \
--METRICS_FILE /shared/home/righettin/Analyses/mapping/$genome/${genome}_pcr_duplicates.metrics \
--REMOVE_DUPLICATES true \
--REMOVE_SEQUENCING_DUPLICATES true
echo "PCR duplicates removed."
# 3) Filter Reads with Percentage of Identity < than 90% and Shorter than 30bp
echo "3.a) Bed File Generation and Mismatch Calculation to remove reads with percentage of identity lower than 90% and shorter than 30bp:"
bedtools bamtobed -ed -i /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam > /shared/home/righettin/Analyses/mapping/$genome/${genome}_ed.bed
echo "Bed File created."
echo "3.b) Calculating Read Lenght and Percentage of Mismatches:"
awk -F"\t" '{print $1"\t"$3-$2"\t"$5"\t"($5/($3-$2)*100)"\t"$4}' /shared/home/righettin/Analyses/mapping/$genome/${genome}_ed.bed > /shared/home/righettin/Analyses/mapping/$genome/${genome}_read_length_mismatches.txt
echo "Read Lenght and Percentage of Mismatches calculated."
echo "3.c) Filtering Out Reads with Percentage of Identity Lower than 90% and shorter than 30bp:"
awk '($4 <= 10) && ($2 >=30) {print $5}' /shared/home/righettin/Analyses/mapping/$genome/${genome}_read_length_mismatches.txt > /shared/home/righettin/Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt
java -jar /shared/home/righettin/Programs/picard.jar FilterSamReads \
--INPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam \
--OUTPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr_90perc_30bp.bam \
--READ_LIST_FILE /shared/home/righettin/Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt \
--FILTER includeReadList
echo "Reads with Percentage of Identity Lower than 90% and shorter than 30bp removed."
# SECTION B: DATA PREPARATION
echo "
-. .-. .-. .-. .-. .-. .
||\|||\ /|||\|||\ /|||\|||\ /|
|/ \|||\|||/ \|||\|||/ \|||\||
~ \`-~ \`-\` \`-~ \`-\` \`-~ \`-
SECTION B: DATA PREPARATION
In this second section the de-novo mapping is used to perform the following steps:
0) Create all necessary directories and subdirectories
1) The BAM file is separated in 24 separate files, one for each chromosome, with numbers from 1 to 22 or letter X/Y. From now on, all the analyses of this section will be performed on each chromosome.
2) The GC-bias for each chromosome is computed and each alingment is corrected based on the results.
3) The depth of coverage for each chromosome is computed.
4) The average depth of coverage for each chromosome is computed and the results are concatenated in a single file.
5) The average depth is calculated excluding the blacklisted regions.
6) A .txt file containing the average depth of coverage of the genome, taking into account all autosomes, is generated.
7) A gzipped file containing the depth of coverage of each base of protein coding genes defined in the .bed reference is obtained. This will be the main input for the CNV estimation"
# 0) Create all necessary directories and subdirectories
echo "0) Creating all necessary directories and subdirectories:"
mkdir -p /shared/home/righettin/Analyses/Data_preparation/$genome/{avgdepth,bam,corrected,coverage}
echo "Directories and subdirectories created."
# 1) BAM handling
echo "1) Separating the BAM obtained with Section A in single chromosome files:"
input_dir="/shared/home/righettin/Analyses/mapping/$genome"
output_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/bam"
INPUT_BAM="${input_dir}/${genome}_nopcr_90perc_30bp.bam"
# Index the input BAM file if not already indexed
if [ ! -f "${INPUT_BAM}.bai" ]; then
echo "Indexing input BAM file..."
samtools index "${INPUT_BAM}"
fi
# Loop over each chromosome
for chr in {1..22} X Y; do
OUTPUT_BAM="${output_dir}/${genome}.hg38.chr${chr}.bam"
# Extract reads for the current chromosome
samtools view -b -o "${OUTPUT_BAM}" "${INPUT_BAM}" "chr${chr}"
done
echo "BAM separated in single chromosome files. Results are store in $output_dir."
# 2) Correct GC bias
echo "2) Correcting the BAM files based on GC-bias:"
data_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
bam_dir="$data_dir/bam" # Path to the single chromosomes BAM files created in step 1
corrected_dir="$data_dir/corrected" # Where the corrected BAMs will be stored
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.genome.txt" # Chromosome size file
chrom_2bit_dir="/shared/home/righettin/Data/References/hg38/dna_sm" # 2bit reference genome directory
blacklist_file="/shared/home/righettin/Data/References/hg38/intervals/blacklist.bed" # Path to blacklist.bed file
for bam_file in "$bam_dir"/*.bam; do
filename=$(basename "$bam_file") # Extract BAM filename
chr=$(echo "$filename" | cut -d '.' -f 3) # Extract chromosome identifier from BAM filename
chr_number=$(echo "$chr" | sed 's/^chr//') # Strip 'chr' to match the format in the chromosome size file
effective_genome_size=$(awk -v chr="$chr_number" '$1 == chr {print $2}' "$chrom_size_file") # Retrieve effective genome size for the chromosome
# Index the BAM file
echo "Indexing $bam_file...:"
samtools index "$bam_file"
# Compute GC bias for the chromosome
echo "Computing GC bias for $chr...:"
computeGCBias -b "$bam_file" --effectiveGenomeSize "$effective_genome_size" -g "$chrom_2bit_dir/Homo_sapiens.hg38.dna_sm.${chr}.2bit" \
-bl "$blacklist_file" -o "$corrected_dir/${genome}.hg38.${chr}_GCfreq.txt" -l "$fr_ln" -p 48
# Correct GC bias for the chromosome
echo "Correcting GC bias for $chr...:"
correctGCBias -b "$bam_file" --effectiveGenomeSize "$effective_genome_size" -g "$chrom_2bit_dir/Homo_sapiens.hg38.dna_sm.${chr}.2bit" \
-freq "$corrected_dir/${genome}.hg38.${chr}_GCfreq.txt" -o "$corrected_dir/${genome}.hg38.${chr}.corrected.bam" -p 48
done
echo "BAM files corrected from GC-bias. Results are stored in $corrected_dir."
mkdir -p "$corrected_dir/freq"
mv "$corrected_dir"/*_GCfreq.txt "$corrected_dir/freq/"
echo "The output txt files of the correctGCBias are store in $corrected_dir/freq."
# 3) Compute Depth of Coverage
echo "3) Computing Depth of Coverage for each Chromosome:"
data_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
corrected_dir="$data_dir/corrected"
coverage_dir="$data_dir/coverage"
for corrected_bam_file in "$corrected_dir"/*.corrected.bam; do
filename=$(basename "$corrected_bam_file") # Extract BAM filename
chr=$(echo "$filename" | cut -d '.' -f 3) # Extract chromosome identifier from BAM filename
output_prefix="$genome.hg38.$chr" # Define output file prefix for mosdepth
# Index the BAM file
echo "Indexing $corrected_bam_file...:"
samtools index "$corrected_bam_file"
# Run mosdepth for the chromosome
mosdepth -x --chrom "$chr" "$coverage_dir/$output_prefix" "$corrected_bam_file"
done
echo "Depth of coverage for each chromosome computed. Results are stored in $coverage_dir."
# 4) Compute the Average Depth of Coverage for each Chromosome
echo "4a) Computing Depth of Coverage for each Chromosome:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
avgdepth_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth"
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.genome.txt"
for coverage_file in "$coverage_dir"/*.per-base.bed.gz; do
filename=$(basename "$coverage_file") # Extract coverage filename
chr=$(echo "$filename" | cut -d '.' -f 3) # Extract chromosome identifier from coverage filename
avgdepth_output="$avgdepth_dir/$genome.hg38.$chr.avgdepth.txt" # Define output file path for average depth
chr_number=$(echo "$chr" | sed 's/^chr//') # Strip 'chr' to match the format in the chromosome size file
chr_size=$(awk -v chr="$chr_number" '$1 == chr {print $2}' "$chrom_size_file") # Retrieve length for the chromosome
# Calculate average depth and write to output file
echo "calculating average depth of coverage for "$chr":"
zcat $coverage_file | awk -v chrsize=$chr_size '{ sum += (($3-$2)*$4) } END{printf("'$chr_number'\t%.4f\n",sum/chrsize)}' > "$avgdepth_output"
done
# Concatenate all chromosome average depth files into a single genome-wide average depth file
echo "4b) ...and concatenate the results in one single file per-genome:"
cat "$avgdepth_dir/$genome.hg38."*.avgdepth.txt > "$avgdepth_dir/$genome.hg38.avgdepth.txt"
echo "Average Depth of Coverage estimated. Results are stored in $avgdepth_dir."
# 5) Compute the Average Depth of Coverage for each Chromosome excluding blacklisted regions (change the chrom_size_file)
echo "5) Computing Depth of Coverage for each Chromosome excluding blacklisted regions:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
avgdepth_nobl_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth/nobl"
mkdir -p "$avgdepth_nobl_dir" # Create avgdepth/nobl directory if it doesn't exist
blacklist_file="/path/to/blacklist.bed" # Define blacklist file path
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.ungapped.nobl.lengths"
for coverage_file in "$coverage_dir"/*.per-base.bed.gz; do
filename=$(basename "$coverage_file") # Extract coverage filename
chr=$(echo "$filename" | cut -d '.' -f 3) # Extract chromosome identifier from coverage filename
avgdepth_output="$avgdepth_nobl_dir/$genome.hg38.$chr.nobl.avgdepth.txt" # Define output file path for average depth
chr_number=$(echo "$chr" | sed 's/^chr//') # Strip 'chr' to match the format in the chromosome size file
chr_size=$(awk -v chr="$chr_number" '$1 == chr {print $2}' "$chrom_size_file") # Retrieve length for the chromosome
# Calculate average depth and write to output file
echo "calculating average depth of coverage for "$chr":"
zcat $coverage_file | awk -v chrsize=$chr_size '{ sum += (($3-$2)*$4) } END{printf("'$chr_number'\t%.4f\n",sum/chrsize)}' > "$avgdepth_output"
done
# Concatenate all chromosome average depth files into a single genome-wide average depth file
echo "5b) ...and concatenate the results in one single file per-genome:"
cat "$avgdepth_nobl_dir/$genome.hg38."*.nobl.avgdepth.txt > "$avgdepth_nobl_dir/$genome.hg38.nobl.avgdepth.txt"
echo "Average Depth of Coverage estimated. Results are stored in $avgdepth_nobl_dir."
# 6) Computing Average Depth of Coverage for Autosomes
echo "6) Computing Average Depth of Coverage for Autosomes:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
genome_doc_file="/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.doc.txt"
# Loop through autosomes (chromosomes 1 to 22)
for i in {1..22}; do
# Compute sum of coverage per base for each autosomal chromosome
zcat "$coverage_dir/$genome.hg38.chr${i}.per-base.bed.gz";
done |\
awk -v name="$genome" -v sum_length="$autosome_sum_length" '{sum+=($3-$2)*$4} END{printf("%s\t%.2f\n",name,1.0*sum/sum_length)}' > $genome_doc_file
echo "Average Depth of Coverage for Autosomes computed and stored in $genome_doc_file."
# 7) Computing Coverage of Protein-Coding Genes
echo "7) Computing Coverage of Protein-Coding Genes:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
genome_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
beds=""
for i in {1..22}; do beds="${beds} $coverage_dir/$genome.hg38.chr${i}.per-base.bed.gz"; done
bedtools intersect -g /shared/home/righettin/Data/References/hg38/hg38.genome.wout.txt \
-a /shared/home/righettin/Data/References/hg38/hg38_genes_protein_coding_chr_2_nomt.bed -b ${beds} \
-wa -wb | gzip -c > $genome_dir/$genome.hg38.genes.protein_coding.coverage.gz
starting_gz=/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.hg38.genes.protein_coding.coverage.gz
ending_gz=/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz
zcat $starting_gz | sed 's/chr//g' | gzip -c > $ending_gz # take out "chr" from the file
echo "Coverage of Protein-Coding Genes computed and stored in /shared/home/righetti/Analyses/Data_preparation/$genome/$genome.hg38.genes.protein_coding.coverage.gz."
# SECTION C: CNVs ESTIMATE
CNV_estimates_dir="/shared/home/righettin/Analyses/CNV_estimates"
echo "
-. .-. .-. .-. .-. .-. .
||\|||\ /|||\|||\ /|||\|||\ /|
|/ \|||\|||/ \|||\|||/ \|||\||
~ \`-~ \`-\` \`-~ \`-\` \`-~ \`-
SECTION C: CNVs ESTIMATE
In this section CNVs are estimated for each genome. Results are stored in $CNV_estimates_dir.
"
# 1) Gene copy number estimates
echo "Estimating gene CNV for $genome:"
chrom_doc="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth/nobl/$genome.hg38.nobl.avgdepth.txt"
gene_doc="/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz"
python3 /shared/home/righettin/Scripts/CNV_scripts/computeGeneCopy.py --chrom-doc $chrom_doc --gene-doc $gene_doc > $CNV_estimates_dir/$genome.nobl.cnv.tsv
echo "CNV estimated for $genome."
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment