Commit dd722622 by righetti2

Update read2CNV.slurm

parent c0aa06da
#!/bin/bash
#SBATCH --job-name=Mo12_2014
#SBATCH --output=/shared/home/righettin/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.out
#SBATCH --error=/shared/home/righettin/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.error
#SBATCH --output=/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.out
#SBATCH --error=/Analyses/slurm_outputs/Mo12_2014/Mo12_2014.error
#SBATCH --cpus-per-task=48
#SBATCH --nodes=1
......@@ -9,11 +9,11 @@
genome="Mo12_2014" # Declaring the genome ID variable
fr_ln="150" # Declaring the fragment length used for sequencing
autosome_sum_length="2937639396" # Declaring the sum of the length of all autosomes in the reference, third column of the Data/References/hg38/hg38.ungapped.lengths file
hg38_genome_txt="/shared/home/righettin/Data/References/hg38/hg38.genome.txt"
hg38_genes_protein_coding_bed="/shared/home/righettin/Data/References/hg38/hg38_genes_protein_coding_tabdelimited.bed"
hg38_genome_txt="/Data/References/hg38/hg38.genome.txt"
hg38_genes_protein_coding_bed="/Data/References/hg38/hg38_genes_protein_coding_tabdelimited.bed"
echo "Performing the pipeline for $genome sequenced with fragment lenght equals to $fr_ln bp and mapped to the reference hg38.
All necessary packages and programs are installed in the conda environmente named "CNVconda", all scripts are in /shared/home/righettin/Scripts directory."
All necessary packages and programs are installed in the conda environmente named "CNVconda", all scripts are in /Scripts directory."
# SECTION A: READS MAPPING
......@@ -30,24 +30,24 @@ The output is then processed through some filters."
# 1.a) Alignment to the Reference Genome
echo "1.a) Mapping of $genome on hg38:"
bwa aln -t 48 -l 16500 -n 0.01 -o 2 /shared/home/righettin/Data/References/hg38/bwa_2021/hg38 /shared/home/righettin/Data/FASTQ_Files/$genome/${genome}.fastq > /shared/home/righettin/Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai
bwa aln -t 48 -l 16500 -n 0.01 -o 2 /Data/References/hg38/bwa_2021/hg38 /Data/FASTQ_Files/$genome/${genome}.fastq > /Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai
echo "Mapping completed."
# 1.b) Format Conversion
echo "1.b) Format convertion:"/shared/home/righettin/Data/References/hg38/hg38.genome.txt
bwa samse /shared/home/righettin/Data/References/hg38/bwa_2021/hg38 /shared/home/righettin/Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai /shared/home/righettin/Data/FASTQ_Files/$genome/${genome}.fastq \
echo "1.b) Format convertion:"/Data/References/hg38/hg38.genome.txt
bwa samse /Data/References/hg38/bwa_2021/hg38 /Analyses/mapping/$genome/${genome}_mapped_hg38_multi.sai /Data/FASTQ_Files/$genome/${genome}.fastq \
| samtools view -F 4 -h -Su \
| samtools sort -o /shared/home/righettin/Analyses/mapping/$genome/${genome}.bam
| samtools sort -o /Analyses/mapping/$genome/${genome}.bam
echo "Format convertion completed."
# 2) Filter Out PCR Duplicates
echo "2) Filtering out PCR duplicates:"
java -jar /shared/home/righettin/Programs/picard.jar MarkDuplicates --INPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}.bam \
--OUTPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam \
--METRICS_FILE /shared/home/righettin/Analyses/mapping/$genome/${genome}_pcr_duplicates.metrics \
java -jar /Programs/picard.jar MarkDuplicates --INPUT /Analyses/mapping/$genome/${genome}.bam \
--OUTPUT /Analyses/mapping/$genome/${genome}_nopcr.bam \
--METRICS_FILE /Analyses/mapping/$genome/${genome}_pcr_duplicates.metrics \
--REMOVE_DUPLICATES true \
--REMOVE_SEQUENCING_DUPLICATES true
......@@ -57,23 +57,23 @@ echo "PCR duplicates removed."
# 3) Filter Reads with Percentage of Identity < than 90% and Shorter than 30bp
echo "3.a) Bed File Generation and Mismatch Calculation to remove reads with percentage of identity lower than 90% and shorter than 30bp:"
bedtools bamtobed -ed -i /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam > /shared/home/righettin/Analyses/mapping/$genome/${genome}_ed.bed
bedtools bamtobed -ed -i /Analyses/mapping/$genome/${genome}_nopcr.bam > /Analyses/mapping/$genome/${genome}_ed.bed
echo "Bed File created."
echo "3.b) Calculating Read Lenght and Percentage of Mismatches:"
awk -F"\t" '{print $1"\t"$3-$2"\t"$5"\t"($5/($3-$2)*100)"\t"$4}' /shared/home/righettin/Analyses/mapping/$genome/${genome}_ed.bed > /shared/home/righettin/Analyses/mapping/$genome/${genome}_read_length_mismatches.txt
awk -F"\t" '{print $1"\t"$3-$2"\t"$5"\t"($5/($3-$2)*100)"\t"$4}' /Analyses/mapping/$genome/${genome}_ed.bed > /Analyses/mapping/$genome/${genome}_read_length_mismatches.txt
echo "Read Lenght and Percentage of Mismatches calculated."
echo "3.c) Filtering Out Reads with Percentage of Identity Lower than 90% and shorter than 30bp:"
awk '($4 <= 10) && ($2 >=30) {print $5}' /shared/home/righettin/Analyses/mapping/$genome/${genome}_read_length_mismatches.txt > /shared/home/righettin/Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt
java -jar /shared/home/righettin/Programs/picard.jar FilterSamReads \
--INPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr.bam \
--OUTPUT /shared/home/righettin/Analyses/mapping/$genome/${genome}_nopcr_90perc_30bp.bam \
--READ_LIST_FILE /shared/home/righettin/Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt \
awk '($4 <= 10) && ($2 >=30) {print $5}' /Analyses/mapping/$genome/${genome}_read_length_mismatches.txt > /Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt
java -jar /Programs/picard.jar FilterSamReads \
--INPUT /Analyses/mapping/$genome/${genome}_nopcr.bam \
--OUTPUT /Analyses/mapping/$genome/${genome}_nopcr_90perc_30bp.bam \
--READ_LIST_FILE /Analyses/mapping/$genome/${genome}_filtered_read_names_30bp.txt \
--FILTER includeReadList
echo "Reads with Percentage of Identity Lower than 90% and shorter than 30bp removed."
......@@ -102,15 +102,15 @@ In this second section the de-novo mapping is used to perform the following step
# 0) Create all necessary directories and subdirectories
echo "0) Creating all necessary directories and subdirectories:"
mkdir -p /shared/home/righettin/Analyses/Data_preparation/$genome/{avgdepth,bam,corrected,coverage}
mkdir -p /Analyses/Data_preparation/$genome/{avgdepth,bam,corrected,coverage}
echo "Directories and subdirectories created."
# 1) BAM handling
echo "1) Separating the BAM obtained with Section A in single chromosome files:"
input_dir="/shared/home/righettin/Analyses/mapping/$genome"
output_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/bam"
input_dir="/Analyses/mapping/$genome"
output_dir="/Analyses/Data_preparation/$genome/bam"
INPUT_BAM="${input_dir}/${genome}_nopcr_90perc_30bp.bam"
# Index the input BAM file if not already indexed
......@@ -133,12 +133,12 @@ echo "BAM separated in single chromosome files. Results are store in $output_dir
# 2) Correct GC bias
echo "2) Correcting the BAM files based on GC-bias:"
data_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
data_dir="/Analyses/Data_preparation/$genome"
bam_dir="$data_dir/bam" # Path to the single chromosomes BAM files created in step 1
corrected_dir="$data_dir/corrected" # Where the corrected BAMs will be stored
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.genome.txt" # Chromosome size file
chrom_2bit_dir="/shared/home/righettin/Data/References/hg38/dna_sm" # 2bit reference genome directory
blacklist_file="/shared/home/righettin/Data/References/hg38/intervals/blacklist.bed" # Path to blacklist.bed file
chrom_size_file="/Data/References/hg38/hg38.genome.txt" # Chromosome size file
chrom_2bit_dir="/Data/References/hg38/dna_sm" # 2bit reference genome directory
blacklist_file="/Data/References/hg38/intervals/blacklist.bed" # Path to blacklist.bed file
for bam_file in "$bam_dir"/*.bam; do
filename=$(basename "$bam_file") # Extract BAM filename
......@@ -171,7 +171,7 @@ echo "The output txt files of the correctGCBias are store in $corrected_dir/freq
# 3) Compute Depth of Coverage
echo "3) Computing Depth of Coverage for each Chromosome:"
data_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
data_dir="/Analyses/Data_preparation/$genome"
corrected_dir="$data_dir/corrected"
coverage_dir="$data_dir/coverage"
......@@ -193,9 +193,9 @@ echo "Depth of coverage for each chromosome computed. Results are stored in $cov
# 4) Compute the Average Depth of Coverage for each Chromosome
echo "4a) Computing Depth of Coverage for each Chromosome:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
avgdepth_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth"
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.genome.txt"
coverage_dir="/Analyses/Data_preparation/$genome/coverage"
avgdepth_dir="/Analyses/Data_preparation/$genome/avgdepth"
chrom_size_file="/Data/References/hg38/hg38.genome.txt"
for coverage_file in "$coverage_dir"/*.per-base.bed.gz; do
filename=$(basename "$coverage_file") # Extract coverage filename
......@@ -217,11 +217,11 @@ echo "Average Depth of Coverage estimated. Results are stored in $avgdepth_dir."
# 5) Compute the Average Depth of Coverage for each Chromosome excluding blacklisted regions (change the chrom_size_file)
echo "5) Computing Depth of Coverage for each Chromosome excluding blacklisted regions:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
avgdepth_nobl_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth/nobl"
coverage_dir="/Analyses/Data_preparation/$genome/coverage"
avgdepth_nobl_dir="/Analyses/Data_preparation/$genome/avgdepth/nobl"
mkdir -p "$avgdepth_nobl_dir" # Create avgdepth/nobl directory if it doesn't exist
blacklist_file="/path/to/blacklist.bed" # Define blacklist file path
chrom_size_file="/shared/home/righettin/Data/References/hg38/hg38.ungapped.nobl.lengths"
chrom_size_file="/Data/References/hg38/hg38.ungapped.nobl.lengths"
for coverage_file in "$coverage_dir"/*.per-base.bed.gz; do
filename=$(basename "$coverage_file") # Extract coverage filename
......@@ -243,8 +243,8 @@ echo "Average Depth of Coverage estimated. Results are stored in $avgdepth_nobl_
# 6) Computing Average Depth of Coverage for Autosomes
echo "6) Computing Average Depth of Coverage for Autosomes:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
genome_doc_file="/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.doc.txt"
coverage_dir="/Analyses/Data_preparation/$genome/coverage"
genome_doc_file="/Analyses/Data_preparation/$genome/$genome.doc.txt"
# Loop through autosomes (chromosomes 1 to 22)
for i in {1..22}; do
......@@ -258,17 +258,17 @@ echo "Average Depth of Coverage for Autosomes computed and stored in $genome_doc
# 7) Computing Coverage of Protein-Coding Genes
echo "7) Computing Coverage of Protein-Coding Genes:"
coverage_dir="/shared/home/righettin/Analyses/Data_preparation/$genome/coverage"
genome_dir="/shared/home/righettin/Analyses/Data_preparation/$genome"
coverage_dir="/Analyses/Data_preparation/$genome/coverage"
genome_dir="/Analyses/Data_preparation/$genome"
beds=""
for i in {1..22}; do beds="${beds} $coverage_dir/$genome.hg38.chr${i}.per-base.bed.gz"; done
bedtools intersect -g /shared/home/righettin/Data/References/hg38/hg38.genome.wout.txt \
-a /shared/home/righettin/Data/References/hg38/hg38_genes_protein_coding_chr_2_nomt.bed -b ${beds} \
bedtools intersect -g /Data/References/hg38/hg38.genome.wout.txt \
-a /Data/References/hg38/hg38_genes_protein_coding_chr_2_nomt.bed -b ${beds} \
-wa -wb | gzip -c > $genome_dir/$genome.hg38.genes.protein_coding.coverage.gz
starting_gz=/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.hg38.genes.protein_coding.coverage.gz
ending_gz=/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz
starting_gz=/Analyses/Data_preparation/$genome/$genome.hg38.genes.protein_coding.coverage.gz
ending_gz=/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz
zcat $starting_gz | sed 's/chr//g' | gzip -c > $ending_gz # take out "chr" from the file
......@@ -276,7 +276,7 @@ zcat $starting_gz | sed 's/chr//g' | gzip -c > $ending_gz # take out "chr" fro
echo "Coverage of Protein-Coding Genes computed and stored in /shared/home/righetti/Analyses/Data_preparation/$genome/$genome.hg38.genes.protein_coding.coverage.gz."
# SECTION C: CNVs ESTIMATE
CNV_estimates_dir="/shared/home/righettin/Analyses/CNV_estimates"
CNV_estimates_dir="/Analyses/CNV_estimates"
echo "
-. .-. .-. .-. .-. .-. .
||\|||\ /|||\|||\ /|||\|||\ /|
......@@ -290,9 +290,9 @@ In this section CNVs are estimated for each genome. Results are stored in $CNV_e
# 1) Gene copy number estimates
echo "Estimating gene CNV for $genome:"
chrom_doc="/shared/home/righettin/Analyses/Data_preparation/$genome/avgdepth/nobl/$genome.hg38.nobl.avgdepth.txt"
gene_doc="/shared/home/righettin/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz"
chrom_doc="/Analyses/Data_preparation/$genome/avgdepth/nobl/$genome.hg38.nobl.avgdepth.txt"
gene_doc="/Analyses/Data_preparation/$genome/$genome.fixed.hg38.genes.protein_coding.coverage.gz"
python3 /shared/home/righettin/Scripts/CNV_scripts/computeGeneCopy.py --chrom-doc $chrom_doc --gene-doc $gene_doc > $CNV_estimates_dir/$genome.nobl.cnv.tsv
python3 /Scripts/CNV_scripts/computeGeneCopy.py --chrom-doc $chrom_doc --gene-doc $gene_doc > $CNV_estimates_dir/$genome.nobl.cnv.tsv
echo "CNV estimated for $genome."
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment