Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
PRESCOTT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Mustafa Tekpinar
PRESCOTT
Commits
15b17ad6
Commit
15b17ad6
authored
Jan 02, 2025
by
Mustafa Tekpinar
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Updated prescott.py
parent
67c81d47
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
180 additions
and
23 deletions
+180
-23
prescott.py
prescott/prescott.py
+180
-23
No files found.
prescott/prescott.py
View file @
15b17ad6
...
...
@@ -141,7 +141,7 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
mutantList
.
append
(
mutant
)
# print(mutant, row['Protein Consequence'])
# print(infile.split('_')[2])
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
)
.
split
(
'_'
)[
2
]
)
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
))
dfMissense
[
'mutant'
]
=
mutantList
dfMissense
[
'protein'
]
=
proteinNameList
...
...
@@ -281,15 +281,15 @@ def getGnomADOverallFrequencyV2(infile, usePopMax="true"):
mutantList
.
append
(
mutant
)
# print(mutant, row['Protein Consequence'])
# print(infile.split('_')[2])
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
)
.
split
(
'_'
)[
2
]
)
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
))
dfMissense
[
'mutant'
]
=
mutantList
dfMissense
[
'protein'
]
=
proteinNameList
return
(
dfMissense
)
def
getGnomADV4OverallFrequency
(
infile
,
usePopMax
=
"true"
):
def
getGnomADV4
p0_
OverallFrequency
(
infile
,
usePopMax
=
"true"
):
"""
This version is for gnomAD v4.
This version is for gnomAD v4.
0
-Latino/Admixed is renamed as Admixed
-I used Middle Eastern instead of Other field.
"""
...
...
@@ -407,13 +407,140 @@ def getGnomADV4OverallFrequency(infile, usePopMax="true"):
mutantList
.
append
(
mutant
)
# print(mutant, row['Protein Consequence'])
# print(infile.split('_')[2])
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
)
.
split
(
'_'
)[
2
]
)
proteinNameList
.
append
(
os
.
path
.
basename
(
infile
))
dfMissense
[
'mutant'
]
=
mutantList
dfMissense
[
'protein'
]
=
proteinNameList
return
(
dfMissense
)
###############################################################################
def getGnomADV4p1_OverallFrequency(infile, usePopMax="true"):
    """
    Extract missense variants and their log10 allele frequency from a
    gnomAD v4.1 CSV export.

    This version is for gnomAD v4.1
    -Latino/Admixed is renamed as Admixed
    -I used Middle Eastern instead of Other field.

    Parameters
    ----------
    infile : str
        Path of the gnomAD v4.1 CSV file. Its base name is copied verbatim
        into the 'protein' column of the result.
    usePopMax : str, optional
        When it equals "true" (case-insensitive), the frequency of each
        variant is the largest per-population frequency
        (allele count / allele number). Any other value selects the overall
        'Allele Frequency' column instead.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'VEP Annotation' is 'missense_variant', restricted to
        the gnomAD columns listed below, plus:
        - 'Allele Frequency Log': log10 of the selected frequency; a zero
          frequency is replaced by 10**-10 before taking the logarithm.
        - 'Selected Population': name of the allele-count column that gave
          the maximum frequency, or "" when usePopMax is not "true".
        - 'mutant': one-letter mutation string such as 'A10T'.
        - 'protein': os.path.basename(infile).
        The 'index' and 'level_0' columns produced by the two historical
        reset_index() calls are preserved.

    Raises
    ------
    KeyError
        If an expected gnomAD column is missing from the file, or if a
        three-letter residue code is not a key of `one_letter`.
    """
    # Floor used instead of a zero frequency so that log10 stays finite.
    # It means 1 in 10 billion, which is the estimated population in 2050.
    zeroFrequencyFloor = 10 ** -10

    populations = ('African/African American',
                   'Admixed American',
                   'Ashkenazi Jewish',
                   'East Asian',
                   'European (Finnish)',
                   'European (non-Finnish)',
                   'Middle Eastern',
                   'South Asian')
    perPopulationFields = ('Allele Count', 'Allele Number',
                           'Homozygote Count', 'Hemizygote Count')

    generalColumns = ['HGVS Consequence', 'Protein Consequence',
                      'Transcript Consequence', 'VEP Annotation',
                      'ClinVar Germline Classification',
                      'ClinVar Variation ID', 'Flags',
                      'Allele Count', 'Allele Number', 'Allele Frequency',
                      'Homozygote Count', 'Hemizygote Count']
    # Same column order as the gnomAD export: the four fields of the first
    # population, then the four fields of the second one, and so on.
    populationColumns = [field + ' ' + population
                         for population in populations
                         for field in perPopulationFields]
    alleleCountList = ['Allele Count ' + population
                       for population in populations]
    alleleNumberList = ['Allele Number ' + population
                        for population in populations]

    df = pd.read_csv(infile)
    dfMissense = df.loc[df['VEP Annotation'] == 'missense_variant',
                        generalColumns + populationColumns]
    dfMissense = dfMissense.reset_index()

    # str() makes a plain bool argument behave like its string form.
    if str(usePopMax).lower() == "true":
        counts = dfMissense[alleleCountList].to_numpy(dtype=float)
        numbers = dfMissense[alleleNumberList].to_numpy(dtype=float)

        # Populations with a zero allele number are skipped (frequency 0).
        frequencies = np.zeros_like(counts)
        np.divide(counts, numbers, out=frequencies, where=(numbers != 0))
        # Missing data never wins the maximum.
        frequencies[np.isnan(frequencies)] = 0.0

        # argmax returns the first maximum, so ties go to the population
        # listed first; an all-zero row selects the first population.
        bestIndex = frequencies.argmax(axis=1)
        selectedFrequency = frequencies.max(axis=1)
        selectedFrequency = np.where(selectedFrequency > 0.0,
                                     selectedFrequency, zeroFrequencyFloor)
        selectedPopulation = np.asarray(alleleCountList,
                                        dtype=object)[bestIndex]
    else:
        overallFrequency = dfMissense['Allele Frequency'].to_numpy(dtype=float)
        selectedFrequency = np.where(overallFrequency == 0.0,
                                     zeroFrequencyFloor, overallFrequency)
        # Always provide the column: callers read it regardless of the mode.
        selectedPopulation = ""

    dfMissense['Allele Frequency Log'] = np.log10(selectedFrequency)
    dfMissense['Selected Population'] = selectedPopulation

    # Redundant second reset, kept so that the returned frame still carries
    # the 'level_0' column existing callers may rely on.
    dfMissense = dfMissense.reset_index()

    # 'Protein Consequence' looks like 'p.Ala10Thr': characters 2-4 are the
    # source residue, the last three the target, the position in between.
    dfMissense['mutant'] = [
        one_letter[consequence[2:5].upper()]
        + consequence[5:-3]
        + one_letter[consequence[-3:].upper()]
        for consequence in dfMissense['Protein Consequence']
    ]
    dfMissense['protein'] = os.path.basename(infile)

    return (dfMissense)
###############################################################################
...
...
@@ -739,8 +866,8 @@ def main():
'M1A 0.378
\n
'
,
required
=
False
,
default
=
'gemme'
)
main_parser
.
add_argument
(
'--gnomadversion'
,
dest
=
'gnomadversion'
,
type
=
int
,
\
help
=
'An integer value. Default is version 4
(4.0.0) of GnomAD!
\n
Other possible versions are 2 and 3
.'
,
required
=
False
,
default
=
4
)
help
=
'An integer value. Default is version 4
1 (4.1) of GnomAD!
\n
Other possible versions are 2, 3 or 40 (for 4.0)
.'
,
required
=
False
,
default
=
4
1
)
# main_parser.add_argument('--colormap', dest='colormap', type=str, \
# help='A colormap as defined in matplotlib',
# required=False, default='coolwarm_r')
...
...
@@ -762,7 +889,7 @@ def main():
args
=
main_parser
.
parse_args
()
print
(
"
\n\n
@> Running PRESCOTT with the following parameters:
\n\n
"
)
print
(
"@> ESCOTT file : {}"
.
format
(
args
.
escottfile
))
print
(
"@> Frequency file : {}"
.
format
(
args
.
gnomadfile
))
print
(
"@> Frequency file
: {}"
.
format
(
args
.
gnomadfile
))
print
(
"@> Use population max. freq : {}"
.
format
(
str
(
args
.
usepopmax
)
.
lower
()))
print
(
"@> Which equation to use (Default=2): {}"
.
format
(
str
(
args
.
equation
)))
print
(
"@> Scaling coefficient (Default=1.0): {}"
.
format
(
args
.
coefficient
))
...
...
@@ -841,6 +968,7 @@ def main():
myBigMergedDF
[
'log10frequency'
]
=
999.0
myBigMergedDF
[
'labels'
]
=
np
.
nan
myBigMergedDF
[
'position'
]
=
""
myBigMergedDF
[
'Selected Population'
]
=
""
# Assign ESCOTT scores to PRESCOTT scores.
# Then, we will modify them according to different conditions.
...
...
@@ -850,27 +978,54 @@ def main():
if
(
file_extension
==
".csv"
):
print
(
"@> You frequency data is in gnomAD format!"
)
print
(
"@> GnomAD data version (Default=4) : {}"
.
format
(
str
(
args
.
gnomadversion
)))
print
(
"@> GnomAD data version (Default=4
1 for 4.1
) : {}"
.
format
(
str
(
args
.
gnomadversion
)))
if
(
args
.
gnomadversion
==
2
or
args
.
gnomadversion
==
3
):
gnomadDF
=
getGnomADOverallFrequency
(
args
.
gnomadfile
,
usePopMax
=
usePopMaxOrNot
)
elif
(
args
.
gnomadversion
==
4
):
gnomadDF
=
getGnomADV4OverallFrequency
(
args
.
gnomadfile
,
usePopMax
=
usePopMaxOrNot
)
# Assign labels to pathogenic/benign mutations for performance evaluation
gnomadDF
[
'labels'
]
=
""
for
index
,
row
in
gnomadDF
.
iterrows
():
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Benign/Likely benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely benign'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
0
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic/Likely pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely pathogenic'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
1
elif
(
args
.
gnomadversion
==
40
):
gnomadDF
=
getGnomADV4p0_OverallFrequency
(
args
.
gnomadfile
,
usePopMax
=
usePopMaxOrNot
)
# Assign labels to pathogenic/benign mutations for performance evaluation
gnomadDF
[
'labels'
]
=
""
for
index
,
row
in
gnomadDF
.
iterrows
():
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Benign/Likely benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely benign'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
0
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic/Likely pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely pathogenic'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
1
elif
(
args
.
gnomadversion
==
41
):
gnomadDF
=
getGnomADV4p1_OverallFrequency
(
args
.
gnomadfile
,
usePopMax
=
usePopMaxOrNot
)
# Assign labels to pathogenic/benign mutations for performance evaluation
gnomadDF
[
'labels'
]
=
""
for
index
,
row
in
gnomadDF
.
iterrows
():
if
((
row
[
'ClinVar Germline Classification'
]
==
'Benign/Likely benign'
)
or
\
(
row
[
'ClinVar Germline Classification'
]
==
'Benign'
)
or
\
(
row
[
'ClinVar Germline Classification'
]
==
'Likely benign'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
0
if
((
row
[
'ClinVar Germline Classification'
]
==
'Pathogenic/Likely pathogenic'
)
or
\
(
row
[
'ClinVar Germline Classification'
]
==
'Pathogenic'
)
or
\
(
row
[
'ClinVar Germline Classification'
]
==
'Likely pathogenic'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
1
else
:
print
(
"ERROR: Unknown GnomAD version!"
)
sys
.
exit
(
-
1
)
# Assign labels to pathogenic/benign mutations for performance evaluation
gnomadDF
[
'labels'
]
=
""
for
index
,
row
in
gnomadDF
.
iterrows
():
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Benign/Likely benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Benign'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely benign'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
0
if
((
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic/Likely pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Pathogenic'
)
or
\
(
row
[
'ClinVar Clinical Significance'
]
==
'Likely pathogenic'
)):
gnomadDF
.
at
[
index
,
'labels'
]
=
1
if
(
len
(
gnomadDF
.
loc
[(
gnomadDF
[
'labels'
]
==
0
)
|
(
gnomadDF
[
'labels'
]
==
1
)])
>
0
):
print
(
gnomadDF
.
loc
[(
gnomadDF
[
'labels'
]
==
0
)
|
(
gnomadDF
[
'labels'
]
==
1
)])
# print(gnomadDF['ClinVar Clinical Significance'])
...
...
@@ -896,6 +1051,8 @@ def main():
if
(
len
(
temp
)
>
0
):
myBigMergedDF
.
at
[
index
,
'log10frequency'
]
=
temp
[
0
]
myBigMergedDF
.
at
[
index
,
'labels'
]
=
gnomadDF
.
loc
[
gnomadDF
[
'mutant'
]
==
row
[
'mutant'
],
'labels'
]
.
values
[
0
]
myBigMergedDF
.
at
[
index
,
'Selected Population'
]
=
\
gnomadDF
.
loc
[
gnomadDF
[
'mutant'
]
==
row
[
'mutant'
],
'Selected Population'
]
.
values
[
0
]
.
replace
(
'Allele Count '
,
''
)
# print(myBigMergedDF)
# scalingCoeff = args.coefficient
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment