Commit 17622dc1 by Riccardo Vicedomini

fixed a bug with DAMA, domain hits needed to be sorted by sequence-id

parent a1263033
...@@ -3,9 +3,7 @@ Author: Riccardo Vicedomini ...@@ -3,9 +3,7 @@ Author: Riccardo Vicedomini
""" """
import sys, os, argparse, gzip import sys, os, argparse, gzip
import subprocess import subprocess, glob, time, operator
import glob
import time
from mclade_utils import * from mclade_utils import *
...@@ -35,19 +33,24 @@ def main(): ...@@ -35,19 +33,24 @@ def main():
start_time = time.time() start_time = time.time()
print(f'[{script_name}] formatting DAMA input file') print(f'[{script_name}] loading best hits')
domainsHitDict = {} domainsHitDict = {}
for bdfile in glob.glob(f'{args.filterResDir}/*.best.res'):
domid = os.path.basename(bdfile).split('.')[0]
with open(bdfile,'r') as bdf:
for line in bdf:
cols = line.rstrip().split('\t')
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = cols
hit_key = (sid,sbeg,send,domid)
domainsHitDict[hit_key]=cols
print(f'[{script_name}] loading best hits')
sortedDomHitKeys = sorted(domainsHitDict.keys(),key=operator.itemgetter(0,1))
domains_hit_file = f'{temp_dir}/domainsHitFile.best.res' domains_hit_file = f'{temp_dir}/domainsHitFile.best.res'
with open(domains_hit_file,'w') as dhf: with open(domains_hit_file,'w') as dhf:
for bdfile in glob.glob(f'{args.filterResDir}/*.best.res'): for hit_key in sortedDomHitKeys:
domid = os.path.basename(bdfile).split('.')[0] mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = domainsHitDict[hit_key]
with open(bdfile,'r') as bdf: dhf.write(f'{evalue}\t{sbeg}\t{send}\t{sid}\t{hit_key[3]}\t{mbeg}\t{mend}\t{score}\t{mid}\n')
for line in bdf:
cols = line.rstrip().split('\t')
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = cols
hit_key = (sid,sbeg,send,domid)
domainsHitDict[hit_key]=cols
dhf.write(f'{evalue}\t{sbeg}\t{send}\t{sid}\t{domid}\t{acc}\t{mid}\t{mbeg}\t{mend}\t{domid}\n')
print(f'[{script_name}] running DAMA') print(f'[{script_name}] running DAMA')
dama_temp_out = f'{temp_dir}/dama.arch.txt' dama_temp_out = f'{temp_dir}/dama.arch.txt'
...@@ -57,16 +60,18 @@ def main(): ...@@ -57,16 +60,18 @@ def main():
'-overlappingDomainFile', args.overlappingDomainFile, '-overlappingDomainFile', args.overlappingDomainFile,
'-evalueCutOff', args.evalueCutOff, '-evalueCutOff', args.evalueCutOff,
'-evalueCutOffConf', args.evalueCutOffConf, '-evalueCutOffConf', args.evalueCutOffConf,
'-overlappingAA', args.overlappingAA, '-overlappingAA', str(args.overlappingAA),
'-overlappingMaxDomain', args.overlappingMaxDomain, '-overlappingMaxDomain', str(args.overlappingMaxDomain),
'-domainsHitFile', domains_hit_file, '-domainsHitFile', domains_hit_file,
'-outputFile', dama_temp_out ] '-outputFile', dama_temp_out ]
dama_cmd = ' '.join(dama_args) dama_cmd = ' '.join(dama_args)
eprint(f'running DAMA: {dama_cmd}')
dama_process = subprocess.call(dama_args) dama_process = subprocess.call(dama_args)
if dama_process != 0: # if dama failed if dama_process != 0: # if dama failed
eprint(f'[{script_name}] error running: {hmmsearch_cmd}') eprint(f'[{script_name}] error running: {hmmsearch_cmd}')
return 1 return 1
print(f'[{script_name}] loading taxonomy information')
taxid2name = {} taxid2name = {}
with gzip.open(args.taxid2nameFile,'rt') as taxFile: with gzip.open(args.taxid2nameFile,'rt') as taxFile:
for line in taxFile: for line in taxFile:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment