Commit 17622dc1 by Riccardo Vicedomini

fixed a bug with DAMA, domain hits needed to be sorted by sequence-id

parent a1263033
......@@ -3,9 +3,7 @@ Author: Riccardo Vicedomini
"""
import sys, os, argparse, gzip
import subprocess
import glob
import time
import subprocess, glob, time, operator
from mclade_utils import *
......@@ -35,19 +33,24 @@ def main():
start_time = time.time()
print(f'[{script_name}] formatting DAMA input file')
print(f'[{script_name}] loading best hits')
domainsHitDict = {}
for bdfile in glob.glob(f'{args.filterResDir}/*.best.res'):
domid = os.path.basename(bdfile).split('.')[0]
with open(bdfile,'r') as bdf:
for line in bdf:
cols = line.rstrip().split('\t')
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = cols
hit_key = (sid,sbeg,send,domid)
domainsHitDict[hit_key]=cols
print(f'[{script_name}] loading best hits')
sortedDomHitKeys = sorted(domainsHitDict.keys(),key=operator.itemgetter(0,1))
domains_hit_file = f'{temp_dir}/domainsHitFile.best.res'
with open(domains_hit_file,'w') as dhf:
for bdfile in glob.glob(f'{args.filterResDir}/*.best.res'):
domid = os.path.basename(bdfile).split('.')[0]
with open(bdfile,'r') as bdf:
for line in bdf:
cols = line.rstrip().split('\t')
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = cols
hit_key = (sid,sbeg,send,domid)
domainsHitDict[hit_key]=cols
dhf.write(f'{evalue}\t{sbeg}\t{send}\t{sid}\t{domid}\t{acc}\t{mid}\t{mbeg}\t{mend}\t{domid}\n')
for hit_key in sortedDomHitKeys:
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc,mtaxid = domainsHitDict[hit_key]
dhf.write(f'{evalue}\t{sbeg}\t{send}\t{sid}\t{hit_key[3]}\t{mbeg}\t{mend}\t{score}\t{mid}\n')
print(f'[{script_name}] running DAMA')
dama_temp_out = f'{temp_dir}/dama.arch.txt'
......@@ -57,16 +60,18 @@ def main():
'-overlappingDomainFile', args.overlappingDomainFile,
'-evalueCutOff', args.evalueCutOff,
'-evalueCutOffConf', args.evalueCutOffConf,
'-overlappingAA', args.overlappingAA,
'-overlappingMaxDomain', args.overlappingMaxDomain,
'-overlappingAA', str(args.overlappingAA),
'-overlappingMaxDomain', str(args.overlappingMaxDomain),
'-domainsHitFile', domains_hit_file,
'-outputFile', dama_temp_out ]
dama_cmd = ' '.join(dama_args)
eprint(f'running DAMA: {dama_cmd}')
dama_process = subprocess.call(dama_args)
if dama_process != 0: # if dama failed
eprint(f'[{script_name}] error running: {hmmsearch_cmd}')
return 1
print(f'[{script_name}] loading taxonomy information')
taxid2name = {}
with gzip.open(args.taxid2nameFile,'rt') as taxFile:
for line in taxFile:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment