0.3.2 bug_fixes for automatic download in create_dataset

parent d51c12e8
...@@ -230,8 +230,6 @@ class STRINGDatasetCreation: ...@@ -230,8 +230,6 @@ class STRINGDatasetCreation:
os.remove(self.intermediate_file) os.remove(self.intermediate_file)
os.remove("clusters_preprocessed.tsv") os.remove("clusters_preprocessed.tsv")
os.remove("clusters.tsv") os.remove("clusters.tsv")
os.remove(self.interactions_file)
os.remove(self.sequences_file)
# A method to remove sequences of inappropriate length from a fasta file # A method to remove sequences of inappropriate length from a fasta file
def process_fasta_file(self): def process_fasta_file(self):
...@@ -282,44 +280,50 @@ def add_args(parser): ...@@ -282,44 +280,50 @@ def add_args(parser):
def main(params): def main(params):
downloaded_flag = False
if params.interactions is None or params.sequences is None: if params.interactions is None or params.sequences is None:
downloaded_flag = True
logging.info('One or both of the files are not specified (interactions or sequences). ' logging.info('One or both of the files are not specified (interactions or sequences). '
'Downloading from STRING...') 'Downloading from STRING...')
_, version = get_string_url() _, version = get_string_url()
logging.info('STRING version: {}'.format(version)) logging.info('STRING version: {}'.format(version))
try: try:
url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species) url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species) string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
wget.download(url, out=string_file_name_links+'.gz') wget.download(url, out=string_file_name_links+'.gz')
with gzip.open(string_file_name_links+'.gz', 'rb') as f_in: with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
with open(string_file_name_links, 'wb') as f_out: with open(string_file_name_links, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out) shutil.copyfileobj(f_in, f_out)
url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species) url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species) string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
wget.download(url, out=string_file_name_seqs+'.gz') wget.download(url, out=string_file_name_seqs+'.gz')
with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in: with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
with open(string_file_name_seqs, 'wb') as f_out: with open(string_file_name_seqs, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out) shutil.copyfileobj(f_in, f_out)
except HTTPError: except HTTPError:
raise Exception('The files are not available for the specified species. ' raise Exception('The files are not available for the specified species. '
'There might be two reasons for that: \n ' 'There might be two reasons for that: \n '
'1) the species is not available in STRING. Please check the STRING species list to verify. \n' '1) the species is not available in STRING. Please check the STRING species list to verify. \n'
'2) the download link has changed. Please raise an issue in the repository. ') '2) the download link has changed. Please raise an issue in the repository. ')
os.remove(string_file_name_seqs+'.gz') os.remove(string_file_name_seqs+'.gz')
os.remove(string_file_name_links+'.gz') os.remove(string_file_name_links+'.gz')
params.interactions = string_file_name_links params.interactions = string_file_name_links
params.sequences = string_file_name_seqs params.sequences = string_file_name_seqs
data = STRINGDatasetCreation(params) data = STRINGDatasetCreation(params)
data.final_preprocessing_positives() data.final_preprocessing_positives()
data.create_negatives() data.create_negatives()
if downloaded_flag:
os.remove(params.interactions)
os.remove(params.sequences)
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment