0.3.2 bug_fixes for automatic download in create_dataset

parent d51c12e8
......@@ -230,8 +230,6 @@ class STRINGDatasetCreation:
os.remove(self.intermediate_file)
os.remove("clusters_preprocessed.tsv")
os.remove("clusters.tsv")
os.remove(self.interactions_file)
os.remove(self.sequences_file)
# A method to remove sequences of inappropriate length from a fasta file
def process_fasta_file(self):
......@@ -282,44 +280,50 @@ def add_args(parser):
def _fetch_and_gunzip(url, destination):
    """Download the gzip archive at ``url`` and unpack it into ``destination``.

    The intermediate ``destination + '.gz'`` archive is always deleted, even
    when the download or the decompression fails, so that a stale archive
    left by a failed run cannot interfere with the next one.

    :param url: address of the ``.gz`` archive to download
    :param destination: path of the uncompressed file to write
    """
    archive = destination + '.gz'
    try:
        wget.download(url, out=archive)
        with gzip.open(archive, 'rb') as f_in, open(destination, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    finally:
        # The archive does not exist when the download itself failed.
        if os.path.exists(archive):
            os.remove(archive)


def main(params):
    """Build the dataset, downloading the STRING input files when needed.

    If ``params.interactions`` or ``params.sequences`` is ``None``, both the
    physical-links file and the sequences file for ``params.species`` are
    downloaded from STRING, unpacked into the current working directory, and
    ``params.interactions`` / ``params.sequences`` are pointed at them.
    ``STRINGDatasetCreation`` is then run on ``params``. Files downloaded
    here are deleted before returning, whether or not the run succeeded;
    files supplied by the caller are never deleted by this function.

    :param params: parsed arguments; this function reads ``interactions``,
        ``sequences`` and ``species`` and may overwrite the first two
    :raises Exception: if STRING answers the download with an HTTP error
        (the original ``HTTPError`` is attached as ``__cause__``)
    """
    # Paths this call created itself and is therefore responsible for removing.
    downloaded_files = []
    try:
        if params.interactions is None or params.sequences is None:
            logging.info('One or both of the files are not specified (interactions or sequences). '
                         'Downloading from STRING...')

            _, version = get_string_url()
            logging.info('STRING version: {}'.format(version))

            string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
            string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)

            try:
                url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
                # Registered before the transfer so a partially written file
                # is cleaned up as well.
                downloaded_files.append(string_file_name_links)
                _fetch_and_gunzip(url, string_file_name_links)

                url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
                downloaded_files.append(string_file_name_seqs)
                _fetch_and_gunzip(url, string_file_name_seqs)
            except HTTPError as err:
                raise Exception('The files are not available for the specified species. '
                                'There might be two reasons for that: \n '
                                '1) the species is not available in STRING. Please check the STRING species list to verify. \n'
                                '2) the download link has changed. Please raise an issue in the repository. ') from err

            params.interactions = string_file_name_links
            params.sequences = string_file_name_seqs

        data = STRINGDatasetCreation(params)
        data.final_preprocessing_positives()
        data.create_negatives()
    finally:
        # Existence is checked because the file may never have been created
        # (failed download) or may already have been removed elsewhere.
        for path in downloaded_files:
            if os.path.exists(path):
                os.remove(path)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment