0.3.2 bug_fixes for automatic download in create_dataset

5eab0f2b · Konstantin Volzhenin · d51c12e8 · 5eab0f2b
Commit 5eab0f2b authored Sep 01, 2023 by Konstantin Volzhenin
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 30 deletions

create_dataset.py senseppi/commands/create_dataset.py +34 -30

No files found.
--- a/senseppi/commands/create_dataset.py
+++ b/senseppi/commands/create_dataset.py
@@ -230,8 +230,6 @@ class STRINGDatasetCreation:
        os.remove(self.intermediate_file)
        os.remove("clusters_preprocessed.tsv")
        os.remove("clusters.tsv")
-        os.remove(self.interactions_file)
-        os.remove(self.sequences_file)

    # A method to remove sequences of inappropriate length from a fasta file
    def process_fasta_file(self):
@@ -282,44 +280,50 @@ def add_args(parser):


 def main(params):
+    downloaded_flag = False
    if params.interactions is None or params.sequences is None:
+        downloaded_flag = True
        logging.info('One or both of the files are not specified (interactions or sequences). '
                     'Downloading from STRING...')

-    _, version = get_string_url()
-    logging.info('STRING version: {}'.format(version))
-
-    try:
-        url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
-        string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
-        wget.download(url, out=string_file_name_links+'.gz')
-        with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
-            with open(string_file_name_links, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-
-        url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
-        string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
-        wget.download(url, out=string_file_name_seqs+'.gz')
-        with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
-            with open(string_file_name_seqs, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-    except HTTPError:
-        raise Exception('The files are not available for the specified species. '
-                     'There might be two reasons for that: \n '
-                     '1) the species is not available in STRING. Please check the STRING species list to verify. \n'
-                     '2) the download link has changed. Please raise an issue in the repository. ')
-
-    os.remove(string_file_name_seqs+'.gz')
-    os.remove(string_file_name_links+'.gz')
-
-    params.interactions = string_file_name_links
-    params.sequences = string_file_name_seqs
+        _, version = get_string_url()
+        logging.info('STRING version: {}'.format(version))
+
+        try:
+            url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
+            string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
+            wget.download(url, out=string_file_name_links+'.gz')
+            with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
+                with open(string_file_name_links, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
+            string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
+            wget.download(url, out=string_file_name_seqs+'.gz')
+            with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
+                with open(string_file_name_seqs, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+        except HTTPError:
+            raise Exception('The files are not available for the specified species. '
+                         'There might be two reasons for that: \n '
+                         '1) the species is not available in STRING. Please check the STRING species list to verify. \n'
+                         '2) the download link has changed. Please raise an issue in the repository. ')
+
+        os.remove(string_file_name_seqs+'.gz')
+        os.remove(string_file_name_links+'.gz')
+
+        params.interactions = string_file_name_links
+        params.sequences = string_file_name_seqs

    data = STRINGDatasetCreation(params)

    data.final_preprocessing_positives()
    data.create_negatives()

+    if downloaded_flag:
+        os.remove(params.interactions)
+        os.remove(params.sequences)
+

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()