Commit 7727426d by Edoardo Sarti

query ncbi

parent c3e61ba4
import sys
import xml.etree.ElementTree as ET

import requests
# Entrez database that is searched and fetched from.
db = 'protein'
# TODO: read the comma-separated list of PROTEIN (not gene) accession numbers
# from a CSV file here instead of hard-coding it.
acc_list = 'NP_060022,NP_001349278,XP_025004865'


def build_accession_terms(accessions):
    """Return one ``<accession>[accn]`` search term per accession.

    ``accessions`` is a comma-separated string. Surrounding whitespace is
    stripped and empty fields (e.g. from a trailing comma) are skipped, so
    they never produce a bare ``[accn]`` term.
    """
    return [
        f"{acc.strip()}[accn]"
        for acc in accessions.split(',')
        if acc.strip()
    ]


# Each accession tagged with the [accn] field qualifier.
acc_array = build_accession_terms(acc_list)
# Join the terms with OR ('+' stands for a space inside the URL).
query = "+OR+".join(acc_array)
# Assemble the esearch URL. usehistory=y is what makes the reply carry the
# WebEnv/QueryKey pair that the rest of the script relies on.
base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
url = base + f'esearch.fcgi?db={db}&term={query}&usehistory=y'
print("URL", url)
# Send the esearch request (HTTP GET). The timeout (an arbitrary 30 s) keeps
# the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of
# letting an error page reach the parser.
output = requests.get(url, timeout=30)
output.raise_for_status()
# Extract the WebEnv and QueryKey values from the esearch XML reply; both are
# required to build the efetch URL.
try:
    esearch_root = ET.fromstring(output.content)
except ET.ParseError as err:
    sys.exit(f"Could not parse the esearch response as XML: {err}")
# findtext() returns the default when the element is absent, so both names
# are always strings here.
web = esearch_root.findtext(".//WebEnv", default="").strip()
key = esearch_root.findtext(".//QueryKey", default="").strip()
if not (web and key):
    # Exit status is 1, with the reason reported on stderr.
    sys.exit("esearch response contains no WebEnv/QueryKey; cannot run efetch")
# Assemble the efetch URL from the QueryKey/WebEnv values parsed from the
# esearch reply, asking for FASTA records as plain text.
url = base + f'efetch.fcgi?db={db}&query_key={key}&WebEnv={web}'
url += '&rettype=fasta&retmode=text'
# Same safeguards as any other request: bounded wait (an arbitrary 30 s) and
# an exception on an HTTP error status.
fasta = requests.get(url, timeout=30)
fasta.raise_for_status()
# TODO: output the fetched sequences in the right format
# (the raw FASTA text is available as fasta.text).
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment