10
0
mirror of https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local synced 2025-01-08 20:33:13 +01:00

Handle duplicate basis-set names and errors during download

This commit is contained in:
Thomas Applencourt 2014-11-19 11:47:55 +01:00
parent 86b8faabe7
commit 17f3b0b500
2 changed files with 48 additions and 29 deletions

View File

@ -20,7 +20,7 @@ Options:
<db_path> is the path to the SQLite3 file containing the Basis sets. <db_path> is the path to the SQLite3 file containing the Basis sets.
""" """
version="0.1.1" version = "0.1.1"
import sys import sys
@ -33,8 +33,8 @@ from EMSL_utility import EMSL_local
if __name__ == '__main__': if __name__ == '__main__':
arguments = docopt(__doc__, version='EMSL Api '+version) arguments = docopt(__doc__, version='EMSL Api ' + version)
print arguments # print arguments
if arguments["get_list_basis"]: if arguments["get_list_basis"]:
db_path = arguments["<db_path>"] db_path = arguments["<db_path>"]
@ -64,7 +64,7 @@ if __name__ == '__main__':
l = e.get_basis(basis_name, elts) l = e.get_basis(basis_name, elts)
for i in l: for i in l:
print i,'\n' print i, '\n'
elif arguments["get_list_formats"]: elif arguments["get_list_formats"]:
for i in format_dict: for i in format_dict:
@ -74,11 +74,10 @@ if __name__ == '__main__':
db_path = arguments["<db_path>"] db_path = arguments["<db_path>"]
format = arguments["<format>"] format = arguments["<format>"]
if format not in format_dict: if format not in format_dict:
print "Format %s doesn't exist. Run get_list_formats to get the list of formats."%(format) print "Format %s doesn't exist. Run get_list_formats to get the list of formats." % (format)
sys.exit(1) sys.exit(1)
contraction = not arguments["--no-contraction"] contraction = not arguments["--no-contraction"]
print "go" e = EMSL_dump(
e = EMSL_dump(db_path=db_path, format=format_dict[format], contraction=contraction) db_path=db_path, format=format_dict[format], contraction=contraction)
e.new_db() e.new_db()

View File

@ -4,8 +4,9 @@ import sqlite3
import re import re
import sys import sys
import os import os
import time
debug = True debug = False
class EMSL_dump: class EMSL_dump:
@ -23,7 +24,7 @@ class EMSL_dump:
self.db_path = path self.db_path = path
def dwl_basis_list_raw(self): def dwl_basis_list_raw(self):
print "Dwl the basis list info", print "Download all the name available in EMSL. It can take some time.",
sys.stdout.flush() sys.stdout.flush()
"""Download the source code of the iframe who contains the list of the basis set available""" """Download the source code of the iframe who contains the list of the basis set available"""
@ -45,14 +46,13 @@ class EMSL_dump:
page = self.requests.get(url).text page = self.requests.get(url).text
print "Done" print "Done"
return page return page
def bl_raw_to_array(self, data_raw): def bl_raw_to_array(self, data_raw):
"""Parse the raw html to create a basis set array whith all the info: """Parse the raw html to create a basis set array whith all the info:
url, name,description""" url, name,description"""
d = [] d = {}
for line in data_raw.split('\n'): for line in data_raw.split('\n'):
if "new basisSet(" in line: if "new basisSet(" in line:
@ -72,11 +72,15 @@ class EMSL_dump:
if "-ecp" in url.lower(): if "-ecp" in url.lower():
continue continue
d[name] = [name, url, des, elts]
d.append([name, url, des, elts]) """Tric for the unicity of the name"""
array = [d[key] for key in d]
d_sort = sorted(d, key=lambda x: x[0]) array_sort = sorted(array, key=lambda x: x[0])
return d_sort print len(array_sort), "basisset will be download"
return array_sort
def create_url(self, url, name, elts): def create_url(self, url, name, elts):
"""Create the adequate url to get the basis data""" """Create the adequate url to get the basis data"""
@ -100,7 +104,8 @@ class EMSL_dump:
b = data.find("$DATA") b = data.find("$DATA")
e = data.find("$END") e = data.find("$END")
if (b == -1 or data.find("$DATA$END") != -1): if (b == -1 or data.find("$DATA$END") != -1):
print data if debug:
print data
raise StandardError("WARNING not DATA") raise StandardError("WARNING not DATA")
else: else:
data = data[b + 5:e].split('\n\n') data = data[b + 5:e].split('\n\n')
@ -124,22 +129,34 @@ class EMSL_dump:
import threading import threading
num_worker_threads = 7 num_worker_threads = 7
num_try_of_dwl = 2
q_in = Queue.Queue(num_worker_threads) q_in = Queue.Queue(num_worker_threads)
q_out = Queue.Queue(num_worker_threads) q_out = Queue.Queue(num_worker_threads)
basis_raw = {}
def worker(): def worker():
"""get a Job from the q_in, do stuff, when finish put it in the q_out"""
while True: while True:
[name, url, des, elts] = q_in.get() [name, url, des, elts] = q_in.get()
url = self.create_url(url, name, elts) url = self.create_url(url, name, elts)
q_out.put(
([name, url, des, elts], self.requests.get(url).text)) for i in range(num_try_of_dwl):
text = self.requests.get(url).text
try:
basis_data = self.basis_data_row_to_array(
text, name, des, elts)
break
except:
time.sleep(0.1)
pass
q_out.put(([name, url, des, elts], basis_data))
q_in.task_done() q_in.task_done()
def enqueue(): def enqueue():
for [name, url, des, elts] in list_basis_array: for [name, url, des, elts] in list_basis_array:
q_in.put(([name, url, des, elts])) q_in.put(([name, url, des, elts]))
return 0 return 0
t = threading.Thread(target=enqueue) t = threading.Thread(target=enqueue)
@ -151,21 +168,24 @@ class EMSL_dump:
t.daemon = True t.daemon = True
t.start() t.start()
for i in range(len(list_basis_array)): nb_basis = len(list_basis_array)
[name, url, des, elts], basis_raw = q_out.get()
for i in range(nb_basis):
[name, url, des, elts], basis_data = q_out.get()
try: try:
basis_data = self.basis_data_row_to_array(
basis_raw, name, des, elts)
c.executemany( c.executemany(
"INSERT INTO all_value VALUES (?,?,?,?)", basis_data) "INSERT INTO all_value VALUES (?,?,?,?)", basis_data)
conn.commit() conn.commit()
print i, name
print '{:>3}'.format(i + 1), "/", nb_basis, name
except: except:
print name, url, des, elts print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail",
pass print ' ', [url, des, elts]
raise
conn.close() conn.close()
q_in.join() q_in.join()
q_out.join()
def new_db(self): def new_db(self):
"""Create new_db from scratch""" """Create new_db from scratch"""