10
0
mirror of https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local synced 2024-11-01 03:33:46 +01:00
EMSL_Basis_Set_Exchange_Local/src/EMSL_dump.py

342 lines
10 KiB
Python
Raw Normal View History

2014-11-12 18:12:07 +01:00
import os
2015-03-11 10:32:44 +01:00
import sys
import re
import time
2015-03-11 10:32:44 +01:00
import sqlite3
2014-11-12 14:30:26 +01:00
2015-01-12 14:22:13 +01:00
def install_with_pip(name):
    """Interactively offer to install the package *name* with pip.

    The user is asked on stdin until a valid answer ('y' or 'n', case
    insensitive) is given. On 'n' the function returns without doing
    anything. On 'y' pip is run for the current interpreter.

    Args:
        name: name of the package to install, as understood by pip.

    Exits:
        Calls ``sys.exit(1)`` when pip is not available or when the
        installation itself fails.
    """
    # `raw_input` only exists on Python 2; fall back to `input` on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input

    answers = {'y': True,
               'n': False}

    while True:
        choice = ask('Do you want to install it ? [Y/N]')
        try:
            ins = answers[choice.strip().lower()]
            break
        except KeyError:
            print("not a valid choice")

    if not ins:
        return

    # Only used as an availability check: pip has no supported in-process
    # API (`pip.main` was removed in pip 10), so it is run as a subprocess.
    try:
        import pip  # noqa: F401
    except ImportError:
        print("You need pip")
        print("(http://pip.readthedocs.org/en/latest/installing.html)")
        sys.exit(1)

    import subprocess
    status = subprocess.call([sys.executable, '-m', 'pip', 'install', name])
    if status != 0:
        # The original code silently ignored a failed installation.
        print("pip could not install %s" % name)
        sys.exit(1)
2014-11-12 14:30:26 +01:00
class EMSL_dump:

    """
    Download the EMSL Basis Set Exchange content and save it locally
    in a SQLite database.

    Typical use is ``EMSL_dump(db_path=...).new_db()``: the list of basis
    sets is downloaded, parsed, then every basis set is fetched by a pool
    of worker threads and stored in the database.
    """

    # Maps the short format keyword to the format name used by EMSL.
    format_dict = {"g94": "Gaussian94",
                   "gamess-us": "GAMESS-US",
                   "gamess-uk": "GAMESS-UK",
                   "turbomole": "Turbomole",
                   "tx93": "TX93",
                   "molpro": "Molpro",
                   "molproint": "MolproInt",
                   "hondo": "Hondo",
                   "supermolecule": "SuperMolecule",
                   "molcas": "Molcas",
                   "hyperchem": "HyperChem",
                   "dalton": "Dalton",
                   "demon-ks": "deMon-KS",
                   "demon2k": "deMon2k",
                   "aces2": "AcesII"
                   }

    def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
        """Store the settings and make sure `requests` is importable.

        Args:
            db_path: path of the SQLite database file to create.
            format: value sent as the 'format' parameter of the download.
            contraction: value sent (as a string) as the 'minimize'
                parameter of the download.

        Raises:
            ImportError: if `requests` is missing and was not installed.
        """
        self.db_path = db_path
        self.format = format
        self.contraction = str(contraction)
        # When True the basis set list is cached on disk ('db/cache') and
        # extra diagnostics are printed.
        self.debug = True

        try:
            import requests
        except ImportError:
            print("You need the requests package")
            install_with_pip("requests")
            # Import again after the installation attempt: the name is not
            # bound when the first import failed.
            import requests
        self.requests = requests

    def get_list_format(self):
        """List all the format available in EMSL"""
        return self.format_dict

    def set_db_path(self, path):
        """Define the database path"""
        self.db_path = path

    def get_dict_ele(self):
        """Read 'src/elts_abrev.dat' located next to the running script.

        Every line is split on '-'; the second field (lowercased, stripped)
        becomes the key and the third field the value. The parser of this
        class uses it to look up the long element name from its symbol.
        Lines with fewer than three fields are ignored.
        """
        # os.path.join keeps the path relative when dirname() is empty;
        # plain concatenation would yield '/src/elts_abrev.dat'.
        elt_path = os.path.join(os.path.dirname(sys.argv[0]),
                                "src", "elts_abrev.dat")

        dict_ele = dict()
        with open(elt_path, "r") as f:
            for line in f:
                fields = line.split("-")
                if len(fields) < 3:
                    continue
                dict_ele[fields[1].strip().lower()] = fields[2].strip().lower()
        return dict_ele

    def dwl_basis_list_raw(self):
        """Return the source code of the iframe
        which contains the list of the basis set available.

        In debug mode the page is read from / written to the pickle file
        'db/cache' so that it is downloaded only once.
        """
        print("Download all the name available in EMSL.")
        # No newline here: "Done" is printed on the same line afterwards.
        sys.stdout.write("It can take some time. ")
        sys.stdout.flush()

        url = "https://bse.pnl.gov/bse/portal/user/anon/js_peid/11535052407933/panel/Main/template/content"

        if self.debug:
            try:
                import cPickle as pickle
            except ImportError:
                import pickle

            dbcache = 'db/cache'
            if not os.path.isfile(dbcache):
                page = self.requests.get(url).text
                with open(dbcache, 'wb') as cache_file:
                    pickle.dump(page, cache_file)
            else:
                # SECURITY: unpickling executes arbitrary code; only use a
                # cache file that this program wrote itself.
                with open(dbcache, 'rb') as cache_file:
                    page = pickle.load(cache_file)
        else:
            page = self.requests.get(url).text

        print("Done")
        return page

    def basis_list_raw_to_array(self, data_raw):
        """Parse the raw html basis set to create a dict
        with all the information for downloading the database :
        Return d[name] = [name, xml_path, description,
                          list of the elements available]

        Explanation of tuple data from 'tup' by index:

        0 - path to xml file
        1 - basis set name
        2 - categorization: "dftcfit", "dftorb", "dftxfit", "diffuse",
            "ecporb","effective core potential", "orbital", "polarization",
            "rydberg", or "tight"
        3 - parameterized elements by symbol e.g. '[H, He, B, C, N, O, F, Ne]'
        4 - curation status; only 'published' is trustworthy
        5 - boolean: has ECP
        6 - boolean: has spin
        7 - last modified date
        8 - name of primary developer
        9 - name of contributor
        10 - human-readable summary/description of basis set

        Entries that are not 'published' or whose xml path contains
        '-ecp' (case insensitive) are skipped.
        """
        d = {}

        for line in data_raw.split('\n'):
            if "new basisSet(" not in line:
                continue

            b = line.find("(")
            e = line.find(");")
            s = line[b + 1:e]

            # SECURITY: eval() runs text downloaded from the network as
            # Python code. Kept for compatibility with the page format;
            # consider ast.literal_eval once the format is confirmed to
            # be plain literals.
            tup = eval(s)

            # non-published (e.g. rejected) basis sets should be ignored
            if tup[4] != "published":
                continue

            xml_path = tup[0]
            name = tup[1]
            # '[H, He]' -> ['H', 'He']
            elts = re.sub(r'[["\ \]]', '', tup[3]).split(',')
            # Collapse every run of whitespace of the description
            des = re.sub(r'\s+', ' ', tup[-1])

            if "-ecp" in xml_path.lower():
                continue

            d[name] = [name, xml_path, des, elts]

        return d

    def parse_basis_data_gamess_us(self, data, name, des, elts):
        """Parse the basis data raw html of gamess-us to get a nice tuple
        Return [name, description, [[ele, data_ele],...]]

        Raises:
            Exception: when the $DATA ... $END section is missing or
                empty, when an element block still contains a '$', or
                when the element name found in a block does not match
                the expected element.
        """
        # Normalise the text BEFORE locating the markers: the first
        # replacement shortens the string, so offsets computed earlier
        # would no longer point at "$END".
        for old, new in (("PHOSPHOROUS", "PHOSPHORUS"),
                         ("D+", "E+"),
                         ("D-", "E-")):
            data = data.replace(old, new)

        b = data.find("$DATA")
        e = data.find("$END")
        if b == -1 or e < b or data.find("$DATA$END") != -1:
            if self.debug:
                print(data)
            raise Exception("WARNING not DATA")

        # Skip "$DATA" (5 chars) and the character right before "$END";
        # element blocks are separated by a blank line.
        blocks = data[b + 5:e - 1].split('\n\n')

        dict_ele = self.get_dict_ele()

        basis_data = []
        for (elt, data_elt) in zip(elts, blocks):
            elt_long_th = dict_ele[elt.lower()]
            elt_long_exp = data_elt.split()[0].lower()

            if "$" in data_elt:
                if self.debug:
                    sys.stdout.write("Eror ")
                raise Exception("WARNING bad split")

            if elt_long_th != elt_long_exp:
                if self.debug:
                    print("th %s" % elt_long_th)
                    print("exp %s" % elt_long_exp)
                    print("abv %s" % elt)
                raise Exception("WARNING not a good ELEMENT")

            basis_data.append([elt, data_elt.strip()])

        return [name, des, basis_data]

    def _create_schema(self, conn):
        """Create the two tables and the joining view in a fresh database."""
        c = conn.cursor()
        c.execute('''CREATE TABLE basis_tab(
                            basis_id INTEGER PRIMARY KEY AUTOINCREMENT,
                                name text,
                         description text,
                                    UNIQUE(name)
                  );''')

        c.execute('''CREATE TABLE data_tab(
                           basis_id INTEGER,
                                elt TEXT,
                               data TEXT,
                    FOREIGN KEY(basis_id)
                    REFERENCES basis_tab(basis_id)
                    );''')

        c.execute(''' CREATE VIEW output_tab AS
                        SELECT basis_id,
                               name,
                               description,
                               elt,
                               data
                        FROM   basis_tab
                NATURAL JOIN   data_tab
                    ''')
        conn.commit()

    def _download_basis(self, job, attemps_max):
        """Download and parse one basis set, retrying on any failure.

        Returns [name, des, basis_data]; basis_data is None when every
        attempt failed.
        """
        name, path_xml, des, elts = job

        url = "https://bse.pnl.gov:443/bse/portal/user/anon/js_peid/11535052407933/action/portlets.BasisSetAction/template/courier_content/panel/Main/"
        url += "/eventSubmit_doDownload/true"

        params = {'bsurl': path_xml, 'bsname': name,
                  'elts': " ".join(elts),
                  'format': self.format,
                  'minimize': self.contraction}

        for _ in range(attemps_max):
            try:
                text = self.requests.get(url, params=params).text
                return self.parse_basis_data_gamess_us(text, name, des, elts)
            except Exception:
                # Network error or unusable answer: wait a bit and retry.
                time.sleep(0.1)

        return [name, des, None]

    def create_sql(self, dict_basis_list):
        """Create the sql from the list of basis available data

        Args:
            dict_basis_list: dict as returned by basis_list_raw_to_array,
                i.e. d[name] = [name, xml_path, description, elts].

        The database schema is created, then worker threads download the
        basis sets while this thread inserts the results. A basis set that
        could not be downloaded, or whose name is already stored, is
        reported as "fail" and skipped.
        """
        try:
            import Queue as queue
        except ImportError:
            import queue
        import threading

        num_worker_threads = 7
        attemps_max = 20

        conn = sqlite3.connect(self.db_path)
        try:
            self._create_schema(conn)
            c = conn.cursor()

            q_in = queue.Queue(num_worker_threads)
            q_out = queue.Queue(num_worker_threads)

            def worker():
                """get a Job from the q_in, do stuff,
                   when finish put it in the q_out"""
                while True:
                    job = q_in.get()
                    # Always answer, even on failure, so that the consumer
                    # below never waits for a result that will not come.
                    q_out.put(self._download_basis(job, attemps_max))
                    q_in.task_done()

            def enqueue():
                for job in dict_basis_list.values():
                    q_in.put(list(job))
                return 0

            t = threading.Thread(target=enqueue)
            t.daemon = True
            t.start()

            for _ in range(num_worker_threads):
                t = threading.Thread(target=worker)
                t.daemon = True
                t.start()

            nb_basis = len(dict_basis_list)

            for i in range(nb_basis):
                name, des, basis_data = q_out.get()
                q_out.task_done()

                progress = "%s / %s %s" % ('{:>3}'.format(i + 1),
                                           nb_basis, name)

                if basis_data is None:
                    print(progress + " fail")
                    continue

                try:
                    cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
                    c.execute(cmd, [name, des])
                    conn.commit()
                except sqlite3.IntegrityError:
                    print(progress + " fail")
                    # lastrowid would still be the id of the previous
                    # basis set: do not attach this data to it.
                    continue

                id_ = [c.lastrowid]

                try:
                    cmd = "INSERT INTO data_tab VALUES (?,?,?)"
                    c.executemany(cmd, [id_ + k for k in basis_data])
                    conn.commit()
                    print(progress)
                except Exception:
                    print(progress + " fail")
                    raise

            q_in.join()
        finally:
            conn.close()

    def new_db(self):
        """Create new_db from scratch"""
        _data = self.dwl_basis_list_raw()
        array_basis = self.basis_list_raw_to_array(_data)
        self.create_sql(array_basis)