2014-11-12 18:12:07 +01:00
|
|
|
import os
|
2015-03-11 10:32:44 +01:00
|
|
|
import sys
|
|
|
|
import re
|
2014-11-19 11:47:55 +01:00
|
|
|
import time
|
2015-03-11 10:32:44 +01:00
|
|
|
import sqlite3
|
2014-11-12 14:30:26 +01:00
|
|
|
|
2015-03-16 14:35:25 +01:00
|
|
|
from collections import OrderedDict
|
|
|
|
|
2014-11-12 14:30:26 +01:00
|
|
|
|
2015-01-12 14:22:13 +01:00
|
|
|
def install_with_pip(name):
    """Interactively offer to install the package `name` with pip.

    The user is asked on standard input until the answer is 'y' or 'n'
    (case-insensitive, surrounding blanks ignored). On 'n' the function
    returns without doing anything. On 'y' pip is run for the current
    interpreter.

    Args:
        name: Name of the package handed to ``pip install``.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when pip is not importable or when the
            installation itself fails.
    """
    import subprocess

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    answers = {'y': True,
               'n': False}

    while True:
        choice = ask('Do you want to install it ? [Y/N]')
        try:
            wanted = answers[choice.strip().lower()]
        except KeyError:
            print("not a valid choice")
        else:
            break

    if not wanted:
        return

    try:
        import pip  # noqa: F401 -- only probing that pip is available
    except ImportError:
        print("You need pip")
        print("(http://pip.readthedocs.org/en/latest/installing.html)")
        sys.exit(1)

    # pip.main() is not a public API (removed in pip 10); running pip as a
    # module of the current interpreter is the supported way to call it.
    status = subprocess.call([sys.executable, '-m', 'pip', 'install', name])
    if status != 0:
        sys.stderr.write(
            "pip failed to install {0} (exit status {1})\n".format(name,
                                                                   status))
        sys.exit(1)
|
|
|
|
|
|
|
|
|
2014-11-12 14:30:26 +01:00
|
|
|
class EMSL_dump:
    """Download the EMSL basis-set library and save it locally in SQLite.

    Attributes set by ``__init__``:
        format: basis-set format name, as returned by ``check_format``.
        parser: callable returned by ``get_parser_function(format)``; it is
            called as ``parser(text, name, description, elts, debug)`` and
            its result is unpacked as ``(name, description, basis_data)``.
        db_path: path of the SQLite file to create.
        contraction: string sent as the ``minimize`` request parameter.
        debug: when true, the basis-set list page is cached in ``db/cache``.
        requests: the imported ``requests`` module.
    """

    def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
        """Configure the dumper.

        Args:
            db_path: SQLite file to create. When empty, defaults to
                ``<directory of this file>/../db/<format>.db``.
            format: basis-set format name, validated by ``check_format``.
            contraction: stored as ``str(contraction)``.
        """
        from src.parser_handler import get_parser_function
        from src.parser_handler import check_format

        self.format = check_format(format)
        self.parser = get_parser_function(self.format)

        if db_path:
            self.db_path = db_path
        else:
            # abspath first: for a relative __file__ dirname() can be empty,
            # which would make the default path start at the filesystem root.
            head_path = os.path.dirname(os.path.abspath(__file__))
            self.db_path = "{0}/../db/{1}.db".format(head_path, self.format)

        self.contraction = str(contraction)
        self.debug = True

        try:
            import requests
        except ImportError:
            print("You need the requests package")
            install_with_pip("requests")
            # The name is still unbound after the failed import above, so
            # import again now that the package has been installed.
            import requests

        self.requests = requests

    def get_list_format(self):
        """List all the format available in EMSL"""
        from src.parser_handler import parser_dict
        return list(parser_dict.keys())

    def set_db_path(self, path):
        """Define the database path"""
        self.db_path = path

    def dwl_basis_list_raw(self):
        """Return the source of the page listing the available basis sets.

        When ``self.debug`` is true the page is cached (pickled) in the
        relative file ``db/cache`` and read back from there on later calls.

        Returns:
            The page content as text.
        """
        sys.stdout.write("Download all the name available in EMSL.\n")
        sys.stdout.write("It can take some time.")
        sys.stdout.flush()

        url = "https://bse.pnl.gov/bse/portal/user/anon/js_peid/11535052407933/panel/Main/template/content"

        if self.debug:
            try:
                import cPickle as pickle
            except ImportError:
                import pickle

            dbcache = 'db/cache'
            if not os.path.isfile(dbcache):
                page = self.requests.get(url).text
                # Pickle streams are bytes: always use binary mode.
                with open(dbcache, 'wb') as handle:
                    pickle.dump(page, handle)
            else:
                # NOTE(review): unpickling executes whatever the file
                # describes; only acceptable because db/cache is a local
                # file written by this very method.
                with open(dbcache, 'rb') as handle:
                    page = pickle.load(handle)
        else:
            page = self.requests.get(url).text

        sys.stdout.write(" Done\n")
        return page

    def basis_list_raw_to_array(self, data_raw):
        """Parse the raw page into an ordered dict of basis sets.

        Every line containing ``new basisSet(`` is parsed as a tuple of
        literals. Fields used, by index:

            0  - path to the xml file
            1  - basis set name
            3  - available elements, e.g. '[H, He, B, C, N, O, F, Ne]'
            4  - curation status; only 'published' entries are kept
            -1 - human-readable description of the basis set

        Entries whose xml path contains "-ecp" (case-insensitive) are
        ignored, as are lines that cannot be parsed (a warning is written
        to stderr for those).

        Args:
            data_raw: page content as returned by ``dwl_basis_list_raw``.

        Returns:
            OrderedDict mapping name to
            ``[name, xml_path, description, list_of_elements]``.
        """
        import ast

        marker = "new basisSet("
        d = OrderedDict()

        for line in data_raw.split('\n'):
            start = line.find(marker)
            if start == -1:
                continue

            start += len(marker)
            end = line.find(");", start)
            s = line[start:end].strip()

            # The text comes from a remote server: parse it as literals only
            # (the previous eval() would run arbitrary expressions).
            try:
                tup = ast.literal_eval(s)
            except (ValueError, SyntaxError):
                tup = None

            if not isinstance(tup, tuple) or len(tup) < 5:
                sys.stderr.write(
                    "Skip unparsable basis set line: {0!r}\n".format(line))
                continue

            xml_path = tup[0]

            # non-published (e.g. rejected) basis sets and ecp should be
            # ignored
            if tup[4] != "published" or "-ecp" in xml_path.lower():
                continue

            name = tup[1]
            # Drop brackets, double quotes and blanks around the symbols.
            elts = re.sub(r'[\[\]" ]', '', tup[3]).split(',')
            des = re.sub(r'\s+', ' ', tup[-1])

            d[name] = [name, xml_path, des, elts]

        return d

    # ------------------------------------------------------------------ #
    #                               Create                               #
    # ------------------------------------------------------------------ #

    def _create_schema(self, cursor):
        """Create the tables and the view, and record ``self.format``."""
        cursor.execute('''CREATE TABLE basis_tab(
                            basis_id INTEGER PRIMARY KEY AUTOINCREMENT,
                            name text,
                            description text,
                            UNIQUE(name)
                          );''')

        cursor.execute('''CREATE TABLE data_tab(
                            basis_id INTEGER,
                            elt TEXT,
                            data TEXT,
                            FOREIGN KEY(basis_id)
                            REFERENCES basis_tab(basis_id)
                          );''')

        cursor.execute('''CREATE TABLE format_tab(format TEXT)''')
        cursor.execute('''INSERT INTO format_tab VALUES (?)''',
                       [self.format])
        cursor.connection.commit()

        cursor.execute(''' CREATE VIEW output_tab AS
                            SELECT basis_id,
                                   name,
                                   description,
                                   elt,
                                   data
                            FROM   basis_tab
                            NATURAL JOIN data_tab
                       ''')

    def _download_basis(self, job, attemps_max):
        """Download and parse one basis set, retrying on any failure.

        Args:
            job: ``[name, path_xml, description, elts]``.
            attemps_max: maximum number of download + parse attempts.

        Returns:
            The parser result, or ``(name, description, None)`` when every
            attempt failed.
        """
        name, path_xml, des, elts = job

        url = "https://bse.pnl.gov:443/bse/portal/user/anon/js_peid/11535052407933/action/portlets.BasisSetAction/template/courier_content/panel/Main/"
        url += "/eventSubmit_doDownload/true"

        params = {'bsurl': path_xml, 'bsname': name,
                  'elts': " ".join(elts),
                  'format': self.format,
                  'minimize': self.contraction}

        last_error = None
        for _ in range(attemps_max):
            try:
                # The request is inside the try so that a network error is
                # retried as well instead of killing the worker thread.
                text = self.requests.get(url, params=params).text
                return self.parser(text, name, des, elts, self.debug)
            except Exception as err:
                last_error = err
                time.sleep(0.1)

        if self.debug:
            sys.stderr.write(
                "Fail to get {0!r}: {1!r}\n".format(name, last_error))
        return name, des, None

    def create_sql(self, dict_basis_list):
        """Create the SQLite database from scratch.

        Every basis set of `dict_basis_list` is downloaded by a pool of
        worker threads, parsed, and inserted in the database. A progress
        line is printed per basis set, suffixed with "Fail" when it could
        not be downloaded or inserted.

        Args:
            dict_basis_list: mapping whose values are
                ``[name, path_xml, description, elts]``, as built by
                ``basis_list_raw_to_array``.

        Raises:
            SystemExit: With status 1 when ``self.db_path`` already exists.
        """
        if os.path.isfile(self.db_path):
            sys.stderr.write("FAILLURE:\n")
            sys.stderr.write(
                "{0} file alredy exist. Delete or remove it\n".format(
                    self.db_path))
            sys.exit(1)

        try:
            import Queue as queue_module
        except ImportError:
            import queue as queue_module
        import threading

        num_worker_threads = 7
        attemps_max = 20

        q_in = queue_module.Queue(num_worker_threads)
        q_out = queue_module.Queue(num_worker_threads)

        def worker():
            """Get a job from q_in, download it, put the result in q_out."""
            while True:
                job = q_in.get()
                try:
                    if job is None:
                        # Sentinel: no more work for this thread.
                        return
                    q_out.put(self._download_basis(job, attemps_max))
                finally:
                    # Always acknowledge the job, otherwise q_in.join()
                    # would wait forever after a failure.
                    q_in.task_done()

        def enqueue():
            """Feed q_in with every job, then one sentinel per worker."""
            for name, path_xml, des, elts in dict_basis_list.values():
                q_in.put([name, path_xml, des, elts])
            for _ in range(num_worker_threads):
                q_in.put(None)

        conn = sqlite3.connect(self.db_path)
        try:
            c = conn.cursor()
            self._create_schema(c)

            t = threading.Thread(target=enqueue)
            t.daemon = True
            t.start()

            for _ in range(num_worker_threads):
                t = threading.Thread(target=worker)
                t.daemon = True
                t.start()

            nb_basis = len(dict_basis_list)

            for i in range(nb_basis):
                name, des, basis_data = q_out.get()
                q_out.task_done()

                str_indice = '{0:>3}'.format(i + 1)
                str_ = '{0} / {1} | {2}'.format(str_indice, nb_basis, name)

                if basis_data is None:
                    # Download or parsing failed after every attempt.
                    print("{0} Fail".format(str_))
                    continue

                # Add the basis name
                try:
                    cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
                    c.execute(cmd, [name, des])
                    conn.commit()
                except sqlite3.IntegrityError:
                    # Without a new row, lastrowid would still point to the
                    # previous basis: do not attach the data to it.
                    print("{0} Fail".format(str_))
                    continue

                # Add the basis data
                id_ = [c.lastrowid]

                try:
                    cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
                    c.executemany(cmd, [id_ + list(k) for k in basis_data])
                    conn.commit()
                except sqlite3.IntegrityError:
                    print("{0} Fail".format(str_))
                else:
                    print(str_)
        finally:
            conn.close()

        q_in.join()

    def new_db(self):
        """Create new_db from scratch"""
        _data = self.dwl_basis_list_raw()
        array_basis = self.basis_list_raw_to_array(_data)

        self.create_sql(array_basis)
|