10
0
mirror of https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local synced 2024-11-01 03:33:46 +01:00
EMSL_Basis_Set_Exchange_Local/src/EMSL_dump.py

293 lines
9.3 KiB
Python
Raw Normal View History

2014-11-12 18:12:07 +01:00
import os
2015-03-11 10:32:44 +01:00
import sys
import re
import time
2015-03-11 10:32:44 +01:00
import sqlite3
2014-11-12 14:30:26 +01:00
from collections import OrderedDict
2014-11-12 14:30:26 +01:00
2015-01-12 14:22:13 +01:00
def install_with_pip(name):
    """Ask the user whether to install the package *name*, then install it.

    The question is repeated until the answer is ``y`` or ``n`` (case
    insensitive, surrounding blanks ignored). On ``n`` the function returns
    without doing anything. On ``y`` the package is installed by running
    ``<current python> -m pip install <name>``.

    The process exits with status 1 when pip cannot be imported or when the
    pip command reports a failure.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input

    answers = {'y': True,
               'n': False}

    while True:
        choice = ask('Do you want to install it ? [Y/N]')
        try:
            ins = answers[choice.strip().lower()]
        except KeyError:
            print("not a valid choice")
        else:
            break

    if not ins:
        return

    # Only a missing pip should produce the "You need pip" hint; an install
    # failure is reported separately below.
    try:
        import pip  # noqa: F401 -- imported only to check that pip exists
    except ImportError:
        print("You need pip")
        print("(http://pip.readthedocs.org/en/latest/installing.html)")
        sys.exit(1)

    # pip has no supported in-process API (pip.main was removed in pip 10),
    # so run it as a command with the interpreter that runs this script.
    import subprocess
    status = subprocess.call([sys.executable, '-m', 'pip', 'install', name])
    if status != 0:
        sys.stderr.write("pip failed to install {0}\n".format(name))
        sys.exit(1)
2014-11-12 14:30:26 +01:00
class EMSL_dump:
    """Download the EMSL Basis Set Exchange and save it in a local SQLite file.

    The file was written for Python 2; the syntax used in this class is also
    accepted by Python 3.
    """

    def __init__(self, db_path=None, format="GAMESS-US", contraction="True",
                 debug=True):
        """Select the output format and the location of the database.

        db_path     -- path of the SQLite file. When empty or None the file
                       ``../db/<format>.db`` relative to this module is used.
        format      -- name of the basis set format; it is passed through
                       ``check_format`` and selects the parser function.
        contraction -- stored as a string and sent as the ``minimize``
                       parameter of every download request.
        debug       -- when true the basis set list is cached in ``db/cache``
                       and the flag is forwarded to the parser.

        Raises ImportError when the ``requests`` package is missing and was
        not installed on request.
        """
        from src.parser_handler import get_parser_function
        from src.parser_handler import check_format

        self.format = check_format(format)
        self.parser = get_parser_function(self.format)

        # Define the database path
        if db_path:
            self.db_path = db_path
        else:
            head_path = os.path.dirname(__file__)
            db_path = "{0}/../db/{1}.db".format(head_path, self.format)
            self.db_path = os.path.abspath(db_path)

        self.contraction = str(contraction)
        self.debug = debug

        try:
            import requests
        except ImportError:
            print("You need the requests package")
            install_with_pip("requests")
            # The name is still unbound here: import again now that the
            # package may have been installed. If the user declined, the
            # ImportError propagates instead of an obscure NameError.
            import requests
        self.requests = requests

    def get_list_format(self):
        """Return the list of format names known to ``parser_dict``."""
        from src.parser_handler import parser_dict
        return list(parser_dict.keys())

    def dwl_basis_list_raw(self):
        """Return the source code of the iframe
        which contains the list of the available basis sets.

        When ``self.debug`` is true the page is read from the pickle file
        ``db/cache`` (relative to the current directory) if it exists, and
        written there after the download otherwise.
        """
        print("Download all the name available in EMSL.")
        # No newline here: "Done" is printed on the same line at the end.
        sys.stdout.write("It can take some time. ")
        sys.stdout.flush()

        url = "https://bse.pnl.gov/bse/portal/user/anon/js_peid/11535052407933/panel/Main/template/content"

        if not self.debug:
            page = self.requests.get(url).text
        else:
            try:
                import cPickle as pickle  # Python 2
            except ImportError:
                import pickle  # Python 3

            dbcache = 'db/cache'
            if os.path.isfile(dbcache):
                # NOTE(security): unpickling can run arbitrary code; the
                # cache file must only ever be written by this method.
                with open(dbcache, 'rb') as cache_file:
                    page = pickle.load(cache_file)
            else:
                page = self.requests.get(url).text
                with open(dbcache, 'wb') as cache_file:
                    pickle.dump(page, cache_file)

        print("Done")
        return page

    def basis_list_raw_to_array(self, data_raw):
        """Parse the raw html basis set list to create a dict
        with all the information needed for downloading the database:

        Return d[name] = [name, xml_path, description,
                          list of the elements available]

        Only lines containing ``new basisSet(`` are read. Entries whose
        status is not 'published' or whose xml path contains '-ecp'
        (case insensitive) are ignored.

        Explanation of tuple data from 'tup' by index:

        0 - path to xml file
        1 - basis set name
        2 - categorization: "dftcfit", "dftorb", "dftxfit", "diffuse",
            "ecporb","effective core potential", "orbital", "polarization",
            "rydberg", or "tight"
        3 - parameterized elements by symbol e.g. '[H, He, B, C, N, O, F, Ne]'
        4 - curation status; only 'published' is trustworthy
        5 - boolean: has ECP
        6 - boolean: has spin
        7 - last modified date
        8 - name of primary developer
        9 - name of contributor
        10 - human-readable summary/description of basis set
        """
        import ast

        marker = "new basisSet("
        d = OrderedDict()

        for line in data_raw.split('\n'):
            start = line.find(marker)
            if start == -1:
                continue
            start += len(marker)
            end = line.find(");", start)

            # NOTE(security): this text comes from a web page. It used to be
            # passed to eval(); literal_eval only accepts literals, so the
            # page cannot make this program execute code.
            tup = ast.literal_eval(line[start:end].strip())

            xml_path = tup[0]
            # non-published (e.g. rejected) basis sets and ecp should be
            # ignored
            if tup[4] != "published" or "-ecp" in xml_path.lower():
                continue

            name = tup[1]
            # '[H, He]' -> ['H', 'He']; an empty list gives no element.
            symbols = re.sub(r'[\[\]" ]', '', tup[3]).split(',')
            elts = [symbol for symbol in symbols if symbol]
            des = re.sub(r'\s+', ' ', tup[-1])

            d[name] = [name, xml_path, des, elts]

        return d

    #  _____                _
    # /  __ \              | |
    # | /  \/_ __ ___  __ _| |_ ___
    # | |   | '__/ _ \/ _` | __/ _ \
    # | \__/\ | |  __/ (_| | ||  __/
    #  \____/_|  \___|\__,_|\__\___|
    #
    def create_sql(self, dict_basis_list):
        """Create the sql database from scratch.

        Take the dict of available basis sets (as returned by
        basis_list_raw_to_array), download each of them with a pool of
        worker threads and store the parsed data in the SQLite file.

        Exits with status 1 when ``self.db_path`` already exists. One line
        is printed per basis set; it ends with "Fail" when the download
        could not be parsed after all attempts or when the insertion is
        rejected by the database.
        """
        if os.path.isfile(self.db_path):
            sys.stderr.write("FAILLURE:\n")
            sys.stderr.write(
                "{0} file alredy exist. Delete or remove it\n".format(
                    self.db_path))
            sys.exit(1)

        try:
            import Queue as queue  # Python 2
        except ImportError:
            import queue  # Python 3
        import threading

        num_worker_threads = 7
        attemps_max = 20

        # All the task need to be executed
        nb_basis = len(dict_basis_list)
        q_in = queue.Queue(nb_basis)
        # Populate the q_in list
        for name, path_xml, des, elts in dict_basis_list.values():
            q_in.put([name, path_xml, des, elts])

        # All the tasks which have been executed
        q_out = queue.Queue(num_worker_threads)

        url = "https://bse.pnl.gov:443/bse/portal/user/anon/js_peid/11535052407933/action/portlets.BasisSetAction/template/courier_content/panel/Main/"
        url += "/eventSubmit_doDownload/true"

        def worker():
            """Get jobs from q_in, download and parse them,
            put each result in q_out; stop when q_in is empty."""
            while True:
                try:
                    name, path_xml, des, elts = q_in.get_nowait()
                except queue.Empty:
                    # q_in is completely filled before the workers start.
                    return

                params = {'bsurl': path_xml, 'bsname': name,
                          'elts': " ".join(elts),
                          'format': self.format,
                          'minimize': self.contraction}

                # basis_data stays None when every attempt fails, so the
                # consumer is told about the failure instead of waiting
                # forever for a result which never comes.
                result = (name, des, None)
                for _ in range(attemps_max):
                    try:
                        text = self.requests.get(url, params=params).text
                        result = self.parser(text, name, des, elts,
                                             self.debug)
                    except Exception:
                        time.sleep(0.1)
                    else:
                        break

                q_out.put(result)
                # Needed by the final q_in.join().
                q_in.task_done()

        conn = sqlite3.connect(self.db_path)
        try:
            c = conn.cursor()

            c.execute('''CREATE TABLE basis_tab(
                            basis_id INTEGER PRIMARY KEY AUTOINCREMENT,
                                name text,
                         description text,
                                    UNIQUE(name)
                      );''')

            c.execute('''CREATE TABLE data_tab(
                            basis_id INTEGER,
                                 elt TEXT,
                                data TEXT,
                         FOREIGN KEY(basis_id)
                         REFERENCES basis_tab(basis_id)
                         );''')

            c.execute('''CREATE TABLE format_tab(format TEXT)''')
            c.execute('''INSERT INTO format_tab VALUES (?)''', [self.format])
            conn.commit()

            c.execute(''' CREATE VIEW output_tab AS
                            SELECT basis_id,
                                   name,
                                   description,
                                   elt,
                                   data
                            FROM   basis_tab
                        NATURAL JOIN   data_tab
                        ''')

            # Create all the worker (q_in |> worker |> q_out)
            for _ in range(num_worker_threads):
                t = threading.Thread(target=worker)
                t.daemon = True
                t.start()

            # Take the result from the out queue (populated by the workers)
            # and put it in the SQL database
            for i in range(nb_basis):
                name, des, basis_data = q_out.get()
                q_out.task_done()

                str_indice = '{:>3}'.format(i + 1)
                str_ = '{0} / {1} | {2}'.format(str_indice, nb_basis, name)

                if basis_data is None:
                    print("{0} Fail".format(str_))
                    continue

                try:
                    # Add the basis name
                    cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
                    c.execute(cmd, [name, des])
                    basis_id = c.lastrowid

                    # Add the basis data; the rows are only written when
                    # the name was accepted, so they cannot be attached to
                    # the id of a previous basis set.
                    cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
                    c.executemany(cmd, [[basis_id] + list(k)
                                        for k in basis_data])
                except sqlite3.IntegrityError:
                    conn.rollback()
                    print("{0} Fail".format(str_))
                else:
                    conn.commit()
                    print(str_)

            q_in.join()
        finally:
            conn.close()

    def new_db(self):
        """Create a new database from scratch."""
        _data = self.dwl_basis_list_raw()
        array_basis = self.basis_list_raw_to_array(_data)
        self.create_sql(array_basis)