mirror of
https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local
synced 2024-11-03 20:53:50 +01:00
Debug / Typo / Split parser in new parser.py
This commit is contained in:
parent
4f8c21368e
commit
b6503406f8
@ -155,14 +155,10 @@ if __name__ == '__main__':
|
|||||||
db_path = arguments["--db_path"]
|
db_path = arguments["--db_path"]
|
||||||
format = arguments["--format"]
|
format = arguments["--format"]
|
||||||
|
|
||||||
format_dict = EMSL_dump().get_list_format()
|
|
||||||
if format not in format_dict:
|
|
||||||
print "Format %s doesn't exist. Run list_formats to get the list of formats." % (format)
|
|
||||||
sys.exit(1)
|
|
||||||
contraction = not arguments["--no-contraction"]
|
contraction = not arguments["--no-contraction"]
|
||||||
|
|
||||||
e = EMSL_dump(db_path=db_path,
|
e = EMSL_dump(db_path=db_path,
|
||||||
format=format_dict[format],
|
format=format,
|
||||||
contraction=contraction)
|
contraction=contraction)
|
||||||
e.new_db()
|
e.new_db()
|
||||||
|
|
||||||
|
162
src/EMSL_dump.py
162
src/EMSL_dump.py
@ -4,6 +4,9 @@ import re
|
|||||||
import time
|
import time
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
|
from collections import OrderedDict
|
||||||
|
from src.parser import format_dict
|
||||||
|
|
||||||
|
|
||||||
def install_with_pip(name):
|
def install_with_pip(name):
|
||||||
|
|
||||||
@ -35,26 +38,24 @@ class EMSL_dump:
|
|||||||
This call implement all you need for download the EMSL and save it localy
|
This call implement all you need for download the EMSL and save it localy
|
||||||
"""
|
"""
|
||||||
|
|
||||||
format_dict = {"g94": "Gaussian94",
|
|
||||||
"gamess-us": "GAMESS-US",
|
|
||||||
"gamess-uk": "GAMESS-UK",
|
|
||||||
"turbomole": "Turbomole",
|
|
||||||
"tx93": "TX93",
|
|
||||||
"molpro": "Molpro",
|
|
||||||
"molproint": "MolproInt",
|
|
||||||
"hondo": "Hondo",
|
|
||||||
"supermolecule": "SuperMolecule",
|
|
||||||
"molcas": "Molcas",
|
|
||||||
"hyperchem": "HyperChem",
|
|
||||||
"dalton": "Dalton",
|
|
||||||
"demon-ks": "deMon-KS",
|
|
||||||
"demon2k": "deMon2k",
|
|
||||||
"aces2": "AcesII"
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
|
def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
|
||||||
self.db_path = db_path
|
self.db_path = db_path
|
||||||
self.format = format
|
|
||||||
|
if format not in format_dict:
|
||||||
|
print >> sys.stderr, "Format {0} doesn't exist. Choose in:".format(format)
|
||||||
|
print >> sys.stderr, format_dict.keys()
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
self.format = format
|
||||||
|
|
||||||
|
if format_dict[self.format]:
|
||||||
|
self.parser = format_dict[self.format]
|
||||||
|
else:
|
||||||
|
print >> sys.stderr, "We have no parser for this format"
|
||||||
|
print >> sys.stderr, "Fill free to Fock /pull request"
|
||||||
|
print >> sys.stderr, "You just need to add a function like"
|
||||||
|
print >> sys.stderr, "'parse_basis_data_gamess_us' to parse you'r format"
|
||||||
|
|
||||||
self.contraction = str(contraction)
|
self.contraction = str(contraction)
|
||||||
self.debug = True
|
self.debug = True
|
||||||
|
|
||||||
@ -68,25 +69,12 @@ class EMSL_dump:
|
|||||||
|
|
||||||
def get_list_format(self):
|
def get_list_format(self):
|
||||||
"""List all the format available in EMSL"""
|
"""List all the format available in EMSL"""
|
||||||
return self.format_dict
|
return format_dict
|
||||||
|
|
||||||
def set_db_path(self, path):
|
def set_db_path(self, path):
|
||||||
"""Define the database path"""
|
"""Define the database path"""
|
||||||
self.db_path = path
|
self.db_path = path
|
||||||
|
|
||||||
def get_dict_ele(self):
|
|
||||||
"""Return dict[atom]=[abreviation]"""
|
|
||||||
elt_path = os.path.dirname(sys.argv[0]) + "/src/elts_abrev.dat"
|
|
||||||
|
|
||||||
with open(elt_path, "r") as f:
|
|
||||||
data = f.readlines()
|
|
||||||
|
|
||||||
dict_ele = dict()
|
|
||||||
for i in data:
|
|
||||||
l = i.split("-")
|
|
||||||
dict_ele[l[1].strip().lower()] = l[2].strip().lower()
|
|
||||||
return dict_ele
|
|
||||||
|
|
||||||
def dwl_basis_list_raw(self):
|
def dwl_basis_list_raw(self):
|
||||||
"""Return the source code of the iframe
|
"""Return the source code of the iframe
|
||||||
who contains the list of the basis set available"""
|
who contains the list of the basis set available"""
|
||||||
@ -136,7 +124,8 @@ class EMSL_dump:
|
|||||||
9 - name of contributor
|
9 - name of contributor
|
||||||
10 - human-readable summary/description of basis set
|
10 - human-readable summary/description of basis set
|
||||||
"""
|
"""
|
||||||
d = {}
|
|
||||||
|
d = OrderedDict()
|
||||||
|
|
||||||
for line in data_raw.split('\n'):
|
for line in data_raw.split('\n'):
|
||||||
|
|
||||||
@ -148,69 +137,40 @@ class EMSL_dump:
|
|||||||
|
|
||||||
tup = eval(s)
|
tup = eval(s)
|
||||||
|
|
||||||
# non-published (e.g. rejected) basis sets should be ignored
|
xml_path = tup[0]
|
||||||
if tup[4] != "published":
|
|
||||||
|
# non-published (e.g. rejected) basis sets and ecp should be
|
||||||
|
# ignored
|
||||||
|
if tup[4] != "published" or "-ecp" in xml_path.lower():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
xml_path = tup[0]
|
|
||||||
name = tup[1]
|
name = tup[1]
|
||||||
|
|
||||||
elts = re.sub('[["\ \]]', '', tup[3]).split(',')
|
elts = re.sub('[["\ \]]', '', tup[3]).split(',')
|
||||||
|
|
||||||
des = re.sub('\s+', ' ', tup[-1])
|
des = re.sub('\s+', ' ', tup[-1])
|
||||||
|
|
||||||
if "-ecp" in xml_path.lower():
|
|
||||||
continue
|
|
||||||
d[name] = [name, xml_path, des, elts]
|
d[name] = [name, xml_path, des, elts]
|
||||||
|
|
||||||
return d
|
return d
|
||||||
|
|
||||||
def parse_basis_data_gamess_us(self, data, name, des, elts):
|
|
||||||
"""Parse the basis data raw html of gamess-us to get a nice tuple
|
|
||||||
Return [name, description, [[ele, data_ele],...]]"""
|
|
||||||
basis_data = []
|
|
||||||
|
|
||||||
b = data.find("$DATA")
|
|
||||||
e = data.find("$END")
|
|
||||||
if (b == -1 or data.find("$DATA$END") != -1):
|
|
||||||
if self.debug:
|
|
||||||
print data
|
|
||||||
raise Exception("WARNING not DATA")
|
|
||||||
else:
|
|
||||||
dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
|
|
||||||
"D+": "E+",
|
|
||||||
"D-": "E-"}
|
|
||||||
|
|
||||||
for k, v in dict_replace.iteritems():
|
|
||||||
data = data.replace(k, v)
|
|
||||||
|
|
||||||
data = data[b + 5:e - 1].split('\n\n')
|
|
||||||
|
|
||||||
dict_ele = self.get_dict_ele()
|
|
||||||
|
|
||||||
for (elt, data_elt) in zip(elts, data):
|
|
||||||
|
|
||||||
elt_long_th = dict_ele[elt.lower()]
|
|
||||||
elt_long_exp = data_elt.split()[0].lower()
|
|
||||||
|
|
||||||
if "$" in data_elt:
|
|
||||||
if self.debug:
|
|
||||||
print "Eror",
|
|
||||||
raise Exception("WARNING bad split")
|
|
||||||
|
|
||||||
if elt_long_th == elt_long_exp:
|
|
||||||
basis_data.append([elt, data_elt.strip()])
|
|
||||||
else:
|
|
||||||
if self.debug:
|
|
||||||
print "th", elt_long_th
|
|
||||||
print "exp", elt_long_exp
|
|
||||||
print "abv", elt
|
|
||||||
raise Exception("WARNING not a good ELEMENT")
|
|
||||||
|
|
||||||
return [name, des, basis_data]
|
|
||||||
|
|
||||||
|
# _____ _
|
||||||
|
# / __ \ | |
|
||||||
|
# | / \/_ __ ___ __ _| |_ ___
|
||||||
|
# | | | '__/ _ \/ _` | __/ _ \
|
||||||
|
# | \__/\ | | __/ (_| | || __/
|
||||||
|
# \____/_| \___|\__,_|\__\___|
|
||||||
|
#
|
||||||
def create_sql(self, dict_basis_list):
|
def create_sql(self, dict_basis_list):
|
||||||
"""Create the sql from the list of basis available data"""
|
"""Create the sql from strach.
|
||||||
|
Take the list of basis available data,
|
||||||
|
download her, put her in sql"""
|
||||||
|
|
||||||
|
if os.path.isfile(self.db_path):
|
||||||
|
print >> sys.stderr, "FAILLURE:"
|
||||||
|
print >> sys.stderr, "{0} file alredy exist.".format(self.db_path),
|
||||||
|
print >> sys.stderr, "Delete or remove it"
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
conn = sqlite3.connect(self.db_path)
|
conn = sqlite3.connect(self.db_path)
|
||||||
c = conn.cursor()
|
c = conn.cursor()
|
||||||
@ -240,8 +200,6 @@ class EMSL_dump:
|
|||||||
NATURAL JOIN data_tab
|
NATURAL JOIN data_tab
|
||||||
''')
|
''')
|
||||||
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
import Queue
|
import Queue
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
@ -269,12 +227,11 @@ class EMSL_dump:
|
|||||||
while attemps < attemps_max:
|
while attemps < attemps_max:
|
||||||
text = self.requests.get(url, params=params).text
|
text = self.requests.get(url, params=params).text
|
||||||
try:
|
try:
|
||||||
basis_data = self.parse_basis_data_gamess_us(
|
basis_data = self.parser(text, name, des, elts,
|
||||||
text,
|
self.debug)
|
||||||
name,
|
|
||||||
des,
|
|
||||||
elts)
|
|
||||||
except:
|
except:
|
||||||
|
if self.debug:
|
||||||
|
raise
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
attemps += 1
|
attemps += 1
|
||||||
else:
|
else:
|
||||||
@ -310,24 +267,33 @@ class EMSL_dump:
|
|||||||
name, des, basis_data = q_out.get()
|
name, des, basis_data = q_out.get()
|
||||||
q_out.task_done()
|
q_out.task_done()
|
||||||
|
|
||||||
|
str_indice = '{:>3}'.format(i + 1)
|
||||||
|
str_ = '{0} / {1} | {2}'.format(str_indice, nb_basis, name)
|
||||||
|
|
||||||
|
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||||
|
# A d d _ t h e _ b a s i s _ n a m e #
|
||||||
|
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||||
try:
|
try:
|
||||||
cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
|
cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
|
||||||
c.execute(cmd, [name, des])
|
c.execute(cmd, [name, des])
|
||||||
conn.commit()
|
conn.commit()
|
||||||
except sqlite3.IntegrityError:
|
except sqlite3.IntegrityError:
|
||||||
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail"
|
print str_, "Fail"
|
||||||
|
|
||||||
|
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||||
|
# A d d _ t h e _ b a s i s _ d a t a #
|
||||||
|
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||||
|
|
||||||
id_ = [c.lastrowid]
|
id_ = [c.lastrowid]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cmd = "INSERT INTO data_tab VALUES (?,?,?)"
|
cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
|
||||||
c.executemany(cmd, [id_ + k for k in basis_data])
|
c.executemany(cmd, [id_ + k for k in basis_data])
|
||||||
conn.commit()
|
conn.commit()
|
||||||
print '{:>3}'.format(i + 1), "/", nb_basis, name
|
except sqlite3.IntegrityError:
|
||||||
|
print str_, "Fail"
|
||||||
except:
|
else:
|
||||||
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail"
|
print str_
|
||||||
raise
|
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
q_in.join()
|
q_in.join()
|
||||||
|
92
src/parser.py
Normal file
92
src/parser.py
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def get_dict_ele(elt_path=None):
    """Load the element table used to check parsed basis-set blocks.

    Every usable line of the table is split on "-"; the second field becomes
    the key and the third field the value, both stripped and lower-cased.
    The GAMESS-US parser looks the table up with an element abbreviation and
    compares the value with the first word of each element block.

    Args:
        elt_path: optional explicit path of the table. When omitted, the
            file "src/elts_abrev.dat" located next to the launched script
            (sys.argv[0]) is read.

    Returns:
        dict mapping the lower-cased second field of each line to its
        lower-cased third field.

    Raises:
        IOError / OSError: if the table cannot be opened.
    """
    if elt_path is None:
        # Resolve the script location first: dirname() of a bare script name
        # is "", which previously produced the root path "/src/elts_abrev.dat".
        script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        elt_path = os.path.join(script_dir, "src", "elts_abrev.dat")

    dict_ele = dict()
    with open(elt_path, "r") as f:
        for line in f:
            fields = line.split("-")
            if len(fields) < 3:
                # Blank or malformed line (e.g. a trailing newline at the end
                # of the file): nothing to record.
                continue
            dict_ele[fields[1].strip().lower()] = fields[2].strip().lower()

    return dict_ele
|
||||||
|
|
||||||
|
|
||||||
|
# ______
|
||||||
|
# | ___ \
|
||||||
|
# | |_/ /_ _ _ __ ___ ___ _ __
|
||||||
|
# | __/ _` | '__/ __|/ _ \ '__|
|
||||||
|
# | | | (_| | | \__ \ __/ |
|
||||||
|
# \_| \__,_|_| |___/\___|_|
|
||||||
|
#
|
||||||
|
|
||||||
|
# __
|
||||||
|
# /__ _. ._ _ _ _ _ _
|
||||||
|
# \_| (_| | | | (/_ _> _> |_| _>
|
||||||
|
#
|
||||||
|
def parse_basis_data_gamess_us(data, name, des, elts, debug=False,
                               dict_ele=None):
    """Parse the raw GAMESS-US text of one basis set.

    The text between the "$DATA" and "$END" markers is cut into blocks
    separated by an empty line; the blocks are paired, in order, with the
    entries of `elts`, and the first word of every block must be the full
    name of the paired element.

    Args:
        data: raw text downloaded for the basis set.
        name: basis-set name, returned unchanged.
        des: basis-set description, returned unchanged.
        elts: element abbreviations, in the order of the blocks in `data`.
        debug: when true, print diagnostic details before raising.
        dict_ele: optional abbreviation -> full-name table (lower-case).
            When omitted it is loaded with get_dict_ele().

    Returns:
        [name, des, [[elt, block_text], ...]]

    Raises:
        Exception: if a marker is missing, the data section is empty, a
            block is empty or still contains "$", or a block does not start
            with the expected element name.
        KeyError: if an abbreviation of `elts` is not in the table.
    """
    b = data.find("$DATA")
    # Look for the terminator after the opening marker only.
    e = data.find("$END", b) if b != -1 else -1

    # A missing "$END" used to go unnoticed: find() returned -1 and the slice
    # below silently chopped the end of the data.
    if b == -1 or e == -1 or data.find("$DATA$END") != -1:
        if debug:
            print(data)
        raise Exception("WARNING not DATA")

    # Slice first, substitute afterwards: "PHOSPHOROUS" -> "PHOSPHORUS"
    # changes the length of the text, so offsets computed on the original
    # string must not be applied to the substituted one.
    body = data[b + 5:e - 1]

    dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
                    "D+": "E+",
                    "D-": "E-"}
    for k, v in dict_replace.items():
        body = body.replace(k, v)

    if dict_ele is None:
        dict_ele = get_dict_ele()

    basis_data = []
    # NOTE(review): zip() stops at the shorter sequence, so a mismatch
    # between the number of elements and the number of blocks is not
    # reported -- confirm whether that is intended.
    for (elt, data_elt) in zip(elts, body.split('\n\n')):

        elt_long_th = dict_ele[elt.lower()]

        words = data_elt.split()
        if "$" in data_elt or not words:
            if debug:
                print("Eror")
            raise Exception("WARNING bad split")

        elt_long_exp = words[0].lower()

        if elt_long_th != elt_long_exp:
            if debug:
                print("th {0}".format(elt_long_th))
                print("exp {0}".format(elt_long_exp))
                print("abv {0}".format(elt))
            raise Exception("WARNING not a good ELEMENT")

        basis_data.append([elt, data_elt.strip()])

    return [name, des, basis_data]
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Dispatch table keyed by format name. The value is the function used to
# parse the raw data of that format, or None when no parser has been written
# yet for it (only GAMESS-US is handled so far).
format_dict = {
    "Gaussian94": None,
    "GAMESS-US": parse_basis_data_gamess_us,
    "GAMESS-UK": None,
    "Turbomole": None,
    "TX93": None,
    "Molpro": None,
    "MolproInt": None,
    "Hondo": None,
    "SuperMolecule": None,
    "Molcas": None,
    "HyperChem": None,
    "Dalton": None,
    "deMon-KS": None,
    "deMon2k": None,
    "AcesII": None,
}
|
Loading…
Reference in New Issue
Block a user