mirror of
https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local
synced 2024-12-22 04:13:43 +01:00
Debug / Typo / Split parser in new parser.py
This commit is contained in:
parent
4f8c21368e
commit
b6503406f8
@ -155,14 +155,10 @@ if __name__ == '__main__':
|
||||
db_path = arguments["--db_path"]
|
||||
format = arguments["--format"]
|
||||
|
||||
format_dict = EMSL_dump().get_list_format()
|
||||
if format not in format_dict:
|
||||
print "Format %s doesn't exist. Run list_formats to get the list of formats." % (format)
|
||||
sys.exit(1)
|
||||
contraction = not arguments["--no-contraction"]
|
||||
|
||||
e = EMSL_dump(db_path=db_path,
|
||||
format=format_dict[format],
|
||||
format=format,
|
||||
contraction=contraction)
|
||||
e.new_db()
|
||||
|
||||
|
162
src/EMSL_dump.py
162
src/EMSL_dump.py
@ -4,6 +4,9 @@ import re
|
||||
import time
|
||||
import sqlite3
|
||||
|
||||
from collections import OrderedDict
|
||||
from src.parser import format_dict
|
||||
|
||||
|
||||
def install_with_pip(name):
|
||||
|
||||
@ -35,26 +38,24 @@ class EMSL_dump:
|
||||
This call implement all you need for download the EMSL and save it localy
|
||||
"""
|
||||
|
||||
format_dict = {"g94": "Gaussian94",
|
||||
"gamess-us": "GAMESS-US",
|
||||
"gamess-uk": "GAMESS-UK",
|
||||
"turbomole": "Turbomole",
|
||||
"tx93": "TX93",
|
||||
"molpro": "Molpro",
|
||||
"molproint": "MolproInt",
|
||||
"hondo": "Hondo",
|
||||
"supermolecule": "SuperMolecule",
|
||||
"molcas": "Molcas",
|
||||
"hyperchem": "HyperChem",
|
||||
"dalton": "Dalton",
|
||||
"demon-ks": "deMon-KS",
|
||||
"demon2k": "deMon2k",
|
||||
"aces2": "AcesII"
|
||||
}
|
||||
|
||||
def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
|
||||
self.db_path = db_path
|
||||
self.format = format
|
||||
|
||||
if format not in format_dict:
|
||||
print >> sys.stderr, "Format {0} doesn't exist. Choose in:".format(format)
|
||||
print >> sys.stderr, format_dict.keys()
|
||||
sys.exit(1)
|
||||
else:
|
||||
self.format = format
|
||||
|
||||
if format_dict[self.format]:
|
||||
self.parser = format_dict[self.format]
|
||||
else:
|
||||
print >> sys.stderr, "We have no parser for this format"
|
||||
print >> sys.stderr, "Fill free to Fock /pull request"
|
||||
print >> sys.stderr, "You just need to add a function like"
|
||||
print >> sys.stderr, "'parse_basis_data_gamess_us' to parse you'r format"
|
||||
|
||||
self.contraction = str(contraction)
|
||||
self.debug = True
|
||||
|
||||
@ -68,25 +69,12 @@ class EMSL_dump:
|
||||
|
||||
def get_list_format(self):
|
||||
"""List all the format available in EMSL"""
|
||||
return self.format_dict
|
||||
return format_dict
|
||||
|
||||
def set_db_path(self, path):
|
||||
"""Define the database path"""
|
||||
self.db_path = path
|
||||
|
||||
def get_dict_ele(self):
|
||||
"""Return dict[atom]=[abreviation]"""
|
||||
elt_path = os.path.dirname(sys.argv[0]) + "/src/elts_abrev.dat"
|
||||
|
||||
with open(elt_path, "r") as f:
|
||||
data = f.readlines()
|
||||
|
||||
dict_ele = dict()
|
||||
for i in data:
|
||||
l = i.split("-")
|
||||
dict_ele[l[1].strip().lower()] = l[2].strip().lower()
|
||||
return dict_ele
|
||||
|
||||
def dwl_basis_list_raw(self):
|
||||
"""Return the source code of the iframe
|
||||
who contains the list of the basis set available"""
|
||||
@ -136,7 +124,8 @@ class EMSL_dump:
|
||||
9 - name of contributor
|
||||
10 - human-readable summary/description of basis set
|
||||
"""
|
||||
d = {}
|
||||
|
||||
d = OrderedDict()
|
||||
|
||||
for line in data_raw.split('\n'):
|
||||
|
||||
@ -148,69 +137,40 @@ class EMSL_dump:
|
||||
|
||||
tup = eval(s)
|
||||
|
||||
# non-published (e.g. rejected) basis sets should be ignored
|
||||
if tup[4] != "published":
|
||||
xml_path = tup[0]
|
||||
|
||||
# non-published (e.g. rejected) basis sets and ecp should be
|
||||
# ignored
|
||||
if tup[4] != "published" or "-ecp" in xml_path.lower():
|
||||
continue
|
||||
|
||||
xml_path = tup[0]
|
||||
name = tup[1]
|
||||
|
||||
elts = re.sub('[["\ \]]', '', tup[3]).split(',')
|
||||
|
||||
des = re.sub('\s+', ' ', tup[-1])
|
||||
|
||||
if "-ecp" in xml_path.lower():
|
||||
continue
|
||||
d[name] = [name, xml_path, des, elts]
|
||||
|
||||
return d
|
||||
|
||||
def parse_basis_data_gamess_us(self, data, name, des, elts):
|
||||
"""Parse the basis data raw html of gamess-us to get a nice tuple
|
||||
Return [name, description, [[ele, data_ele],...]]"""
|
||||
basis_data = []
|
||||
|
||||
b = data.find("$DATA")
|
||||
e = data.find("$END")
|
||||
if (b == -1 or data.find("$DATA$END") != -1):
|
||||
if self.debug:
|
||||
print data
|
||||
raise Exception("WARNING not DATA")
|
||||
else:
|
||||
dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
|
||||
"D+": "E+",
|
||||
"D-": "E-"}
|
||||
|
||||
for k, v in dict_replace.iteritems():
|
||||
data = data.replace(k, v)
|
||||
|
||||
data = data[b + 5:e - 1].split('\n\n')
|
||||
|
||||
dict_ele = self.get_dict_ele()
|
||||
|
||||
for (elt, data_elt) in zip(elts, data):
|
||||
|
||||
elt_long_th = dict_ele[elt.lower()]
|
||||
elt_long_exp = data_elt.split()[0].lower()
|
||||
|
||||
if "$" in data_elt:
|
||||
if self.debug:
|
||||
print "Eror",
|
||||
raise Exception("WARNING bad split")
|
||||
|
||||
if elt_long_th == elt_long_exp:
|
||||
basis_data.append([elt, data_elt.strip()])
|
||||
else:
|
||||
if self.debug:
|
||||
print "th", elt_long_th
|
||||
print "exp", elt_long_exp
|
||||
print "abv", elt
|
||||
raise Exception("WARNING not a good ELEMENT")
|
||||
|
||||
return [name, des, basis_data]
|
||||
|
||||
# _____ _
|
||||
# / __ \ | |
|
||||
# | / \/_ __ ___ __ _| |_ ___
|
||||
# | | | '__/ _ \/ _` | __/ _ \
|
||||
# | \__/\ | | __/ (_| | || __/
|
||||
# \____/_| \___|\__,_|\__\___|
|
||||
#
|
||||
def create_sql(self, dict_basis_list):
|
||||
"""Create the sql from the list of basis available data"""
|
||||
"""Create the sql from strach.
|
||||
Take the list of basis available data,
|
||||
download her, put her in sql"""
|
||||
|
||||
if os.path.isfile(self.db_path):
|
||||
print >> sys.stderr, "FAILLURE:"
|
||||
print >> sys.stderr, "{0} file alredy exist.".format(self.db_path),
|
||||
print >> sys.stderr, "Delete or remove it"
|
||||
sys.exit(1)
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
c = conn.cursor()
|
||||
@ -240,8 +200,6 @@ class EMSL_dump:
|
||||
NATURAL JOIN data_tab
|
||||
''')
|
||||
|
||||
conn.commit()
|
||||
|
||||
import Queue
|
||||
import threading
|
||||
|
||||
@ -269,12 +227,11 @@ class EMSL_dump:
|
||||
while attemps < attemps_max:
|
||||
text = self.requests.get(url, params=params).text
|
||||
try:
|
||||
basis_data = self.parse_basis_data_gamess_us(
|
||||
text,
|
||||
name,
|
||||
des,
|
||||
elts)
|
||||
basis_data = self.parser(text, name, des, elts,
|
||||
self.debug)
|
||||
except:
|
||||
if self.debug:
|
||||
raise
|
||||
time.sleep(0.1)
|
||||
attemps += 1
|
||||
else:
|
||||
@ -310,24 +267,33 @@ class EMSL_dump:
|
||||
name, des, basis_data = q_out.get()
|
||||
q_out.task_done()
|
||||
|
||||
str_indice = '{:>3}'.format(i + 1)
|
||||
str_ = '{0} / {1} | {2}'.format(str_indice, nb_basis, name)
|
||||
|
||||
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||
# A d d _ t h e _ b a s i s _ n a m e #
|
||||
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||
try:
|
||||
cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
|
||||
c.execute(cmd, [name, des])
|
||||
conn.commit()
|
||||
except sqlite3.IntegrityError:
|
||||
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail"
|
||||
print str_, "Fail"
|
||||
|
||||
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||
# A d d _ t h e _ b a s i s _ d a t a #
|
||||
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
|
||||
|
||||
id_ = [c.lastrowid]
|
||||
|
||||
try:
|
||||
cmd = "INSERT INTO data_tab VALUES (?,?,?)"
|
||||
cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
|
||||
c.executemany(cmd, [id_ + k for k in basis_data])
|
||||
conn.commit()
|
||||
print '{:>3}'.format(i + 1), "/", nb_basis, name
|
||||
|
||||
except:
|
||||
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail"
|
||||
raise
|
||||
|
||||
except sqlite3.IntegrityError:
|
||||
print str_, "Fail"
|
||||
else:
|
||||
print str_
|
||||
conn.close()
|
||||
|
||||
q_in.join()
|
||||
|
92
src/parser.py
Normal file
92
src/parser.py
Normal file
@ -0,0 +1,92 @@
|
||||
import sys
|
||||
|
||||
|
||||
def get_dict_ele():
    """Load the element table used to cross-check parsed basis data.

    The table is read from ``src/elts_abrev.dat``, looked up relative to the
    directory of the running script (``sys.argv[0]``).  Every non-blank line
    is split on ``-``; the second field becomes the key and the third field
    the value, both stripped and lower-cased.  The GAMESS-US parser looks an
    element abbreviation up in this dict and compares the result with the
    element name found at the head of each data section.

    Returns:
        dict: ``{second_field: third_field}`` for every non-blank line.

    Raises:
        IOError: if the data file cannot be opened.
        IndexError: if a non-blank line has fewer than three ``-`` fields.
    """
    # os.path.join keeps the path relative when the script is started
    # without a directory part (``python EMSL_api.py``); plain string
    # concatenation would have produced the absolute "/src/elts_abrev.dat".
    elt_path = os.path.join(os.path.dirname(sys.argv[0]),
                            "src", "elts_abrev.dat")

    dict_ele = dict()
    with open(elt_path, "r") as f:
        for line in f:
            # Tolerate blank lines (typically a trailing newline at the end
            # of the file) instead of failing on the field lookup below.
            if not line.strip():
                continue
            fields = line.split("-")
            dict_ele[fields[1].strip().lower()] = fields[2].strip().lower()

    return dict_ele
|
||||
|
||||
|
||||
# ______
|
||||
# | ___ \
|
||||
# | |_/ /_ _ _ __ ___ ___ _ __
|
||||
# | __/ _` | '__/ __|/ _ \ '__|
|
||||
# | | | (_| | | \__ \ __/ |
|
||||
# \_| \__,_|_| |___/\___|_|
|
||||
#
|
||||
|
||||
# __
|
||||
# /__ _. ._ _ _ _ _ _
|
||||
# \_| (_| | | | (/_ _> _> |_| _>
|
||||
#
|
||||
def parse_basis_data_gamess_us(data, name, des, elts, debug=False):
    """Parse the raw GAMESS-US text of one basis set.

    The basis is expected between a ``$DATA`` and a ``$END`` marker, as
    sections separated by blank lines.  Sections are paired with ``elts`` in
    order, and the first word of each section must match the name that
    ``get_dict_ele()`` gives for the paired abbreviation.

    Args:
        data: raw text downloaded for the basis set.
        name: basis-set name, returned unchanged.
        des: basis-set description, returned unchanged.
        elts: element abbreviations, in the same order as the sections.
        debug: when true, print diagnostics to stdout before raising.

    Returns:
        list: ``[name, des, [[elt, data_elt], ...]]`` where ``data_elt`` is
        the stripped text of the section for ``elt``.

    Raises:
        Exception: "WARNING not DATA" when the ``$DATA``/``$END`` block is
            missing or empty, "WARNING bad split" when a section still
            contains a ``$`` marker, "WARNING not a good ELEMENT" when a
            section does not start with the expected element name.
        KeyError: if an abbreviation is not in the element table.
        IndexError: if a paired section is empty.
    """
    if data.find("$DATA") == -1 or data.find("$DATA$END") != -1:
        if debug:
            print(data)
        raise Exception("WARNING not DATA")

    # Normalise the text BEFORE locating the markers: the PHOSPHOROUS fix
    # shortens the string, so offsets computed on the raw text would be
    # stale afterwards.  "D+"/"D-" -> "E+"/"E-" presumably converts
    # Fortran-style exponent markers -- TODO confirm against real data.
    dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
                    "D+": "E+",
                    "D-": "E-"}
    for k, v in dict_replace.items():
        data = data.replace(k, v)

    b = data.find("$DATA")
    # Only a "$END" that follows "$DATA" can close the block.
    e = data.find("$END", b + 5)
    if e == -1:
        # Without a closing marker the old slice silently dropped the last
        # characters of the text; report it so the caller can retry.
        if debug:
            print(data)
        raise Exception("WARNING not DATA")

    # Skip the 5 characters of "$DATA" and the character just before "$END".
    sections = data[b + 5:e - 1].split('\n\n')

    dict_ele = get_dict_ele()

    basis_data = []
    for elt, data_elt in zip(elts, sections):

        elt_long_th = dict_ele[elt.lower()]
        elt_long_exp = data_elt.split()[0].lower()

        if "$" in data_elt:
            if debug:
                sys.stdout.write("Eror ")
            raise Exception("WARNING bad split")

        if elt_long_th != elt_long_exp:
            if debug:
                print("th %s" % elt_long_th)
                print("exp %s" % elt_long_exp)
                print("abv %s" % elt)
            raise Exception("WARNING not a good ELEMENT")

        basis_data.append([elt, data_elt.strip()])

    return [name, des, basis_data]
|
||||
|
||||
import os
|
||||
|
||||
# Registry of output formats: format name -> parser function.
# A value of ``None`` means the format is listed but no parser is
# implemented for it in this module.
format_dict = {
    "Gaussian94": None,
    "GAMESS-US": parse_basis_data_gamess_us,
    "GAMESS-UK": None,
    "Turbomole": None,
    "TX93": None,
    "Molpro": None,
    "MolproInt": None,
    "Hondo": None,
    "SuperMolecule": None,
    "Molcas": None,
    "HyperChem": None,
    "Dalton": None,
    "deMon-KS": None,
    "deMon2k": None,
    "AcesII": None,
}
|
Loading…
Reference in New Issue
Block a user