10
0
mirror of https://github.com/LCPQ/EMSL_Basis_Set_Exchange_Local synced 2024-11-03 20:53:50 +01:00

Debug / Typo / Split parser in new parser.py

This commit is contained in:
Thomas Applencourt 2015-03-16 14:35:25 +01:00
parent 4f8c21368e
commit b6503406f8
3 changed files with 157 additions and 103 deletions

View File

@ -155,14 +155,10 @@ if __name__ == '__main__':
db_path = arguments["--db_path"] db_path = arguments["--db_path"]
format = arguments["--format"] format = arguments["--format"]
format_dict = EMSL_dump().get_list_format()
if format not in format_dict:
print "Format %s doesn't exist. Run list_formats to get the list of formats." % (format)
sys.exit(1)
contraction = not arguments["--no-contraction"] contraction = not arguments["--no-contraction"]
e = EMSL_dump(db_path=db_path, e = EMSL_dump(db_path=db_path,
format=format_dict[format], format=format,
contraction=contraction) contraction=contraction)
e.new_db() e.new_db()

View File

@ -4,6 +4,9 @@ import re
import time import time
import sqlite3 import sqlite3
from collections import OrderedDict
from src.parser import format_dict
def install_with_pip(name): def install_with_pip(name):
@ -35,26 +38,24 @@ class EMSL_dump:
This call implement all you need for download the EMSL and save it localy This call implement all you need for download the EMSL and save it localy
""" """
format_dict = {"g94": "Gaussian94",
"gamess-us": "GAMESS-US",
"gamess-uk": "GAMESS-UK",
"turbomole": "Turbomole",
"tx93": "TX93",
"molpro": "Molpro",
"molproint": "MolproInt",
"hondo": "Hondo",
"supermolecule": "SuperMolecule",
"molcas": "Molcas",
"hyperchem": "HyperChem",
"dalton": "Dalton",
"demon-ks": "deMon-KS",
"demon2k": "deMon2k",
"aces2": "AcesII"
}
def __init__(self, db_path=None, format="GAMESS-US", contraction="True"): def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
self.db_path = db_path self.db_path = db_path
self.format = format
if format not in format_dict:
print >> sys.stderr, "Format {0} doesn't exist. Choose in:".format(format)
print >> sys.stderr, format_dict.keys()
sys.exit(1)
else:
self.format = format
if format_dict[self.format]:
self.parser = format_dict[self.format]
else:
print >> sys.stderr, "We have no parser for this format"
print >> sys.stderr, "Fill free to Fock /pull request"
print >> sys.stderr, "You just need to add a function like"
print >> sys.stderr, "'parse_basis_data_gamess_us' to parse you'r format"
self.contraction = str(contraction) self.contraction = str(contraction)
self.debug = True self.debug = True
@ -68,25 +69,12 @@ class EMSL_dump:
def get_list_format(self): def get_list_format(self):
"""List all the format available in EMSL""" """List all the format available in EMSL"""
return self.format_dict return format_dict
def set_db_path(self, path): def set_db_path(self, path):
"""Define the database path""" """Define the database path"""
self.db_path = path self.db_path = path
def get_dict_ele(self):
"""Return dict[atom]=[abreviation]"""
elt_path = os.path.dirname(sys.argv[0]) + "/src/elts_abrev.dat"
with open(elt_path, "r") as f:
data = f.readlines()
dict_ele = dict()
for i in data:
l = i.split("-")
dict_ele[l[1].strip().lower()] = l[2].strip().lower()
return dict_ele
def dwl_basis_list_raw(self): def dwl_basis_list_raw(self):
"""Return the source code of the iframe """Return the source code of the iframe
who contains the list of the basis set available""" who contains the list of the basis set available"""
@ -136,7 +124,8 @@ class EMSL_dump:
9 - name of contributor 9 - name of contributor
10 - human-readable summary/description of basis set 10 - human-readable summary/description of basis set
""" """
d = {}
d = OrderedDict()
for line in data_raw.split('\n'): for line in data_raw.split('\n'):
@ -148,69 +137,40 @@ class EMSL_dump:
tup = eval(s) tup = eval(s)
# non-published (e.g. rejected) basis sets should be ignored xml_path = tup[0]
if tup[4] != "published":
# non-published (e.g. rejected) basis sets and ecp should be
# ignored
if tup[4] != "published" or "-ecp" in xml_path.lower():
continue continue
xml_path = tup[0]
name = tup[1] name = tup[1]
elts = re.sub('[["\ \]]', '', tup[3]).split(',') elts = re.sub('[["\ \]]', '', tup[3]).split(',')
des = re.sub('\s+', ' ', tup[-1]) des = re.sub('\s+', ' ', tup[-1])
if "-ecp" in xml_path.lower():
continue
d[name] = [name, xml_path, des, elts] d[name] = [name, xml_path, des, elts]
return d return d
def parse_basis_data_gamess_us(self, data, name, des, elts):
"""Parse the basis data raw html of gamess-us to get a nice tuple
Return [name, description, [[ele, data_ele],...]]"""
basis_data = []
b = data.find("$DATA")
e = data.find("$END")
if (b == -1 or data.find("$DATA$END") != -1):
if self.debug:
print data
raise Exception("WARNING not DATA")
else:
dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
"D+": "E+",
"D-": "E-"}
for k, v in dict_replace.iteritems():
data = data.replace(k, v)
data = data[b + 5:e - 1].split('\n\n')
dict_ele = self.get_dict_ele()
for (elt, data_elt) in zip(elts, data):
elt_long_th = dict_ele[elt.lower()]
elt_long_exp = data_elt.split()[0].lower()
if "$" in data_elt:
if self.debug:
print "Eror",
raise Exception("WARNING bad split")
if elt_long_th == elt_long_exp:
basis_data.append([elt, data_elt.strip()])
else:
if self.debug:
print "th", elt_long_th
print "exp", elt_long_exp
print "abv", elt
raise Exception("WARNING not a good ELEMENT")
return [name, des, basis_data]
# _____ _
# / __ \ | |
# | / \/_ __ ___ __ _| |_ ___
# | | | '__/ _ \/ _` | __/ _ \
# | \__/\ | | __/ (_| | || __/
# \____/_| \___|\__,_|\__\___|
#
def create_sql(self, dict_basis_list): def create_sql(self, dict_basis_list):
"""Create the sql from the list of basis available data""" """Create the sql from strach.
Take the list of basis available data,
download her, put her in sql"""
if os.path.isfile(self.db_path):
print >> sys.stderr, "FAILLURE:"
print >> sys.stderr, "{0} file alredy exist.".format(self.db_path),
print >> sys.stderr, "Delete or remove it"
sys.exit(1)
conn = sqlite3.connect(self.db_path) conn = sqlite3.connect(self.db_path)
c = conn.cursor() c = conn.cursor()
@ -240,8 +200,6 @@ class EMSL_dump:
NATURAL JOIN data_tab NATURAL JOIN data_tab
''') ''')
conn.commit()
import Queue import Queue
import threading import threading
@ -269,12 +227,11 @@ class EMSL_dump:
while attemps < attemps_max: while attemps < attemps_max:
text = self.requests.get(url, params=params).text text = self.requests.get(url, params=params).text
try: try:
basis_data = self.parse_basis_data_gamess_us( basis_data = self.parser(text, name, des, elts,
text, self.debug)
name,
des,
elts)
except: except:
if self.debug:
raise
time.sleep(0.1) time.sleep(0.1)
attemps += 1 attemps += 1
else: else:
@ -310,24 +267,33 @@ class EMSL_dump:
name, des, basis_data = q_out.get() name, des, basis_data = q_out.get()
q_out.task_done() q_out.task_done()
str_indice = '{:>3}'.format(i + 1)
str_ = '{0} / {1} | {2}'.format(str_indice, nb_basis, name)
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
# A d d _ t h e _ b a s i s _ n a m e #
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
try: try:
cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)" cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
c.execute(cmd, [name, des]) c.execute(cmd, [name, des])
conn.commit() conn.commit()
except sqlite3.IntegrityError: except sqlite3.IntegrityError:
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail" print str_, "Fail"
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
# A d d _ t h e _ b a s i s _ d a t a #
# ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
id_ = [c.lastrowid] id_ = [c.lastrowid]
try: try:
cmd = "INSERT INTO data_tab VALUES (?,?,?)" cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
c.executemany(cmd, [id_ + k for k in basis_data]) c.executemany(cmd, [id_ + k for k in basis_data])
conn.commit() conn.commit()
print '{:>3}'.format(i + 1), "/", nb_basis, name except sqlite3.IntegrityError:
print str_, "Fail"
except: else:
print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail" print str_
raise
conn.close() conn.close()
q_in.join() q_in.join()

92
src/parser.py Normal file
View File

@ -0,0 +1,92 @@
import sys
def get_dict_ele():
    """Load the element table stored in ``src/elts_abrev.dat``.

    Every non-blank line of the file is split on "-". The second field
    becomes the key and the third field the value, both stripped and
    lower-cased. parse_basis_data_gamess_us looks entries up by element
    symbol and compares the value with the element name found in the basis
    data, so lines presumably look like "<number> - <symbol> - <name>"
    (TODO confirm against the data file).

    The file is looked up in the ``src`` directory located next to the
    running script (``sys.argv[0]``).

    Returns:
        dict mapping the lower-cased second field to the lower-cased third
        field of each line.

    Raises:
        IOError: if the data file cannot be opened.
        IndexError: if a non-blank line has fewer than three "-" separated
            fields.
    """
    # os.path.join, unlike plain string concatenation, keeps the path
    # relative when dirname() returns "" (script started without a directory
    # component, e.g. `python script.py`). Concatenating "" + "/src/..."
    # would point at the filesystem root instead.
    elt_path = os.path.join(os.path.dirname(sys.argv[0]),
                            "src", "elts_abrev.dat")

    dict_ele = dict()
    with open(elt_path, "r") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split("-")
            dict_ele[fields[1].strip().lower()] = fields[2].strip().lower()

    return dict_ele
# ______
# | ___ \
# | |_/ /_ _ _ __ ___ ___ _ __
# | __/ _` | '__/ __|/ _ \ '__|
# | | | (_| | | \__ \ __/ |
# \_| \__,_|_| |___/\___|_|
#
# __
# /__ _. ._ _ _ _ _ _
# \_| (_| | | | (/_ _> _> |_| _>
#
def parse_basis_data_gamess_us(data, name, des, elts, debug=False,
                               dict_ele=None):
    """Parse the raw GAMESS-US text of one basis set.

    The text between the first "$DATA" and the first "$END" marker is split
    on blank lines into one chunk per element. Each chunk is paired, in
    order, with the corresponding entry of ``elts`` and checked to start
    with that element's full name.

    Args:
        data: raw text downloaded for the basis set.
        name: basis set name, returned unchanged.
        des: basis set description, returned unchanged.
        elts: element symbols, in the order the chunks appear in ``data``.
        debug: when true, print diagnostics before raising.
        dict_ele: optional mapping from lower-cased element symbol to
            lower-cased element name. When omitted it is loaded with
            get_dict_ele(); passing it in avoids re-reading the table for
            every basis set.

    Returns:
        [name, des, [[elt, chunk], ...]] where each chunk is the stripped
        text describing that element.

    Raises:
        Exception: "WARNING not DATA" when the $DATA/$END section is
            missing or empty, "WARNING bad split" when a chunk still
            contains a "$", "WARNING not a good ELEMENT" when a chunk does
            not start with the expected element name.
        KeyError: if an element symbol is absent from ``dict_ele``.
        IndexError: if a chunk contains no text at all.
    """
    # A missing "$END" is rejected too: str.find() would return -1 and the
    # slice below would then silently drop the end of the data.
    if ("$DATA" not in data or "$END" not in data
            or "$DATA$END" in data):
        if debug:
            print(data)
        raise Exception("WARNING not DATA")

    # Normalise the text BEFORE locating the markers: the PHOSPHOROUS
    # replacement shortens the string, so offsets computed on the original
    # text would no longer point at the markers.
    replacements = (("PHOSPHOROUS", "PHOSPHORUS"),
                    ("D+", "E+"),
                    ("D-", "E-"))
    for old, new in replacements:
        data = data.replace(old, new)

    b = data.find("$DATA")
    e = data.find("$END")

    if dict_ele is None:
        dict_ele = get_dict_ele()

    basis_data = []
    # b + 5 skips the "$DATA" marker itself; e - 1 drops the character
    # right before "$END".
    chunks = data[b + 5:e - 1].split('\n\n')

    # zip pairs symbols and chunks positionally and stops at the shorter
    # of the two sequences.
    for (elt, data_elt) in zip(elts, chunks):

        elt_long_th = dict_ele[elt.lower()]
        elt_long_exp = data_elt.split()[0].lower()

        if "$" in data_elt:
            if debug:
                sys.stdout.write("Eror ")
            raise Exception("WARNING bad split")

        if elt_long_th == elt_long_exp:
            basis_data.append([elt, data_elt.strip()])
        else:
            if debug:
                print("th %s" % elt_long_th)
                print("exp %s" % elt_long_exp)
                print("abv %s" % elt)
            raise Exception("WARNING not a good ELEMENT")

    return [name, des, basis_data]
import os
# Registry of the known basis-set output formats.
# Key: the format name; value: the function used to parse data downloaded
# in that format, or None when no parser has been written for it yet.
# A parser is called as parser(data, name, des, elts, debug) -- see
# parse_basis_data_gamess_us for the expected contract.
format_dict = {"Gaussian94": None,
"GAMESS-US": parse_basis_data_gamess_us,
"GAMESS-UK": None,
"Turbomole": None,
"TX93": None,
"Molpro": None,
"MolproInt": None,
"Hondo": None,
"SuperMolecule": None,
"Molcas": None,
"HyperChem": None,
"Dalton": None,
"deMon-KS": None,
"deMon2k": None,
"AcesII": None,
}