From 0af5cb6fa1767a03512c70e0fa7aa65b064075a7 Mon Sep 17 00:00:00 2001
From: Thomas Applencourt
Date: Tue, 17 Mar 2015 15:03:38 +0100
Subject: [PATCH] Add gaussian 94 support

---
Review notes:
 * This patch was reflowed from a copy whose line breaks had been collapsed.
   Whitespace in context lines (indentation, blank lines, ASCII-art banners)
   is a best guess and the index hashes predate the fixes below, so
   regenerate the patch with git format-patch before applying it.
 * EMSL_local.py: the value bound to the single "?" placeholder is wrapped
   in a 1-tuple. Assuming self.c is a DB-API (sqlite3) cursor, a bare string
   would be read as one parameter per character.
 * parser.py: parse_basis_data_gaussian94 no longer cuts the last four
   characters off the final block, rejects input holding a single ****
   marker, and reads the element symbol as the first token of a block.

 src/EMSL_local.py |  4 +--
 src/parser.py     | 96 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/src/EMSL_local.py b/src/EMSL_local.py
index 64ea8e2..1afeb09 100755
--- a/src/EMSL_local.py
+++ b/src/EMSL_local.py
@@ -229,13 +229,13 @@ class EMSL_local:
 
         str_ = """SELECT DISTINCT elt
                   FROM output_tab
-                  WHERE name=:name_us COLLATE NOCASE"""
+                  WHERE name=(?) COLLATE NOCASE"""
 
         # ~#~#~#~#~ #
         # F e t c h #
         # ~#~#~#~#~ #
 
-        self.c.execute(str_, {"name_us": basis_name})
+        self.c.execute(str_, (basis_name,))
 
         # ~#~#~#~#~#~ #
         # R e t u r n #
diff --git a/src/parser.py b/src/parser.py
index 486e948..ec9d878 100644
--- a/src/parser.py
+++ b/src/parser.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import re
 
 
 def get_dict_ele():
@@ -74,8 +75,6 @@ def parse_basis_data_gamess_us(data, name, des, elts, debug=False):
     return [name, des, basis_data]
 
 
-import re
-
 symmetry_regex = re.compile(ur'^(\w)\s+\d+\b')
 
 
@@ -158,6 +157,97 @@ def handle_l_gamess_us(l_atom_basis):
     return l_data
 
 
+
+#  __                          _
+# /__  _.      _  _ o  _. ._  (_| |_|_
+# \_| (_| |_| _> _> | (_| | |   |   |
+#
+def parse_basis_data_gaussian94(data, name, description, elements, debug=True):
+    """Parse the raw Gaussian94 basis data from BSE into per-element pairs.
+
+    Element blocks in the raw text are delimited by lines starting with
+    ****. The text between two consecutive markers is the basis of one
+    element and its first line starts with the element symbol.
+
+    Each item of the returned data-pairs is a 2 item list:
+        [symbol, data]
+
+    e.g. ["Ca", "Ca     0 ..."]
+
+    N.B.: Currently ignores ECP data!
+
+    @param data: raw HTML from BSE
+    @type data : unicode
+    @param name: basis set name
+    @type name : str
+    @param description: basis set description
+    @type description : str
+    @param elements: element symbols e.g. ['H', 'C', 'N', 'O', 'Cl']
+    @type elements : list
+    @param debug: print the raw data on failure and warn about symbols
+                  that are unexpected or were already processed
+    @type debug : bool
+    @return: [name, description, data-pairs]
+    @rtype : list
+    @raise ValueError: if data holds fewer than two **** markers
+    """
+
+    # Each basis set block starts and ends with ****. Find the region
+    # containing all the basis blocks using the first and last ****.
+    mark = "****"
+    begin = data.find(mark)
+    end = data.rfind(mark)
+
+    # begin == end means there is a single marker, which encloses nothing
+    if begin == -1 or begin == end:
+        if debug:
+            print(data)
+        str_ = " No basis set data found while attempting to process {0} ({1})"
+        raise ValueError(str_.format(name, description))
+
+    # end is the index of the last marker itself, so slicing up to it keeps
+    # the last block complete
+    trimmed = data[begin + len(mark):end]
+
+    # group lines of data delimited by mark into per-element chunks;
+    # the marker lines only separate the chunks and are not kept
+    chunks = []
+    lines = []
+    for line in trimmed.split("\n"):
+        if line.lstrip().startswith(mark):
+            chunks.append(lines)
+            lines = []
+        else:
+            lines.append(line)
+    chunks.append(lines)
+
+    # join lines back into solid text blocks and drop the empty ones
+    chunks = ["\n".join(c).strip() for c in chunks]
+    chunks = [c for c in chunks if c]
+
+    # check each block for element and assign symbols to final pairs
+    pairs = []
+    unused_elements = set([e.upper() for e in elements])
+    for chunk in chunks:
+        # the symbol is the first token of the first line in the block
+        symbol = chunk.split(None, 1)[0]
+        try:
+            unused_elements.remove(symbol.upper())
+        except KeyError:
+            if debug:
+                msg = "Warning: already processed {0}\n".format(symbol)
+                sys.stderr.write(msg)
+        pairs.append([symbol, chunk])
+
+    if unused_elements:
+        msg = "Warning: elements {0} left over for {1}".format(
+            list(unused_elements),
+            name)
+        print(msg)
+
+    return [name, description, pairs]
+
+
 # ______ _ _ _ _
 # | ___| | | | (_) | |
 # | |_ _ __ ___ _ __ ___ __ _| |_ __| |_ ___| |_
@@ -166,7 +256,7 @@ def handle_l_gamess_us(l_atom_basis):
 # \_| |_| \___/|_| |_| |_|\__,_|\__| \__,_|_|\___|\__|
 #
 
-format_dict = {"Gaussian94": None,
+format_dict = {"Gaussian94": parse_basis_data_gaussian94,
                "GAMESS-US": parse_basis_data_gamess_us,
                "GAMESS-UK": None,
                "Turbomole": None,