diff --git a/EMSL_api.py b/EMSL_api.py index 74afdb0..243d2e0 100755 --- a/EMSL_api.py +++ b/EMSL_api.py @@ -4,19 +4,22 @@ """EMSL Api. Usage: - EMSL_api.py list_basis [--atom=atom_name...] - [--db_path=db_path] - EMSL_api.py list_atoms --basis=basis_name - [--db_path=db_path] - EMSL_api.py get_basis_data --basis=basis_name - [--atom=atom_name...] - [--db_path=db_path] - [--with_l] - [(--save [--path=path])] + EMSL_api.py list_basis [--basis=...] + [--atom=...] + [--db_path=] + [--average_mo_number] + EMSL_api.py list_atoms --basis= + [--db_path=] + EMSL_api.py get_basis_data --basis= + [--atom=...] + [--db_path=] + [(--save [--path=])] + [--check=] + [--treat_l] EMSL_api.py list_formats - EMSL_api.py create_db --db_path=db_path - --format=format - [--no-contraction] + EMSL_api.py create_db --format= + [--db_path=] + [--no-contraction] EMSL_api.py (-h | --help) EMSL_api.py --version @@ -27,75 +30,100 @@ Options: is the path to the SQLite3 file containing the Basis sets. By default is $EMSL_API_ROOT/db/Gausian_uk.db + +Example of use: + ./EMSL_api.py list_basis --atom Al --atom U + ./EMSL_api.py list_basis --atom S --basis 'cc-pV*' --average_mo_number + ./EMSL_api.py list_atoms --basis ANO-RCC + ./EMSL_api.py get_basis_data --basis 3-21++G* """ -version = "0.2.0" +version = "0.8.1" -import sys +import os -from src.docopt import docopt -from src.EMSL_utility import EMSL_dump -from src.EMSL_utility import format_dict -from src.EMSL_utility import EMSL_local +from src.misc.docopt import docopt +from src.EMSL_dump import EMSL_dump +from src.EMSL_local import EMSL_local, checkSQLite3 if __name__ == '__main__': arguments = docopt(__doc__, version='EMSL Api ' + version) + # ___ + # | ._ o _|_ + # _|_ | | | |_ + # + if arguments["--db_path"]: db_path = arguments["--db_path"] else: - import os - db_path = os.path.dirname(__file__) + "/db/Gamess-us.db" + db_path = os.path.dirname(__file__) + "/db/GAMESS-US.db" + + # Check the db + try: + if not(arguments['create_db']): + 
db_path, db_path_changed = checkSQLite3(db_path) + except: + raise + + # _ _ _ ______ _ + # | | (_) | | | ___ \ (_) + # | | _ ___| |_ | |_/ / __ _ ___ _ ___ + # | | | / __| __| | ___ \/ _` / __| / __| + # | |___| \__ \ |_ | |_/ / (_| \__ \ \__ \ + # \_____/_|___/\__| \____/ \__,_|___/_|___/ - # _ _ _ ______ _ - #| | (_) | | | ___ \ (_) - #| | _ ___| |_ | |_/ / __ _ ___ _ ___ - #| | | / __| __| | ___ \/ _` / __| / __| - #| |___| \__ \ |_ | |_/ / (_| \__ \ \__ \ - #\_____/_|___/\__| \____/ \__,_|___/_|___/ - # if arguments["list_basis"]: e = EMSL_local(db_path=db_path) - elts = arguments["--atom"] - l = e.get_list_basis_available(elts) + l = e.list_basis_available(arguments["--atom"], + arguments["--basis"], + arguments["--average_mo_number"]) - for name, des in l: - print name, "|", des + if arguments["--average_mo_number"]: + for name, des, avg in l: + print "- '{}' ({}) || {:<50}".format(name, avg, des) + else: + for name, des in l: + print "- '{}' || {:<50}".format(name, des) - # _ _ _ _____ _ _ - #| | (_) | | | ___| | | | - #| | _ ___| |_ | |__ | | ___ _ __ ___ ___ _ __ | |_ ___ - #| | | / __| __| | __|| |/ _ \ '_ ` _ \ / _ \ '_ \| __/ __| - #| |___| \__ \ |_ | |___| | __/ | | | | | __/ | | | |_\__ \ - #\_____/_|___/\__| \____/|_|\___|_| |_| |_|\___|_| |_|\__|___/ - if arguments["list_atoms"]: + # _ _ _ _____ _ _ + # | | (_) | | | ___| | | | + # | | _ ___| |_ | |__ | | ___ _ __ ___ ___ _ __ | |_ ___ + # | | | / __| __| | __|| |/ _ \ '_ ` _ \ / _ \ '_ \| __/ __| + # | |___| \__ \ |_ | |___| | __/ | | | | | __/ | | | |_\__ \ + # \_____/_|___/\__| \____/|_|\___|_| |_| |_|\___|_| |_|\__|___/ + elif arguments["list_atoms"]: e = EMSL_local(db_path=db_path) basis_name = arguments["--basis"] l = e.get_list_element_available(basis_name) print ", ".join(l) - #______ _ _ _ - #| ___ \ (_) | | | | - #| |_/ / __ _ ___ _ ___ __| | __ _| |_ __ _ - #| ___ \/ _` / __| / __| / _` |/ _` | __/ _` | - #| |_/ / (_| \__ \ \__ \ | (_| | (_| | || (_| | - #\____/ \__,_|___/_|___/ 
\__,_|\__,_|\__\__,_| - if arguments["get_basis_data"]: + # ______ _ _ _ + # | ___ \ (_) | | | | + # | |_/ / __ _ ___ _ ___ __| | __ _| |_ __ _ + # | ___ \/ _` / __| / __| / _` |/ _` | __/ _` | + # | |_/ / (_| \__ \ \__ \ | (_| | (_| | || (_| | + # \____/ \__,_|___/_|___/ \__,_|\__,_|\__\__,_| + elif arguments["get_basis_data"]: e = EMSL_local(db_path=db_path) - basis_name = arguments["--basis"] + basis_name = arguments["--basis"][0] elts = arguments["--atom"] - l = e.get_basis(basis_name, elts,arguments["--with_l"]) - str_ = "\n\n".join(l) + "\n" + l_atom_basis = e.get_basis(basis_name, elts, + arguments["--treat_l"], + arguments["--check"]) + # Add separation between atoms, and a empty last line + str_ = "\n\n".join(l_atom_basis) + "\n" if arguments["--save"]: if arguments["--path"]: path = arguments["--path"] else: + # The defaut path is bais path = "_".join([basis_name, ".".join(elts)]) path = "/tmp/" + path + ".bs" @@ -105,32 +133,39 @@ if __name__ == '__main__': else: print str_ - # _ _ _ __ _ - #| | (_) | | / _| | | - #| | _ ___| |_ | |_ ___ _ __ _ __ ___ __ _| |_ ___ - #| | | / __| __| | _/ _ \| '__| '_ ` _ \ / _` | __/ __| - #| |___| \__ \ |_ | || (_) | | | | | | | | (_| | |_\__ \ - #\_____/_|___/\__| |_| \___/|_| |_| |_| |_|\__,_|\__|___/ - if arguments["list_formats"]: - for i in format_dict: + # _ _ _ __ _ + # | | (_) | | / _| | | + # | | _ ___| |_ | |_ ___ _ __ _ __ ___ __ _| |_ ___ + # | | | / __| __| | _/ _ \| '__| '_ ` _ \ / _` | __/ __| + # | |___| \__ \ |_ | || (_) | | | | | | | | (_| | |_\__ \ + # \_____/_|___/\__| |_| \___/|_| |_| |_| |_|\__,_|\__|___/ + elif arguments["list_formats"]: + e = EMSL_dump() + for i in e.get_list_format(): print i - # _____ _ _ _ - #/ __ \ | | | | | - #| / \/_ __ ___ __ _| |_ ___ __| | |__ - #| | | '__/ _ \/ _` | __/ _ \ / _` | '_ \ - #| \__/\ | | __/ (_| | || __/ | (_| | |_) | - # \____/_| \___|\__,_|\__\___| \__,_|_.__/ - if arguments["create_db"]: + # _____ _ _ _ + # / __ \ | | | | | + # | / \/_ __ ___ __ _| |_ 
___ __| | |__ + # | | | '__/ _ \/ _` | __/ _ \ / _` | '_ \ + # | \__/\ | | __/ (_| | || __/ | (_| | |_) | + # \____/_| \___|\__,_|\__\___| \__,_|_.__/ + elif arguments["create_db"]: db_path = arguments["--db_path"] format = arguments["--format"] - if format not in format_dict: - print "Format %s doesn't exist. Run list_formats to get the list of formats." % (format) - sys.exit(1) + contraction = not arguments["--no-contraction"] - e = EMSL_dump( - db_path=db_path, - format=format_dict[format], - contraction=contraction) + e = EMSL_dump(db_path=db_path, + format=format, + contraction=contraction) e.new_db() + + # _ + # / | _ _. ._ o ._ _ + # \_ | (/_ (_| | | | | | (_| + # _| + + # Clean up on exit + if not(arguments['create_db']) and db_path_changed: + os.system("rm -f /dev/shm/%d.db" % (os.getpid())) diff --git a/README.md b/README.md index 8dec301..f273905 100644 --- a/README.md +++ b/README.md @@ -4,28 +4,34 @@ EMSL_Basis_Set_Exchange_Local Create of Local Copy of the famous [EMSL Basis Set Exchange](https://bse.pnl.gov/bse/portal) and use it easily with the API. -* Make a slight copy (40Mo Sqlite3 database) of the EMSL Basis Set Exchange website (One database for all the basis set of one format); +* Make a slight copy (40Mo Sqlite3 database) of the EMSL Basis Set Exchange website. Currently avalaible format are : + * Gamess-us, Gaussian94 and NEWCHEM; * API for scripting; * Quick local access without delay; -* Only need [Python](https://www.python.org/) and [Request](http://docs.python-requests.org/en/latest/) module. +* Only need [Python](https://www.python.org/) ##Dependencies * Python >2.6 -* Request ```pip install requests``` (in a virtual env or with sudo) + +###### Optional +If you plan to download manually some database -not using the pre existing one- you need : +* [Request](http://docs.python-requests.org/en/latest/) python module. 
```$pip install requests``` (do it in a virtual env or with sudo) ##Installation -* Download the git (```$ git clone https://github.com/TApplencourt/EMSL_Basis_Set_Exchange_Local.git``` for example) -* Done ! You can now, use ```EMSL_api.py``` +* Download the git repository (```$git clone https://github.com/TApplencourt/EMSL_Basis_Set_Exchange_Local.git``` for example) +* That's all! You can now use ```EMSL_api.py``` ##Usage ``` EMSL Api. Usage: - EMSL_api.py list_basis [--atom=...] - [--db_path=] + EMSL_api.py list_basis [--basis=...] + [--atom=...] + [--db_path=] + [--average_mo_number] EMSL_api.py list_atoms --basis= - [--db_path=] + [--db_path=] EMSL_api.py get_basis_data --basis= [--atom=...] [--db_path=] @@ -45,6 +51,12 @@ Options: is the path to the SQLite3 file containing the Basis sets. By default is $EMSL_API_ROOT/db/Gausian_uk.db + +Example of use: + ./EMSL_api.py list_basis --atom Al --atom U + ./EMSL_api.py list_basis --atom S --basis 'cc-pV*' --average_mo_number + ./EMSL_api.py list_atoms --basis ANO-RCC + ./EMSL_api.py get_basis_data --basis 3-21++G* ``` ##Demonstration @@ -53,14 +65,47 @@ By default is $EMSL_API_ROOT/db/Gausian_uk.db (For a beter quality see the [Source](https://asciinema.org/api/asciicasts/15380)) ##To do -For now we can only parse Gaussian-US basis set type file. (Look at ```./src/EMSL_utility.py#EMSL_dump.basis_data_row_to_array```) +For now we can only parse `Gamess-us, Gaussian94 and NEWCHEM` (Thanks to @mattbernst for Gaussian94 and NEWCHEM) basis set type file. + +###I need more formats! + +It's really simple. Just read the short explanation below. + +You just need to provide a function that splits the basis data, which contains all the atoms, into per-atom tuples. + +Something like this: +```python +def parse_basis_data_gaussian94(data, name, description, elements, debug=True): + """Parse the Gaussian94 basis data raw html to get a nice tuple. + + The data-pairs item is actually expected to be a 2 item list: + [symbol, data] + + e.g.
["Ca", "#BASIS SET..."] + + N.B.: Currently ignores ECP data! + + @param data: raw HTML from BSE + @type data : unicode + @param name: basis set name + @type name : str + @param des: basis set description + @type des : str + @param elements: element symbols e.g. ['H', 'C', 'N', 'O', 'Cl'] + @type elements : list + @return: (name, description, data-pairs) + @rtype : tuple + """ +``` + +Then just add the function in `src.parser_handler.format_dict`. You are ready to go! Feel free to fork/pull request. ##Disclaimer It'is not a official API. Use it with moderation. -In papers where you use the basis sets obtained from the Basis Set Exchange please site this : +In papers where you use the basis sets obtained from the Basis Set Exchange please site this: >The Role of Databases in Support of Computational Chemistry Calculations > >>--Feller, D.; J. Comp. Chem., 17(13), 1571-1586, 1996. diff --git a/db/Gamess-us.db b/db/GAMESS-US.db similarity index 92% rename from db/Gamess-us.db rename to db/GAMESS-US.db index fe60097..50aa1e5 100644 Binary files a/db/Gamess-us.db and b/db/GAMESS-US.db differ diff --git a/src/EMSL_dump.py b/src/EMSL_dump.py new file mode 100644 index 0000000..ff69a68 --- /dev/null +++ b/src/EMSL_dump.py @@ -0,0 +1,303 @@ +import os +import sys +import re +import time +import sqlite3 + +from collections import OrderedDict + + +def install_with_pip(name): + + ins = False + d = {'y': True, + 'n': False} + + while True: + choice = raw_input('Do you want to install it ? 
def install_with_pip(name):
    """Ask the user whether to install the package *name* with pip, then do it.

    The prompt is repeated until the answer is 'y' or 'n' (any case).
    On 'y', ``pip install <name>`` is run in-process; the program exits with
    status 1 when pip cannot be imported or the installation raises.
    """
    answers = {'y': True,
               'n': False}

    while True:
        choice = raw_input('Do you want to install it ? [Y/N]')
        try:
            ins = answers[choice.lower()]
            break
        except KeyError:
            # Anything but y/n: ask again
            print("not a valid choice")

    if ins:
        try:
            import pip
            pip.main(['install', name])
        except Exception:
            print("You need pip")
            print("(http://pip.readthedocs.org/en/latest/installing.html)")
            sys.exit(1)


class EMSL_dump:

    """
    Everything needed to download the EMSL Basis Set Exchange
    and save it locally in a SQLite3 database.
    """

    def __init__(self, db_path=None, format="GAMESS-US", contraction="True"):
        """Select the parser, the database path and load ``requests``.

        db_path     -- SQLite3 file to create. When falsy, defaults to
                       ``<directory of this module>/../db/<format>.db``.
        format      -- format name, validated by
                       ``src.parser_handler.check_format``.
        contraction -- stored as ``str(contraction)`` and sent as the
                       ``minimize`` request parameter by ``create_sql``.

        Raises ImportError when ``requests`` is missing and was not installed.
        """

        from src.parser_handler import get_parser_function
        from src.parser_handler import check_format

        self.format = check_format(format)
        self.parser = get_parser_function(self.format)

        if db_path:
            self.db_path = db_path
        else:
            # abspath: dirname(__file__) is '' when the module is loaded from
            # the current directory, which would give the path '/../db/...'
            head_path = os.path.dirname(os.path.abspath(__file__))
            self.db_path = "{0}/../db/{1}.db".format(head_path, self.format)

        self.contraction = str(contraction)
        self.debug = False

        try:
            import requests
        except ImportError:
            print("You need the requests package")
            install_with_pip("requests")
            # Import again: the name is only bound if the install succeeded.
            # If the user declined, this raises a plain ImportError.
            import requests

        self.requests = requests

    def get_list_format(self):
        """List all the format available in EMSL"""
        from src.parser_handler import parser_dict
        return parser_dict.keys()

    def set_db_path(self, path):
        """Define the database path"""
        self.db_path = path

    def dwl_basis_list_raw(self):
        """Return the source code of the iframe
        which contains the list of the basis sets available.

        When ``self.debug`` is true the page is cached (pickled) in
        ``db/cache`` and read back from there on the next call.
        """

        sys.stdout.write("Download all the name available in EMSL.\n")
        sys.stdout.write("It can take some time. ")
        sys.stdout.flush()

        url = "https://bse.pnl.gov/bse/portal/user/anon/js_peid/11535052407933/panel/Main/template/content"
        if self.debug:
            try:
                import cPickle as pickle
            except ImportError:
                import pickle

            dbcache = 'db/cache'
            if not os.path.isfile(dbcache):
                page = self.requests.get(url).text
                # 'with' closes the cache file even if pickling fails
                with open(dbcache, 'wb') as cache:
                    pickle.dump(page, cache)
            else:
                with open(dbcache, 'rb') as cache:
                    page = pickle.load(cache)
        else:
            page = self.requests.get(url).text

        sys.stdout.write("Done\n")
        return page

    def basis_list_raw_to_array(self, data_raw):
        """Parse the raw html basis set list to create a dict
        with all the information for downloading the database :
        Return d[name] = [name, xml_path, description,
                          list of the elements available]

        Explanation of tuple data from 'tup' by index:

        0 - path to xml file
        1 - basis set name
        2 - categorization: "dftcfit", "dftorb", "dftxfit", "diffuse",
            "ecporb","effective core potential", "orbital", "polarization",
            "rydberg", or "tight"
        3 - parameterized elements by symbol e.g. '[H, He, B, C, N, O, F, Ne]'
        4 - curation status; only 'published' is trustworthy
        5 - boolean: has ECP
        6 - boolean: has spin
        7 - last modified date
        8 - name of primary developer
        9 - name of contributor
        10 - human-readable summary/description of basis set
        """

        d = OrderedDict()

        for line in data_raw.split('\n'):

            if "new basisSet(" not in line:
                continue

            b = line.find("(")
            e = line.find(");")

            # NOTE(security): eval() runs text downloaded from the remote
            # server as Python code. Kept as-is to preserve behaviour;
            # ast.literal_eval should be considered if the argument list
            # only ever contains literals -- TODO confirm with real data.
            tup = eval(line[b + 1:e])

            xml_path = tup[0]

            # non-published (e.g. rejected) basis sets and ecp should be
            # ignored
            if tup[4] != "published" or "-ecp" in xml_path.lower():
                continue

            name = tup[1]
            # Drop brackets, double quotes and spaces: '[H, He]' -> 'H,He'
            elts = re.sub(r'[\[\]" ]', '', tup[3]).split(',')
            des = re.sub(r'\s+', ' ', tup[-1])

            d[name] = [name, xml_path, des, elts]

        return d

    #  _____                _
    # /  __ \              | |
    # | /  \/_ __ ___  __ _| |_ ___
    # | |   | '__/ _ \/ _` | __/ _ \
    # | \__/\ | |  __/ (_| | ||  __/
    #  \____/_|  \___|\__,_|\__\___|
    #
    def create_sql(self, dict_basis_list):
        """Create the sql database from scratch.

        dict_basis_list -- mapping whose values are
                           [name, xml_path, description, elements], as
                           returned by ``basis_list_raw_to_array``.

        Every basis set is downloaded and parsed by a pool of worker threads
        and inserted in the database by the calling thread. Exits with
        status 1 when ``self.db_path`` already exists.
        """

        if os.path.isfile(self.db_path):
            sys.stderr.write("FAILLURE:\n")
            sys.stderr.write(
                "{0} file alredy exist. Delete or remove it\n".format(
                    self.db_path))
            sys.exit(1)

        import threading
        try:
            import Queue as queue
        except ImportError:
            import queue

        conn = sqlite3.connect(self.db_path)
        c = conn.cursor()

        c.execute('''CREATE TABLE basis_tab(
                            basis_id INTEGER PRIMARY KEY AUTOINCREMENT,
                                name text,
                         description text,
                                UNIQUE(name)
                  );''')

        c.execute('''CREATE TABLE data_tab(
                           basis_id INTEGER,
                                elt TEXT,
                               data TEXT,
                    FOREIGN KEY(basis_id)
                    REFERENCES basis_tab(basis_id)
                    );''')

        c.execute('''CREATE TABLE format_tab(format TEXT)''')
        c.execute('''INSERT INTO format_tab VALUES (?)''', [self.format])
        conn.commit()

        c.execute(''' CREATE VIEW output_tab AS
                        SELECT basis_id,
                               name,
                               description,
                               elt,
                               data
                        FROM   basis_tab
                NATURAL JOIN   data_tab
                    ''')

        num_worker_threads = 7
        attemps_max = 20

        q_in = queue.Queue(num_worker_threads)
        q_out = queue.Queue(num_worker_threads)

        url = "https://bse.pnl.gov:443/bse/portal/user/anon/js_peid/11535052407933/action/portlets.BasisSetAction/template/courier_content/panel/Main/"
        url += "/eventSubmit_doDownload/true"

        def worker():
            """Get a job from q_in, download and parse it,
            then put the result in q_out"""
            while True:
                name, path_xml, des, elts = q_in.get()

                params = {'bsurl': path_xml, 'bsname': name,
                          'elts': " ".join(elts),
                          'format': self.format,
                          'minimize': self.contraction}

                # (name, des, None) tells the consumer that every attempt
                # failed; without it the consumer would wait forever for a
                # result that never comes.
                result = (name, des, None)
                for _ in range(attemps_max):
                    try:
                        text = self.requests.get(url, params=params).text
                        result = self.parser(text, name, des, elts,
                                             self.debug)
                    except Exception:
                        time.sleep(0.1)
                    else:
                        break

                q_out.put(result)
                q_in.task_done()

        def enqueue():
            for job in dict_basis_list.values():
                q_in.put(job)

            return 0

        t = threading.Thread(target=enqueue)
        t.daemon = True
        t.start()

        for i in range(num_worker_threads):
            t = threading.Thread(target=worker)
            t.daemon = True
            t.start()

        nb_basis = len(dict_basis_list)

        try:
            for i in range(nb_basis):
                name, des, basis_data = q_out.get()
                q_out.task_done()

                str_ = '{0:>3} / {1} | {2}'.format(i + 1, nb_basis, name)

                if basis_data is None:
                    # Download/parsing never succeeded for this basis set
                    print(str_ + " Fail")
                    continue

                # ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
                # A d d _ t h e _ b a s i s _ n a m e #
                # ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
                try:
                    cmd = "INSERT INTO basis_tab(name,description) VALUES (?,?)"
                    c.execute(cmd, [name, des])
                    conn.commit()
                except sqlite3.IntegrityError:
                    print(str_ + " Fail")
                    # lastrowid still points to the previous basis set:
                    # inserting the data now would attach it to the wrong one
                    continue

                # ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #
                # A d d _ t h e _ b a s i s _ d a t a #
                # ~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~ #

                id_ = [c.lastrowid]

                try:
                    cmd = "INSERT INTO data_tab(basis_id,elt,data) VALUES (?,?,?)"
                    c.executemany(cmd, [id_ + list(k) for k in basis_data])
                    conn.commit()
                except sqlite3.IntegrityError:
                    print(str_ + " Fail")
                else:
                    print(str_)
        finally:
            conn.close()

        q_in.join()

    def new_db(self):
        """Create new_db from scratch"""

        _data = self.dwl_basis_list_raw()
        array_basis = self.basis_list_raw_to_array(_data)

        self.create_sql(array_basis)
# -*- coding: utf-8 -*-

import sqlite3
import re
import sys
import os

# Text types treated as "one value" rather than "a sequence of values"
# (str on Python 3; str and unicode on Python 2).
_STRING_TYPES = (str, type(u""))


def _as_list(value):
    """Return value as a list: None/empty -> [], one string -> [string]."""
    if not value:
        return []
    if isinstance(value, _STRING_TYPES):
        return [value]
    return list(value)


def _sql_any_of(column, values, opr="="):
    """Return (sql, params) for '(column opr ? OR column opr ? ...)'.

    The condition is parenthesised so it can be combined with AND, and the
    values are bound as parameters instead of being written in the SQL text.
    An empty list of values gives the always-false condition '(0)'.
    """
    if not values:
        return "(0)", []

    sql = " OR ".join(["{0} {1} ?".format(column, opr)] * len(values))
    return "({0})".format(sql), list(values)


def _probe_db(db_path):
    """Open db_path as an EMSL db, run one query, and close it again."""
    probe = EMSL_local(db_path=db_path)
    try:
        probe.list_basis_available()
    finally:
        probe.conn.close()


def checkSQLite3(db_path):
    """Check if the db_path is a good one.

    Return (db_path, changed). db_path is the expanded absolute path and
    changed is False when the file can be used in place. When SQLite cannot
    work on the file system (sqlite3.OperationalError), the file is copied to
    /dev/shm/<pid>.db and (that path, True) is returned; the caller is then
    responsible for removing the copy.

    Raise IOError when the file is unreadable, is not a regular file, or is
    not a SQLite3 file.
    """
    # Local import: only this function needs it
    import shutil

    db_path = os.path.expanduser(db_path)
    db_path = os.path.expandvars(db_path)
    db_path = os.path.abspath(db_path)

    def refuse(message):
        sys.stderr.write(message + "\n")
        raise IOError(message)

    # Check if db file is readable
    if not os.access(db_path, os.R_OK):
        refuse("Db file %s is not readable" % (db_path))

    if not os.path.isfile(db_path):
        refuse("Db file %s is not... a file!" % (db_path))

    # SQLite database file header is 100 bytes
    if os.path.getsize(db_path) < 100:
        refuse("Db file %s is not a SQLite file!" % (db_path))

    with open(db_path, 'rb') as fd:
        header = fd.read(100)

    # Bytes literal: the file is read in binary mode
    if header[:16] != b'SQLite format 3\x00':
        refuse("Db file %s is not in SQLiteFormat3!" % (db_path))

    # Check if the file system allows I/O on sqlite3 (lustre)
    # If not, copy on /dev/shm and remove after opening
    try:
        _probe_db(db_path)
    except sqlite3.OperationalError:
        sys.stderr.write("I/O Error for you file system\n")
        sys.stderr.write("Try some fixe\n")
    else:
        return db_path, False

    new_db_path = "/dev/shm/%d.db" % (os.getpid())

    # Try again to check, on the copy
    try:
        # shutil instead of a "cp" shell command: no quoting problem with
        # spaces or shell metacharacters in the path
        shutil.copy(db_path, new_db_path)
        _probe_db(new_db_path)
    except BaseException:
        sys.stderr.write("Sorry...\n")
        try:
            os.remove(new_db_path)
        except OSError:
            pass
        raise

    sys.stderr.write("Working !\n")
    return new_db_path, True


def cond_sql_or(table_name, l_value, glob=False):
    """Take a column name and a list of values and create the sql OR command.

    Return a one-element list holding the parenthesised condition, e.g.
    ["(elt = 'H' OR elt = 'He')"]; the operator is GLOB when glob is true.
    Values are written as single-quoted SQL string literals with embedded
    quotes doubled. An empty list gives the always-false condition "(0)".
    """

    opr = "GLOB" if glob else "="

    if not l_value:
        return ["(0)"]

    l_cond = []
    for val in l_value:
        literal = '{}'.format(val).replace("'", "''")
        l_cond.append("{} {} '{}'".format(table_name, opr, literal))

    return ["({})".format(" OR ".join(l_cond))]


def string_to_nb_mo(str_type):
    """Take a shell type string and return the number associated with it.

    "S", "P", "D" and "SP" come from a fixed table (3, 5, 7, 8). A single
    letter from "F" to "W" gives 2 * (ord(letter) - 66) + 1, so "F" -> 9.
    Raise ValueError for anything else.
    """

    d = {"S": 3,
         "P": 5,
         "D": 7,
         "SP": 8}

    if str_type in d:
        return d[str_type]

    # ord("F") = 70 and ord("W") = 87
    is_letter = isinstance(str_type, _STRING_TYPES) and len(str_type) == 1
    if is_letter and 70 <= ord(str_type) <= 87:
        return 2 * (ord(str_type) - 66) + 1

    raise ValueError("Unknown shell type: %r" % (str_type,))

#  _  __
# |_ |\/| (_  |    |  _   _  _. |
# |_ |  | __) |_   | (_) (_ (_| |
#
class EMSL_local:

    """
    All the methods for using the EMSL db locally
    """

    def __init__(self, db_path=None):
        """Open the SQLite3 file db_path and read its format from format_tab."""
        self.db_path = db_path

        self.conn = sqlite3.connect(self.db_path)
        self.c = self.conn.cursor()

        try:
            self.c.execute("SELECT * from format_tab")
            self.format = self.c.fetchone()[0]
        except Exception:
            # Do not leave the connection open on a db we cannot use
            self.conn.close()
            raise

    def close(self):
        """Close the connection to the database."""
        self.conn.close()

    def list_basis_available(self,
                             elts=None,
                             basis=None,
                             average_mo_number=False):
        """
        Return all the basis sets which contain all the elts.

        elts  -- element symbols; a basis set is kept only if it has data
                 for every one of them. Empty/None: no filter on elements.
        basis -- GLOB patterns on the basis name; a basis set is kept if it
                 matches any of them. Empty/None: no filter on names.
        average_mo_number -- when true, the basis data is parsed and each
                 item is [name, description, str(average)], the average being
                 taken over the selected atoms. Otherwise each item is a
                 (name, description) tuple.
        """

        elts = _as_list(elts)
        basis = _as_list(basis)

        # ~#~#~#~#~#~ #
        # F i l t e r #
        # ~#~#~#~#~#~ #

        if basis:
            # Parenthesised: it is combined with AND below
            cmd_filter_basis, basis_params = _sql_any_of("name", basis, "GLOB")
        else:
            cmd_filter_basis, basis_params = "(1)", []

        column_to_fech = "name, description"
        if average_mo_number:
            column_to_fech += ", data"

        if not elts:
            table = "output_tab" if average_mo_number else "basis_tab"
            cmd = "SELECT DISTINCT {0} FROM {1} WHERE {2}".format(
                column_to_fech, table, cmd_filter_basis)
            params = basis_params
        else:
            # One sub-select per element; their intersection is the set of
            # basis_id which have data for all the elements
            str_ = """SELECT DISTINCT basis_id
                        FROM output_tab
                       WHERE elt=? AND {0}""".format(cmd_filter_basis)
            cmd_basis_id = " INTERSECT ".join([str_] * len(elts))

            cmd_filter_ele, params = _sql_any_of("elt", elts)
            for elt in elts:
                params = params + [elt] + basis_params

            cmd = """SELECT DISTINCT {0}
                       FROM output_tab
                      WHERE {1} AND basis_id IN ({2})
                   ORDER BY name""".format(column_to_fech,
                                           cmd_filter_ele,
                                           cmd_basis_id)

        # ~#~#~#~#~ #
        # F e t c h #
        # ~#~#~#~#~ #

        self.c.execute(cmd, params)
        info = self.c.fetchall()

        if not average_mo_number:
            return list(info)

        # ~#~#~#~#~#~#~ #
        # P a r s i n g #
        # ~#~#~#~#~#~#~ #

        # Imported here: only the average needs the parsers
        from collections import OrderedDict
        from src.parser_handler import get_symmetry_function

        f_symmetry = get_symmetry_function(self.format)

        # Description : dict_info[name] = [description, nb_mo, nb_ele]
        dict_info = OrderedDict()

        for name, description, atom_basis in info:

            lines = atom_basis.split("\n")
            nb_mo = sum(string_to_nb_mo(type_)
                        for type_, _, _ in f_symmetry(lines))

            try:
                dict_info[name][1] += nb_mo
                dict_info[name][2] += 1.
            except KeyError:
                dict_info[name] = [description, nb_mo, 1.]

        # ~#~#~#~#~#~ #
        # R e t u r n #
        # ~#~#~#~#~#~ #

        return [[k, v[0], str(v[1] / v[2])] for k, v in dict_info.items()]

    def get_list_element_available(self, basis_name):
        """
        Return the elements available for basis_name (case insensitive).

        basis_name is the name itself or a one-element list holding it.
        """

        # ~#~#~#~#~#~ #
        # F i l t e r #
        # ~#~#~#~#~#~ #

        str_ = """SELECT DISTINCT elt
                  FROM output_tab
                  WHERE name=(?) COLLATE NOCASE"""

        # ~#~#~#~#~ #
        # F e t c h #
        # ~#~#~#~#~ #

        # A bare string would be read as one parameter per character
        self.c.execute(str_, _as_list(basis_name))

        # ~#~#~#~#~#~ #
        # R e t u r n #
        # ~#~#~#~#~#~ #

        return [str(i[0]) for i in self.c.fetchall()]

    def get_basis(self,
                  basis_name, elts=None,
                  handle_l_format=False, check_format=None):
        """
        Return the data from the basis set, one string per atom.

        elts            -- element symbols to keep; empty/None keeps all.
        handle_l_format -- when true, the data goes through the function
                           given by get_handle_l_function for this format.
        check_format    -- when set, every shell type of every atom is passed
                           to the function given by get_check_function; the
                           program exits with status 1 if it raises.
        """

        # ~#~#~#~#~#~ #
        # F i l t e r #
        # ~#~#~#~#~#~ #

        elts = _as_list(elts)
        if elts:
            cmd_filter_ele, params = _sql_any_of("elt", elts)
        else:
            cmd_filter_ele, params = "(1)", []

        # The element filter is parenthesised: "name=? AND a OR b" would
        # select element b in every basis set
        cmd = """SELECT DISTINCT data from output_tab
                  WHERE name=?
                    AND {0}""".format(cmd_filter_ele)
        self.c.execute(cmd, [basis_name] + params)

        # We need to take i[0] because fetchall return a tuple [(value),...]
        l_atom_basis = [i[0].strip() for i in self.c.fetchall()]

        # ~#~#~#~#~#~#~#~ #
        # h a n d l e _ f #
        # ~#~#~#~#~#~#~#~ #
        if handle_l_format:
            from src.parser_handler import get_handle_l_function
            f = get_handle_l_function(self.format)
            l_atom_basis = f(l_atom_basis)

        # ~#~#~#~#~ #
        # C h e c k #
        # ~#~#~#~#~ #

        if check_format:

            from src.parser_handler import get_symmetry_function
            from src.parser.check_validity import get_check_function

            f = get_check_function(check_format)
            f_symmetry = get_symmetry_function(self.format)

            for atom_basis in l_atom_basis:
                lines = atom_basis.split("\n")
                for type_, _, _ in f_symmetry(lines):
                    try:
                        f(type_)
                    except AssertionError:
                        print("False. You have somme special function like SP")
                        sys.exit(1)
                    except BaseException:
                        print("Fail !")
                        sys.exit(1)

        # ~#~#~#~#~#~ #
        # R e t u r n #
        # ~#~#~#~#~#~ #
        return l_atom_basis


if __name__ == "__main__":

    e = EMSL_local(db_path="EMSL.db")
    l = e.list_basis_available()
    for i in l:
        print(i)

    l = e.get_list_element_available("pc-0")
    print(l)

    l = e.get_basis("cc-pVTZ", ["H", "He"])
    for i in l:
        print(i)
It can take some time.", - sys.stdout.flush() - - """Download the source code of the iframe who contains the list of the basis set available""" - - url = "https://bse.pnl.gov/bse/portal/user/anon/js_peid/11535052407933/panel/Main/template/content" - if debug: - import cPickle as pickle - dbcache = 'db/cache' - if not os.path.isfile(dbcache): - page = self.requests.get(url).text - file = open(dbcache, 'w') - pickle.dump(page, file) - else: - file = open(dbcache, 'r') - page = pickle.load(file) - file.close() - - else: - page = self.requests.get(url).text - - print "Done" - return page - - def bl_raw_to_array(self, data_raw): - """Parse the raw html to create a basis set array whith all the info: - url, name,description""" - - d = {} - - for line in data_raw.split('\n'): - if "new basisSet(" in line: - b = line.find("(") - e = line.find(");") - - s = line[b + 1:e] - - tup = eval(s) - url = tup[0] - name = tup[1] - - junkers = re.compile('[[" \]]') - elts = junkers.sub('', tup[3]).split(',') - - des = tup[-1] - - if "-ecp" in url.lower(): - continue - d[name] = [name, url, des, elts] - - """Tric for the unicity of the name""" - array = [d[key] for key in d] - - array_sort = sorted(array, key=lambda x: x[0]) - print len(array_sort), "basisset will be download" - - return array_sort - - def create_url(self, url, name, elts): - """Create the adequate url to get the basis data""" - - elts_string = " ".join(elts) - - path = "https://bse.pnl.gov:443/bse/portal/user/anon/js_peid/11535052407933/action/portlets.BasisSetAction/template/courier_content/panel/Main/" - path += "/eventSubmit_doDownload/true" - path += "?bsurl=" + url - path += "&bsname=" + name - path += "&elts=" + elts_string - path += "&format=" + self.format - path += "&minimize=" + self.contraction - return path - - def basis_data_row_to_array(self, data, name, des, elts): - """Parse the basis data raw html to get a nice tuple""" - - d = [] - - b = data.find("$DATA") - e = data.find("$END") - if (b == -1 or 
data.find("$DATA$END") != -1): - if debug: - print data - raise Exception("WARNING not DATA") - else: - data = data.replace("PHOSPHOROUS", "PHOSPHORUS") - data = data.replace("D+", "E+") - data = data.replace("D-", "E-") - - data = data[b + 5:e - 1].split('\n\n') - - for (elt, data_elt) in zip(elts, data): - - elt_long_th = dict_ele[elt.lower()] - elt_long_exp = data_elt.split()[0].lower() - - if "$" in data_elt: - print "Eror", - raise Exception("WARNING not bad split") - - if elt_long_th == elt_long_exp: - d.append((name, des, elt, data_elt.strip())) - else: - print "th", elt_long_th - print "exp", elt_long_exp - print "abv", elt - raise Exception("WARNING not good ELEMENT") - - return d - - def create_sql(self, list_basis_array): - """Create the sql from the list of basis available data""" - - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - - # Create table - c.execute('''CREATE TABLE all_value - (name text, description text, elt text, data text)''') - - import Queue - import threading - - num_worker_threads = 7 - attemps_max = 20 - - q_in = Queue.Queue(num_worker_threads) - q_out = Queue.Queue(num_worker_threads) - - def worker(): - """get a Job from the q_in, do stuff, when finish put it in the q_out""" - while True: - [name, url, des, elts] = q_in.get() - url = self.create_url(url, name, elts) - - attemps = 0 - while attemps < attemps_max: - text = self.requests.get(url).text - try: - basis_data = self.basis_data_row_to_array( - text, name, des, elts) - break - except: - time.sleep(0.1) - attemps += 1 - - try: - q_out.put(([name, url, des, elts], basis_data)) - q_in.task_done() - except: - print name, url, des - raise - - def enqueue(): - for [name, url, des, elts] in list_basis_array: - q_in.put(([name, url, des, elts])) - - return 0 - - t = threading.Thread(target=enqueue) - t.daemon = True - t.start() - - for i in range(num_worker_threads): - t = threading.Thread(target=worker) - t.daemon = True - t.start() - - nb_basis = len(list_basis_array) - 
- for i in range(nb_basis): - [name, url, des, elts], basis_data = q_out.get() - - try: - c.executemany( - "INSERT INTO all_value VALUES (?,?,?,?)", basis_data) - conn.commit() - - print '{:>3}'.format(i + 1), "/", nb_basis, name - except: - print '{:>3}'.format(i + 1), "/", nb_basis, name, "fail", - print ' ', [url, des, elts] - raise - conn.close() - - q_in.join() - - def new_db(self): - """Create new_db from scratch""" - - _data = self.dwl_basis_list_raw() - array_basis = self.bl_raw_to_array(_data) - del _data - - self.create_sql(array_basis) - - -class EMSL_local: - - def __init__(self, db_path=None): - self.db_path = db_path - - def get_list_basis_available(self, elts=[]): - - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - - if not elts: - - c.execute("SELECT DISTINCT name,description from all_value") - data = c.fetchall() - - else: - cmd = [ - "SELECT name,description FROM all_value WHERE elt=?"] * len(elts) - cmd = " INTERSECT ".join(cmd) + ";" - - c.execute(cmd, elts) - data = c.fetchall() - - data = [i[:] for i in data] - - conn.close() - - return data - - def get_list_element_available(self, basis_name): - - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - - c.execute( - "SELECT DISTINCT elt from all_value WHERE name=:name_us COLLATE NOCASE", { - "name_us": basis_name}) - - data = c.fetchall() - - data = [str(i[0]) for i in data] - - conn.close() - return data - - def get_basis(self, basis_name, elts=None, with_l=False): - - def get_list_type(l_line): - l = [] - for i, line in enumerate(l_line): - - m = re.search(p, line) - if m: - l.append([m.group(1), i]) - try: - l[-2].append(i) - except IndexError: - pass - - l[-1].append(i + 1) - return l - - import re - - # __ _ - # /__ _ _|_ _|_ ._ _ ._ _ _ _. 
| - # \_| (/_ |_ | | (_) | | | _> (_| | - # | - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - - if elts: - cmd_ele = "AND " + " ".join(cond_sql_or("elt", elts)) - else: - cmd_ele = "" - - c.execute('''SELECT DISTINCT data from all_value - WHERE name="{basis_name}" COLLATE NOCASE - {cmd_ele}'''.format(basis_name=basis_name, - cmd_ele=cmd_ele)) - - l_data_raw = c.fetchall() - conn.close() - - # |_| _. ._ _| | _ || | || - # | | (_| | | (_| | (/_ |_ - # - - p = re.compile(ur'^(\w)\s+\d+\b') - - l_data = [] - - for data_raw in l_data_raw: - - basis = data_raw[0].strip() - - l_line_raw = basis.split("\n") - - l_line = [l_line_raw[0]] - - for symmetry, begin, end in get_list_type(l_line_raw): - - if not(with_l) and symmetry in "L": - - body_s = [] - body_p = [] - - for i_l in l_line_raw[begin + 1:end]: - - a = i_l.split() - - common = "{:>3}".format(a[0]) - common += "{:>15.7f}".format(float(a[1])) - - tail_s = common + "{:>23.7f}".format(float(a[2])) - body_s.append(tail_s) - - tail_p = common + "{:>23.7f}".format(float(a[3])) - body_p.append(tail_p) - - l_line += [l_line_raw[begin].replace("L", "S")] - l_line += body_s - - l_line += [l_line_raw[begin].replace("L", "P")] - l_line += body_p - else: - l_line += l_line_raw[begin:end] - - l_data.append("\n".join(l_line)) - - return l_data - - -format_dict = \ - { - "g94": "Gaussian94", - "gamess-us": "GAMESS-US", - "gamess-uk": "GAMESS-UK", - "turbomole": "Turbomole", - "tx93": "TX93", - "molpro": "Molpro", - "molproint": "MolproInt", - "hondo": "Hondo", - "supermolecule": "SuperMolecule", - "molcas": "Molcas", - "hyperchem": "HyperChem", - "dalton": "Dalton", - "demon-ks": "deMon-KS", - "demon2k": "deMon2k", - "aces2": "AcesII", - } - -if __name__ == "__main__": - - e = EMSL_local(db_path="EMSL.db") - l = e.get_list_basis_available() - for i in l: - print i - - l = e.get_list_element_available("pc-0") - print l - - l = e.get_basis("cc-pVTZ", ["H", "He"]) - for i in l: - print i diff --git a/src/misc/__init__.py 
b/src/misc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/docopt.py b/src/misc/docopt.py similarity index 96% rename from src/docopt.py rename to src/misc/docopt.py index 78f7f77..59830d5 100644 --- a/src/docopt.py +++ b/src/misc/docopt.py @@ -59,12 +59,20 @@ class Pattern(object): either = [list(child.children) for child in transform(self).children] for case in either: for e in [child for child in case if case.count(child) > 1]: - if type(e) is Argument or type(e) is Option and e.argcount: + if isinstance( + e, + Argument) or isinstance( + e, + Option) and e.argcount: if e.value is None: e.value = [] - elif type(e.value) is not list: + elif not isinstance(e.value, list): e.value = e.value.split() - if type(e) is Command or type(e) is Option and e.argcount == 0: + if isinstance( + e, + Command) or isinstance( + e, + Option) and e.argcount == 0: e.value = 0 return self @@ -84,10 +92,10 @@ def transform(pattern): if any(t in map(type, children) for t in parents): child = [c for c in children if type(c) in parents][0] children.remove(child) - if type(child) is Either: + if isinstance(child, Either): for c in child.children: groups.append([c] + children) - elif type(child) is OneOrMore: + elif isinstance(child, OneOrMore): groups.append(child.children * 2 + children) else: groups.append(child.children + children) @@ -117,10 +125,10 @@ class LeafPattern(Pattern): left_ = left[:pos] + left[pos + 1:] same_name = [a for a in collected if a.name == self.name] if type(self.value) in (int, list): - if type(self.value) is int: + if isinstance(self.value, int): increment = 1 else: - increment = ([match.value] if type(match.value) is str + increment = ([match.value] if isinstance(match.value, str) else match.value) if not same_name: match.value = increment @@ -151,7 +159,7 @@ class Argument(LeafPattern): def single_match(self, left): for n, pattern in enumerate(left): - if type(pattern) is Argument: + if isinstance(pattern, Argument): return n, 
Argument(self.name, pattern.value) return None, None @@ -169,7 +177,7 @@ class Command(Argument): def single_match(self, left): for n, pattern in enumerate(left): - if type(pattern) is Argument: + if isinstance(pattern, Argument): if pattern.value == self.name: return n, Command(self.name, True) else: diff --git a/src/elts_abrev.dat b/src/misc/elts_abrev.dat similarity index 100% rename from src/elts_abrev.dat rename to src/misc/elts_abrev.dat diff --git a/src/parser/__init__.py b/src/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/parser/check_validity.py b/src/parser/check_validity.py new file mode 100644 index 0000000..708cf09 --- /dev/null +++ b/src/parser/check_validity.py @@ -0,0 +1,52 @@ +# _ +# / |_ _ _ | _. | o _| o _|_ +# \_ | | (/_ (_ |< \/ (_| | | (_| | |_ \/ +# / +# Do this After the L special case traitement. + +import sys + + +def check_gamess(str_type): + """Check is the orbital type is handle by gamess""" + + assert len(str_type) == 1 + + if str_type in "S P D".split(): + return True + elif str_type == "SP": + raise BaseException + else: + return True + + +def check_NWChem(str_type): + """Check is the orbital type is handle by gamess""" + + assert len(str_type) == 1 + + if str_type in "S P D".split(): + return True + elif str_type > "I" or str_type in "K L M".split(): + raise BaseException + else: + return True + + +d_check = {"GAMESS-US": check_gamess, + "NWChem": check_NWChem} + + +def get_check_function(name_program): + """ + Tranforme SP special function (create using get_symmetry_function) + into S and P + """ + try: + f = d_check[name_program] + except KeyError: + str_ = "You need to add a check funtion for your program {0}" + print >> sys.stderr, str_.format(name_program) + print >> sys.stderr, "This one are avalaible {0}".format(d_check.keys()) + sys.exit(1) + return f diff --git a/src/parser/gamess_us.py b/src/parser/gamess_us.py new file mode 100644 index 0000000..d06cd22 --- /dev/null +++ 
b/src/parser/gamess_us.py @@ -0,0 +1,138 @@ +# __ +# /__ _. ._ _ _ _ _ _ +# \_| (_| | | | (/_ _> _> |_| _> +# + +from src.parser_handler import get_dict_ele +import re + + +def parse_basis_data_gamess_us(data, name, des, elts, debug=False): + """Parse the basis data raw html of gamess-us to get a nice tuple + Return (name, description, [[ele, data_ele],...])""" + basis_data = [] + + b = data.find("$DATA") + e = data.find("$END") + if (b == -1 or data.find("$DATA$END") != -1): + if debug: + print data + raise Exception("WARNING not DATA") + else: + dict_replace = {"PHOSPHOROUS": "PHOSPHORUS", + "D+": "E+", + "D-": "E-"} + + for k, v in dict_replace.iteritems(): + data = data.replace(k, v) + + data = data[b + 5:e - 1].split('\n\n') + + dict_ele = get_dict_ele() + + for (elt, data_elt) in zip(elts, data): + + elt_long_th = dict_ele[elt.lower()] + elt_long_exp = data_elt.split()[0].lower() + + if "$" in data_elt: + if debug: + print "Eror", + raise Exception("WARNING bad split") + + if elt_long_th == elt_long_exp: + basis_data.append([elt, data_elt.strip()]) + else: + if debug: + print "th", elt_long_th + print "exp", elt_long_exp + print "abv", elt + raise Exception("WARNING not a good ELEMENT") + + return (name, des, basis_data) + + +symmetry_regex = re.compile(ur'^(\w)\s+\d+\b') + + +def l_symmetry_gamess_us(atom_basis): + """ + Return the begin and the end of all the type of orbital + input: atom_basis = [name, S 1, 12 0.12 12212, ...] + output: [ [type, begin, end], ...] + """ + # Example + # [[u'S', 1, 5], [u'L', 5, 9], [u'L', 9, 12], [u'D', 16, 18]]" + + l = [] + for i, line in enumerate(atom_basis): + # Optimisation for not seaching all the time + if len(line) < 10: + m = re.search(symmetry_regex, line) + if m: + # Cause of L ! 
+ read_symmetry = m.group(1) + + # L is real L or special SP + # Just check the number of exponant + if all([read_symmetry == "L", + len(atom_basis[i + 1].split()) == 4]): + real_symmetry = "SP" + else: + real_symmetry = read_symmetry + + l.append([real_symmetry, i]) + try: + l[-2].append(i) + except IndexError: + pass + + l[-1].append(i + 1) + return l + + +def handle_l_gamess_us(l_atom_basis): + """ + Read l_atom_basis and change the SP in L and P + """ + + l_data = [] + for atom_basis in l_atom_basis: + + # Split the data in line + l_line_raw = atom_basis.split("\n") + l_line = [l_line_raw[0]] + # l_line_raw[0] containt the name of the Atom + + for symmetry, begin, end in l_symmetry_gamess_us(l_line_raw): + + if symmetry == "SP": + + body_s = [] + body_p = [] + + for i_l in l_line_raw[begin + 1:end]: + + # one L => S & P + a = i_l.split() + + common = "{:>3}".format(a[0]) + common += "{:>15.7f}".format(float(a[1])) + + tail_s = common + "{:>23.7f}".format(float(a[2])) + body_s.append(tail_s) + + tail_p = common + "{:>23.7f}".format(float(a[3])) + body_p.append(tail_p) + + l_line += [l_line_raw[begin].replace("L", "S")] + l_line += body_s + + l_line += [l_line_raw[begin].replace("L", "P")] + l_line += body_p + else: + l_line += l_line_raw[begin:end] + + l_data.append("\n".join(l_line)) + + return l_data diff --git a/src/parser/gaussian94.py b/src/parser/gaussian94.py new file mode 100644 index 0000000..b3a59ae --- /dev/null +++ b/src/parser/gaussian94.py @@ -0,0 +1,83 @@ +# __ _ +# /__ _. _ _ o _. ._ (_| |_|_ +# \_| (_| |_| _> _> | (_| | | | | +# +import sys + + +def parse_basis_data_gaussian94(data, name, description, elements, debug=True): + """Parse the Gaussian94 basis data raw html to get a nice tuple. + + The data-pairs item is actually expected to be a 2 item list: + [symbol, data] + + e.g. ["Ca", "#BASIS SET..."] + + N.B.: Currently ignores ECP data! 
+ + @param data: raw HTML from BSE + @type data : unicode + @param name: basis set name + @type name : str + @param des: basis set description + @type des : str + @param elements: element symbols e.g. ['H', 'C', 'N', 'O', 'Cl'] + @type elements : list + @return: (name, description, data-pairs) + @rtype : tuple + """ + + # Each basis set block starts and ends with ****. Find the region + # containing all the basis blocks using the first and last ****. + mark = "****" + begin = data.find(mark) + end = data.rfind(mark) + + if begin == -1 or end == -1: + if debug: + print(data) + str_ = " No basis set data found while attempting to process {0} ({1})" + raise ValueError(str_.format(name, description)) + + trimmed = data[begin + len(mark): end - len(mark)].strip() + chunks = [] + lines = [] + + # group lines of data delimited by mark into per-element chunks + for line in trimmed.split("\n"): + if line.startswith(mark): + if lines: + chunks.append(lines) + lines = [line] + else: + lines.append(line) + + # handle trailing chunk that is not followed by another basis set block + # also remove the marker lines from the chunk itself + if lines and (not chunks or lines != chunks[-1]): + chunks.append(lines) + + # join lines back into solid text blocks + chunks = ["\n".join([L for L in c if mark not in L]) for c in chunks] + + # check each block for element and assign symbols to final pairs + pairs = [] + unused_elements = set([e.upper() for e in elements]) + for chunk in chunks: + # get first 3 chars of first line in block + symbol = chunk.split("\n")[0][:3].strip() + try: + unused_elements.remove(symbol.upper()) + except KeyError: + if debug: + msg = "Warning: already processed {0}\n".format(symbol) + sys.stderr.write(msg) + pairs.append([symbol, chunk]) + + if unused_elements: + msg = "Warning: elements {0} left over for {1}".format( + list(unused_elements), + name) + print(msg) + + return (name, description, pairs) diff --git a/src/parser/nwchem.py b/src/parser/nwchem.py new 
file mode 100644 index 0000000..ddb2378 --- /dev/null +++ b/src/parser/nwchem.py @@ -0,0 +1,228 @@ +# _ +# |\ | / |_ _ ._ _ +# | \| \/\/ \_ | | (/_ | | | +# +import json + + +def extract_basis_nwchem(data, name): + """Extract atomic orbital, charge density fitting, or exchange + correlation functional basis data from a text region passed in as + data. The charge density fitting and exchange correlation functional + basis set data are employed for density functional calculations. + + @param data: text region containing basis set data + @type data : str + @param name: name of basis type: "ao basis", "cd basis", or "xc basis" + @type name : str + @return: per-element basis set chunks + @rtype : list + """ + + begin_marker = """BASIS "{0}" PRINT""".format(name) + end_marker = "END" + + # search for the basis set data begin marker + # calling "upper" on data because original data has inconsistent + # capitalization + begin = data.upper().find(begin_marker.upper()) + end = data.upper().find(end_marker, begin) + + # No basis data found + if begin == -1: + return [] + + trimmed = data[begin + len(begin_marker): end - len(end_marker)].strip() + + chunks = [] + lines = [] + + # group lines of data delimited by #BASIS SET... into per-element chunks + for line in trimmed.split("\n"): + if line.upper().startswith("#BASIS SET"): + if lines: + chunks.append(lines) + lines = [line] + else: + lines.append(line) + + # handle trailing chunk that is not followed by another #BASIS SET... + if lines and (not chunks or lines != chunks[-1]): + chunks.append(lines) + + # join lines back into solid text blocks + chunks = ["\n".join(c) for c in chunks] + return chunks + + +def extract_ecp_nwchem(data): + """Extract the effective core potential basis data from a text region + passed in as data. 
+ + @param data: text region containing ECP data + @type data : str + @return: per-element effective core potential chunks + @rtype : list + """ + + ecp_begin_mark = "ECP\n" + ecp_end_mark = "END" + ecp_begin = data.upper().find(ecp_begin_mark) + ecp_end = data.upper().find(ecp_end_mark, ecp_begin) + ecp_region = "" + + if ecp_begin > -1 and ecp_end > -1: + ecp_region = data[ + ecp_begin + + len(ecp_begin_mark): ecp_end - + len(ecp_end_mark)].strip() + + # No ECP data, so return empty list + else: + return [] + + chunks = [] + lines = [] + + # group lines of data delimited by XX nelec YY into chunks, e.g. + # "Zn nelec 18" begins a zinc ECP + for line in ecp_region.split("\n"): + if line.lower().find(" nelec ") > -1: + if lines: + chunks.append(lines) + lines = [line] + else: + lines.append(line) + + # handle trailing chunk that is not followed by another XX nelec YY.. + if lines and (not chunks or lines != chunks[-1]): + chunks.append(lines) + + # join lines back into solid text blocks + chunks = ["\n".join(c) for c in chunks] + return chunks + + +def unpack_nwchem_basis_block(data): + """Unserialize a NWChem basis data block and extract components + + @param data: a JSON of basis set data, perhaps containing many types + @type data : str + @return: unpacked data + @rtype : dict + """ + + unpacked = json.loads(data) + return unpacked + + +def parse_basis_data_nwchem(data, name, description, elements, debug=True): + """Parse the NWChem basis data raw html to get a nice tuple. + + The data-pairs item is actually expected to be a 2 item list: + [symbol, data] + + e.g. ["Ca", "#BASIS SET..."] + + @param data: raw HTML from BSE + @type data : unicode + @param name: basis set name + @type name : str + @param des: basis set description + @type des : str + @param elements: element symbols e.g. 
['H', 'C', 'N', 'O', 'Cl'] + @type elements : list + @return: (name, description, data-pairs) + @rtype : tuple + """ + + unused_elements = set([e.upper() for e in elements]) + + def extract_symbol(txt): + for sline in txt.split("\n"): + if not sline.startswith("#"): + try: + symbol = sline[:3].strip().split()[0] + return symbol + except IndexError: + continue + + raise ValueError("Can't find element symbol in {0}".format(txt)) + + ao_chunks = extract_basis_nwchem(data, "ao basis") + cd_chunks = extract_basis_nwchem(data, "cd basis") + xc_chunks = extract_basis_nwchem(data, "xc basis") + ecp_chunks = extract_ecp_nwchem(data) + + if not any([ao_chunks, cd_chunks, xc_chunks, ecp_chunks]): + str_ = "No basis set data found while attempting to process {0} ({1})" + raise ValueError(str_.format(name, description)) + + # Tag all used elements, whether from ordinary AO basis or ECP section + for chunk in ao_chunks + cd_chunks + xc_chunks + ecp_chunks: + try: + symbol = extract_symbol(chunk) + unused_elements.remove(symbol.upper()) + except KeyError: + pass + + if unused_elements: + msg = "Warning: elements {0} left over for {1}" + print msg.format(list(unused_elements), name) + + # Form packed chunks, turn packed chunks into pairs + used_elements = set() + packed = {} + + for cgroup, gname in [(ao_chunks, "ao basis"), (cd_chunks, "cd basis"), + (xc_chunks, "xc basis"), (ecp_chunks, "ecp")]: + for chunk in cgroup: + symbol = extract_symbol(chunk) + + # Expand entry, e.g. add ecp data for Na after it has ao basis + try: + idx, ch = packed[symbol] + ch[gname] = chunk + chunk_dict = ch.copy() + # Create fresh entry, e.g. 
add Na with initial ao basis + except KeyError: + chunk_dict = {gname: chunk} + idx = len(used_elements) + used_elements.add(symbol) + + packed[symbol] = (idx, chunk_dict) + + """ + for chunk in ao_chunks: + symbol = extract_symbol(chunk) + chunk_dict = {"ao basis" : chunk} + idx = len(used_elements) + used_elements.add(symbol) + packed[symbol] = (idx, chunk_dict) + + for chunk in ecp_chunks: + symbol = extract_symbol(chunk) + #add ECP data if existing chunk, else create fresh chunk + try: + idx, ch = packed[symbol] + ch["ecp"] = chunk + chunk_dict = ch.copy() + except KeyError: + chunk_dict = {"ecp" : chunk} + idx = len(used_elements) + used_elements.add(symbol) + packed[symbol] = (idx, chunk_dict) + """ + + values = sorted(packed.values()) + + # Assign (Symbol, Serialized) to final pairs + pairs = [] + for idx, chunk in values: + symbol = extract_symbol(chunk.get("ao basis") + or chunk.get("cd basis") + or chunk.get("xc basis") + or chunk.get("ecp")) + serialized = json.dumps(chunk) + pairs.append([symbol, serialized]) + return [name, description, pairs] diff --git a/src/parser_handler.py b/src/parser_handler.py new file mode 100644 index 0000000..fa9d634 --- /dev/null +++ b/src/parser_handler.py @@ -0,0 +1,138 @@ +import sys +import os +import re + + +def get_dict_ele(): + """Return dict[atom]=[abreviation]""" + elt_path = os.path.dirname(sys.argv[0]) + "/src/misc/elts_abrev.dat" + + with open(elt_path, "r") as f: + data = f.readlines() + + dict_ele = dict() + for i in data: + l = i.split("-") + dict_ele[l[1].strip().lower()] = l[2].strip().lower() + + return dict_ele + +# ______ _ _ _ _ +# | ___| | | | (_) | | +# | |_ _ __ ___ _ __ ___ __ _| |_ __| |_ ___| |_ +# | _| '__/ _ \| '_ ` _ \ / _` | __| / _` | |/ __| __| +# | | | | | (_) | | | | | | (_| | |_ | (_| | | (__| |_ +# \_| |_| \___/|_| |_| |_|\__,_|\__| \__,_|_|\___|\__| +# +from src.parser.gamess_us import parse_basis_data_gamess_us +from src.parser.gaussian94 import parse_basis_data_gaussian94 +from 
src.parser.nwchem import parse_basis_data_nwchem + + +parser_dict = {"Gaussian94": parse_basis_data_gaussian94, + "GAMESS-US": parse_basis_data_gamess_us, + "NWChem": parse_basis_data_nwchem, + "GAMESS-UK": None, + "Turbomole": None, + "TX93": None, + "Molpro": None, + "MolproInt": None, + "Hondo": None, + "SuperMolecule": None, + "Molcas": None, + "HyperChem": None, + "Dalton": None, + "deMon-KS": None, + "deMon2k": None, + "AcesII": None} + + +def check_format(format): + try: + parser_dict[format] + except KeyError: + str_ = ["This format ({0}) is not available in EMSL".format(format), + "EMSL provide this list : {0}".format(parser_dict.keys())] + print >> sys.stderr, "\n".join(str_) + sys.exit(1) + else: + return format + + +def get_parser_function(format): + if not parser_dict[format]: + list_parser = [k for k, v in parser_dict.iteritems() if v] + + str_ = ["We have no parser for this format {0}".format(format), + "We only support {0}".format(list_parser), + "Fill free to Fock /pull request", + "You just need to add a function like this one:", + "'src.pars.gamess_us.parse_basis_data_gamess_us'"] + print >> sys.stderr, "\n".join(str_) + sys.exit(1) + else: + return parser_dict[format] + +# _____ _ _ _ _ +# / ___| | | | (_) | | +# \ `--. _ _ _ __ ___ _ __ ___ ___| |_ _ __ _ _ __| |_ ___| |_ +# `--. \ | | | '_ ` _ \| '_ ` _ \ / _ \ __| '__| | | | / _` | |/ __| __| +# /\__/ / |_| | | | | | | | | | | | __/ |_| | | |_| | | (_| | | (__| |_ +# \____/ \__, |_| |_| |_|_| |_| |_|\___|\__|_| \__, | \__,_|_|\___|\__| +# __/ | __/ | +# |___/ |___/ + +""" +Return the begin and the end of all the type of orbital +input: atom_basis = [name, S 1, 12 0.12 12212, ...] +output: [ [type, begin, end], ...] +""" + +from src.parser.gamess_us import l_symmetry_gamess_us + +symmetry_dict = {"GAMESS-US": l_symmetry_gamess_us} + + +def get_symmetry_function(format): + """ + Return the begin and the end of all the type of orbital + input: atom_basis = [name, S 1, 12 0.12 12212, ...] 
+ output: [ [type, begin, end], ...] + """ + try: + f = symmetry_dict[format] + except KeyError: + print >> sys.stderr, "You need to add a function in symmetry_dict" + print >> sys.stderr, "for your format ({0})".format(format) + sys.exit(1) + else: + return f + +# _ _ _ _ _ _ _ _ _ ______ _ _ +# | | | | | | | ( | ) | ( | ) | _ (_) | | +# | |_| | __ _ _ __ __| | | ___ V V| | V V | | | |_ ___| |_ +# | _ |/ _` | '_ \ / _` | |/ _ \ | | | | | | |/ __| __| +# | | | | (_| | | | | (_| | | __/ | |____ | |/ /| | (__| |_ +# \_| |_/\__,_|_| |_|\__,_|_|\___| \_____/ |___/ |_|\___|\__| + +""" +Tranforme SP special function (create using get_symmetry_function) into S and P +""" +from src.parser.gamess_us import handle_l_gamess_us + +handle_l_dict = {"GAMESS-US": handle_l_gamess_us} + + +def get_handle_l_function(format): + """ + Tranforme SP special function (create using get_symmetry_function) + into S and P + """ + try: + f = handle_l_dict[format] + except KeyError: + print >> sys.stderr, "You need to add a function in handle_l_dict" + print >> sys.stderr, "for your format ({0})".format(format) + sys.exit(1) + else: + return f