Adding parser...

2025-01-03 01:55:54 +01:00 · 2015-03-17 17:32:37 +01:00 · 2015-03-17 17:32:37 +01:00 · 7d146f0e8a
commit 7d146f0e8a
parent d1f0515ebb
8 changed files with 1209 additions and 0 deletions
--- a/src/misc/init.py
+++ b/src/misc/init.py
--- a/src/misc/docopt.py
+++ b/src/misc/docopt.py
@ -0,0 +1,590 @@
+"""Pythonic command-line interface parser that will make you smile.
+
+ * http://docopt.org
+ * Repository and issue-tracker: https://github.com/docopt/docopt
+ * Licensed under terms of MIT license (see LICENSE-MIT)
+ * Copyright (c) 2013 Vladimir Keleshev, vladimir@keleshev.com
+
+"""
+import sys
+import re
+
+
+__all__ = ['docopt']
+__version__ = '0.6.1'
+
+
+class DocoptLanguageError(Exception):
+
+    """Error in construction of usage-message by developer.
+
+    Signals a problem in the usage text itself (for example a missing or
+    duplicated "usage:" section), as opposed to bad command-line input.
+    """
+
+
+class DocoptExit(SystemExit):
+
+    """Exit in case user invoked program with incorrect arguments."""
+
+    # Usage text appended to every exit message. It is a class attribute,
+    # so assigning DocoptExit.usage changes it for all later instances.
+    usage = ''
+
+    def __init__(self, message=''):
+        # The exit payload is the message followed by the usage text, with
+        # surrounding whitespace stripped.
+        SystemExit.__init__(self, (message + '\n' + self.usage).strip())
+
+
+class Pattern(object):
+
+    """Base class of all pattern-tree nodes.
+
+    Equality and hashing are both derived from ``repr()``, so two nodes
+    compare equal whenever they print the same.
+    """
+
+    def __eq__(self, other):
+        return repr(self) == repr(other)
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def fix(self):
+        """Normalise the tree in place and return ``self``."""
+        self.fix_identities()
+        self.fix_repeating_arguments()
+        return self
+
+    def fix_identities(self, uniq=None):
+        """Make pattern-tree tips point to same object if they are equal."""
+        # Nodes without ``children`` are leaves; nothing to rewire.
+        if not hasattr(self, 'children'):
+            return self
+        # ``uniq`` is computed once at the root and shared by the recursion.
+        uniq = list(set(self.flat())) if uniq is None else uniq
+        for i, child in enumerate(self.children):
+            if not hasattr(child, 'children'):
+                assert child in uniq
+                # Replace the leaf by the canonical equal instance.
+                self.children[i] = uniq[uniq.index(child)]
+            else:
+                child.fix_identities(uniq)
+
+    def fix_repeating_arguments(self):
+        """Fix elements that should accumulate/increment values."""
+        # Each entry of ``either`` is one alternative of the expanded usage.
+        either = [list(child.children) for child in transform(self).children]
+        for case in either:
+            # Only elements that occur more than once in one alternative.
+            for e in [child for child in case if case.count(child) > 1]:
+                # ``and`` binds tighter than ``or``: any Argument, or an
+                # Option that takes a value -> collect values in a list.
+                if isinstance(
+                        e,
+                        Argument) or isinstance(
+                        e,
+                        Option) and e.argcount:
+                    if e.value is None:
+                        e.value = []
+                    elif not isinstance(e.value, list):
+                        e.value = e.value.split()
+                # Any Command, or an Option without a value -> count
+                # occurrences, starting from 0.
+                if isinstance(
+                        e,
+                        Command) or isinstance(
+                        e,
+                        Option) and e.argcount == 0:
+                    e.value = 0
+        return self
+
+
+def transform(pattern):
+    """Expand pattern into an (almost) equivalent one, but with single Either.
+
+    Example: ((-a | -b) (-c | -d)) => (-a -c | -a -d | -b -c | -b -d)
+    Quirks: [-a] => (-a), (-a...) => (-a -a)
+
+    """
+    result = []
+    # Work queue of groups that still have to be examined.
+    groups = [[pattern]]
+    while groups:
+        children = groups.pop(0)
+        parents = [Required, Optional, OptionsShortcut, Either, OneOrMore]
+        if any(t in map(type, children) for t in parents):
+            # Expand the first branch node found in this group.
+            child = [c for c in children if type(c) in parents][0]
+            children.remove(child)
+            if isinstance(child, Either):
+                # One new group per alternative.
+                for c in child.children:
+                    groups.append([c] + children)
+            elif isinstance(child, OneOrMore):
+                # Repetition is approximated by listing the children twice.
+                groups.append(child.children * 2 + children)
+            else:
+                # Other branch types: splice their children in place.
+                groups.append(child.children + children)
+        else:
+            # Only leaves remain: this group is a finished alternative.
+            result.append(children)
+    return Either(*[Required(*e) for e in result])
+
+
+class LeafPattern(Pattern):
+
+    """Leaf/terminal node of a pattern tree."""
+
+    def __init__(self, name, value=None):
+        self.name, self.value = name, value
+
+    def __repr__(self):
+        return '%s(%r, %r)' % (self.__class__.__name__, self.name, self.value)
+
+    def flat(self, *types):
+        """Return ``[self]`` if no types are given or own type is listed."""
+        return [self] if not types or type(self) in types else []
+
+    def match(self, left, collected=None):
+        """Try to consume one element of ``left``.
+
+        Returns ``(matched, remaining, collected)``. ``single_match`` is
+        not defined here; it is called on ``self`` and is expected to
+        return ``(position, leaf)`` or ``(None, None)``.
+        """
+        collected = [] if collected is None else collected
+        pos, match = self.single_match(left)
+        if match is None:
+            return False, left, collected
+        # ``left`` without the element that was just matched.
+        left_ = left[:pos] + left[pos + 1:]
+        same_name = [a for a in collected if a.name == self.name]
+        # Exact type test: a bool value is not treated as an int counter.
+        if type(self.value) in (int, list):
+            if isinstance(self.value, int):
+                increment = 1
+            else:
+                # Wrap a single string so it can be concatenated to a list.
+                increment = ([match.value] if isinstance(match.value, str)
+                             else match.value)
+            if not same_name:
+                # First occurrence: start the counter/list.
+                match.value = increment
+                return True, left_, collected + [match]
+            # Later occurrence: update the already collected element.
+            same_name[0].value += increment
+            return True, left_, collected
+        return True, left_, collected + [match]
+
+
+class BranchPattern(Pattern):
+
+    """Branch/inner node of a pattern tree."""
+
+    def __init__(self, *children):
+        self.children = list(children)
+
+    def __repr__(self):
+        return '%s(%s)' % (self.__class__.__name__,
+                           ', '.join(repr(a) for a in self.children))
+
+    def flat(self, *types):
+        """Return the leaves below this node as a flat list.
+
+        If the type of this node is one of ``types`` the node itself is
+        returned instead of descending into it.
+        """
+        if type(self) in types:
+            return [self]
+        return sum([child.flat(*types) for child in self.children], [])
+
+
+class Argument(LeafPattern):
+
+    """Positional argument leaf."""
+
+    def single_match(self, left):
+        """Match the first Argument instance found in ``left``."""
+        for n, pattern in enumerate(left):
+            if isinstance(pattern, Argument):
+                # Keep our own name, take the value of the matched element.
+                return n, Argument(self.name, pattern.value)
+        return None, None
+
+    @classmethod
+    def parse(class_, source):
+        """Build an instance from text holding ``<name>`` and ``[default: x]``."""
+        # NOTE(review): the patterns are non-raw strings that rely on
+        # unrecognised escapes such as \S being passed through unchanged.
+        name = re.findall('(<\S*?>)', source)[0]
+        value = re.findall('\[default: (.*)\]', source, flags=re.I)
+        return class_(name, value[0] if value else None)
+
+
+class Command(Argument):
+
+    """Literal command word leaf; its value defaults to False."""
+
+    def __init__(self, name, value=False):
+        self.name, self.value = name, value
+
+    def single_match(self, left):
+        """Match only if the first Argument in ``left`` equals our name."""
+        for n, pattern in enumerate(left):
+            if isinstance(pattern, Argument):
+                if pattern.value == self.name:
+                    return n, Command(self.name, True)
+                else:
+                    # The first positional is something else: no match.
+                    break
+        return None, None
+
+
+class Option(LeafPattern):
+
+    """Option leaf with a short and/or long spelling."""
+
+    def __init__(self, short=None, long=None, argcount=0, value=False):
+        assert argcount in (0, 1)
+        self.short, self.long, self.argcount = short, long, argcount
+        # An option that takes an argument defaults to None, not False.
+        self.value = None if value is False and argcount else value
+
+    @classmethod
+    def parse(class_, option_description):
+        """Build an Option from one option-description entry."""
+        short, long, argcount, value = None, None, 0, False
+        # The spellings are separated from the description by two spaces.
+        options, _, description = option_description.strip().partition('  ')
+        options = options.replace(',', ' ').replace('=', ' ')
+        for s in options.split():
+            if s.startswith('--'):
+                long = s
+            elif s.startswith('-'):
+                short = s
+            else:
+                # Any other word is the placeholder of the option argument.
+                argcount = 1
+        if argcount:
+            matched = re.findall('\[default: (.*)\]', description, flags=re.I)
+            value = matched[0] if matched else None
+        return class_(short, long, argcount, value)
+
+    def single_match(self, left):
+        """Return the first element of ``left`` carrying the same name."""
+        for n, pattern in enumerate(left):
+            if self.name == pattern.name:
+                return n, pattern
+        return None, None
+
+    @property
+    def name(self):
+        # The long spelling is preferred when both exist.
+        return self.long or self.short
+
+    def __repr__(self):
+        return 'Option(%r, %r, %r, %r)' % (self.short, self.long,
+                                           self.argcount, self.value)
+
+
+class Required(BranchPattern):
+
+    """Branch whose children must all match, in order."""
+
+    def match(self, left, collected=None):
+        collected = [] if collected is None else collected
+        l = left
+        c = collected
+        for pattern in self.children:
+            matched, l, c = pattern.match(l, c)
+            if not matched:
+                # Report failure with the caller's original state.
+                return False, left, collected
+        return True, l, c
+
+
+class Optional(BranchPattern):
+
+    """Branch that always matches; children consume what they can."""
+
+    def match(self, left, collected=None):
+        collected = [] if collected is None else collected
+        for pattern in self.children:
+            # The per-child match flag is deliberately ignored.
+            m, left, collected = pattern.match(left, collected)
+        return True, left, collected
+
+
+class OptionsShortcut(Optional):
+
+    """Marker/placeholder for [options] shortcut.
+
+    It adds no behaviour of its own; matching is inherited from Optional.
+    """
+
+
+class OneOrMore(BranchPattern):
+
+    """Branch that matches its single child one or more times."""
+
+    def match(self, left, collected=None):
+        assert len(self.children) == 1
+        collected = [] if collected is None else collected
+        l = left
+        c = collected
+        # Remaining input after the previous iteration, to detect progress.
+        l_ = None
+        matched = True
+        times = 0
+        while matched:
+            # could it be that something didn't match but changed l or c?
+            matched, l, c = self.children[0].match(l, c)
+            times += 1 if matched else 0
+            # Stop when an iteration consumed nothing.
+            if l_ == l:
+                break
+            l_ = l
+        if times >= 1:
+            return True, l, c
+        return False, left, collected
+
+
+class Either(BranchPattern):
+
+    """Branch that matches when at least one child matches."""
+
+    def match(self, left, collected=None):
+        collected = [] if collected is None else collected
+        outcomes = []
+        # Every child is tried against the same starting state.
+        for pattern in self.children:
+            matched, _, _ = outcome = pattern.match(left, collected)
+            if matched:
+                outcomes.append(outcome)
+        if outcomes:
+            # Prefer the outcome that leaves the fewest unmatched elements.
+            return min(outcomes, key=lambda outcome: len(outcome[1]))
+        return False, left, collected
+
+
+class Tokens(list):
+
+    """List of tokens plus the exception class used to report errors."""
+
+    def __init__(self, source, error=DocoptExit):
+        # Anything with a ``split`` method is split on whitespace; other
+        # sources are used as a sequence of tokens directly.
+        self += source.split() if hasattr(source, 'split') else source
+        self.error = error
+
+    @staticmethod
+    def from_pattern(source):
+        """Tokenise a usage pattern; errors become DocoptLanguageError."""
+        # Surround brackets, parentheses, '|' and '...' with spaces.
+        source = re.sub(r'([\[\]\(\)\|]|\.\.\.)', r' \1 ', source)
+        # Split on whitespace but keep each <...> group as one token.
+        source = [s for s in re.split('\s+|(\S*<.*?>)', source) if s]
+        return Tokens(source, error=DocoptLanguageError)
+
+    def move(self):
+        """Remove and return the first token, or None when empty."""
+        return self.pop(0) if len(self) else None
+
+    def current(self):
+        """Return the first token without removing it, or None when empty."""
+        return self[0] if len(self) else None
+
+
+def parse_long(tokens, options):
+    """long ::= '--' chars [ ( ' ' | '=' ) chars ] ;"""
+    # Consumes the current token (and possibly the next one as the value)
+    # and returns a one-element list holding the resulting Option.
+    long, eq, value = tokens.move().partition('=')
+    assert long.startswith('--')
+    # No '=' and no text after it means that no value was given inline.
+    value = None if eq == value == '' else value
+    similar = [o for o in options if o.long == long]
+    # Prefix matching is attempted only when tokens.error is DocoptExit.
+    if tokens.error is DocoptExit and similar == []:  # if no exact match
+        similar = [o for o in options if o.long and o.long.startswith(long)]
+    if len(similar) > 1:  # might be simply specified ambiguously 2+ times?
+        raise tokens.error('%s is not a unique prefix: %s?' %
+                           (long, ', '.join(o.long for o in similar)))
+    elif len(similar) < 1:
+        # Unknown option: register it in ``options`` (mutated in place).
+        argcount = 1 if eq == '=' else 0
+        o = Option(None, long, argcount)
+        options.append(o)
+        if tokens.error is DocoptExit:
+            o = Option(None, long, argcount, value if argcount else True)
+    else:
+        # Known option: work on a copy so the registered one is untouched.
+        o = Option(similar[0].short, similar[0].long,
+                   similar[0].argcount, similar[0].value)
+        if o.argcount == 0:
+            if value is not None:
+                raise tokens.error('%s must not have an argument' % o.long)
+        else:
+            if value is None:
+                # The value has to come from the next token.
+                if tokens.current() in [None, '--']:
+                    raise tokens.error('%s requires argument' % o.long)
+                value = tokens.move()
+        if tokens.error is DocoptExit:
+            o.value = value if value is not None else True
+    return [o]
+
+
+def parse_shorts(tokens, options):
+    """shorts ::= '-' ( chars )* [ [ ' ' ] chars ] ;"""
+    # Consumes one token that may stack several short options and returns
+    # the list of resulting Option objects.
+    token = tokens.move()
+    assert token.startswith('-') and not token.startswith('--')
+    left = token.lstrip('-')
+    parsed = []
+    while left != '':
+        # Peel off one option letter at a time.
+        short, left = '-' + left[0], left[1:]
+        similar = [o for o in options if o.short == short]
+        if len(similar) > 1:
+            raise tokens.error('%s is specified ambiguously %d times' %
+                               (short, len(similar)))
+        elif len(similar) < 1:
+            # Unknown option: register it in ``options`` (mutated in place).
+            o = Option(short, None, 0)
+            options.append(o)
+            if tokens.error is DocoptExit:
+                o = Option(short, None, 0, True)
+        else:  # why copying is necessary here?
+            o = Option(short, similar[0].long,
+                       similar[0].argcount, similar[0].value)
+            value = None
+            if o.argcount != 0:
+                if left == '':
+                    # Nothing left in this token: use the next token.
+                    if tokens.current() in [None, '--']:
+                        raise tokens.error('%s requires argument' % short)
+                    value = tokens.move()
+                else:
+                    # The rest of the token is the value.
+                    value = left
+                    left = ''
+            if tokens.error is DocoptExit:
+                o.value = value if value is not None else True
+        parsed.append(o)
+    return parsed
+
+
+def parse_pattern(source, options):
+    """Parse a formal usage string into a Required pattern tree."""
+    tokens = Tokens.from_pattern(source)
+    result = parse_expr(tokens, options)
+    # Tokens left over after the expression indicate malformed usage.
+    if tokens.current() is not None:
+        raise tokens.error('unexpected ending: %r' % ' '.join(tokens))
+    return Required(*result)
+
+
+def parse_expr(tokens, options):
+    """expr ::= seq ( '|' seq )* ;"""
+    seq = parse_seq(tokens, options)
+    # Without '|' the expression is just the sequence itself.
+    if tokens.current() != '|':
+        return seq
+    # A multi-element alternative is wrapped in a Required group.
+    result = [Required(*seq)] if len(seq) > 1 else seq
+    while tokens.current() == '|':
+        tokens.move()
+        seq = parse_seq(tokens, options)
+        result += [Required(*seq)] if len(seq) > 1 else seq
+    return [Either(*result)] if len(result) > 1 else result
+
+
+def parse_seq(tokens, options):
+    """seq ::= ( atom [ '...' ] )* ;"""
+    result = []
+    # A sequence ends at the end of input, a closing bracket or a '|'.
+    while tokens.current() not in [None, ']', ')', '|']:
+        atom = parse_atom(tokens, options)
+        if tokens.current() == '...':
+            # A trailing ellipsis makes the preceding atom repeatable.
+            atom = [OneOrMore(*atom)]
+            tokens.move()
+        result += atom
+    return result
+
+
+def parse_atom(tokens, options):
+    """atom ::= '(' expr ')' | '[' expr ']' | 'options'
+             | long | shorts | argument | command ;
+    """
+    token = tokens.current()
+    result = []
+    if token in '([':
+        tokens.move()
+        # Closing token and node class for the kind of group just opened.
+        matching, pattern = {'(': [')', Required], '[': [']', Optional]}[token]
+        result = pattern(*parse_expr(tokens, options))
+        if tokens.move() != matching:
+            raise tokens.error("unmatched '%s'" % token)
+        return [result]
+    elif token == 'options':
+        tokens.move()
+        return [OptionsShortcut()]
+    elif token.startswith('--') and token != '--':
+        return parse_long(tokens, options)
+    elif token.startswith('-') and token not in ('-', '--'):
+        return parse_shorts(tokens, options)
+    # <angle-bracketed> or ALL-CAPS words are positional arguments.
+    elif token.startswith('<') and token.endswith('>') or token.isupper():
+        return [Argument(tokens.move())]
+    else:
+        # Everything else is a literal command word.
+        return [Command(tokens.move())]
+
+
+def parse_argv(tokens, options, options_first=False):
+    """Parse command-line argument vector.
+
+    If options_first:
+        argv ::= [ long | shorts ]* [ argument ]* [ '--' [ argument ]* ] ;
+    else:
+        argv ::= [ long | shorts | argument ]* [ '--' [ argument ]* ] ;
+
+    """
+    parsed = []
+    while tokens.current() is not None:
+        if tokens.current() == '--':
+            # Everything from '--' on (the '--' included) is positional.
+            return parsed + [Argument(None, v) for v in tokens]
+        elif tokens.current().startswith('--'):
+            parsed += parse_long(tokens, options)
+        elif tokens.current().startswith('-') and tokens.current() != '-':
+            parsed += parse_shorts(tokens, options)
+        elif options_first:
+            # First positional seen: the rest is positional as well.
+            return parsed + [Argument(None, v) for v in tokens]
+        else:
+            parsed.append(Argument(None, tokens.move()))
+    return parsed
+
+
+def parse_defaults(doc):
+    """Return the Option objects described in the "options:" sections."""
+    defaults = []
+    for s in parse_section('options:', doc):
+        # FIXME corner case "bla: options: --foo"
+        _, _, s = s.partition(':')  # get rid of "options:"
+        # Split at each line that starts with a dash; the capture group
+        # keeps the option token so it can be glued back to its text.
+        split = re.split('\n[ \t]*(-\S+?)', '\n' + s)[1:]
+        split = [s1 + s2 for s1, s2 in zip(split[::2], split[1::2])]
+        options = [Option.parse(s) for s in split if s.startswith('-')]
+        defaults += options
+    return defaults
+
+
+def parse_section(name, source):
+    """Return every section of ``source`` whose first line contains ``name``.
+
+    A section is that line plus the following lines that start with a space
+    or tab. Matching is case-insensitive and each result is stripped.
+    """
+    # ``name`` is inserted into the pattern as-is, without escaping.
+    pattern = re.compile('^([^\n]*' + name + '[^\n]*\n?(?:[ \t].*?(?:\n|$))*)',
+                         re.IGNORECASE | re.MULTILINE)
+    return [s.strip() for s in pattern.findall(source)]
+
+
+def formal_usage(section):
+    """Turn a usage section into one pattern string of alternatives.
+
+    The first word after the colon is taken as the program name; each
+    later occurrence of it starts a new alternative.
+    """
+    _, _, section = section.partition(':')  # drop "usage:"
+    pu = section.split()
+    return '( ' + ' '.join(') | (' if s == pu[0] else s for s in pu[1:]) + ' )'
+
+
+def extras(help, version, options, doc):
+    """Handle help and version requests; both print and exit the process."""
+    # ``help``: print the doc if -h/--help was given with a truthy value.
+    if help and any((o.name in ('-h', '--help')) and o.value for o in options):
+        print(doc.strip("\n"))
+        sys.exit()
+    # ``version``: print it if --version was given with a truthy value.
+    if version and any(o.name == '--version' and o.value for o in options):
+        print(version)
+        sys.exit()
+
+
+class Dict(dict):
+
+    """dict whose repr lists the items sorted, one per line."""
+
+    def __repr__(self):
+        return '{%s}' % ',\n '.join('%r: %r' % i for i in sorted(self.items()))
+
+
+def docopt(doc, argv=None, help=True, version=None, options_first=False):
+    """Parse `argv` based on command-line interface described in `doc`.
+
+    `docopt` creates your command-line interface based on its
+    description that you pass as `doc`. Such description can contain
+    --options, <positional-argument>, commands, which could be
+    [optional], (required), (mutually | exclusive) or repeated...
+
+    Parameters
+    ----------
+    doc : str
+        Description of your command-line interface.
+    argv : list of str, optional
+        Argument vector to be parsed. sys.argv[1:] is used if not
+        provided.
+    help : bool (default: True)
+        Set to False to disable automatic help on -h or --help
+        options.
+    version : any object
+        If passed, the object will be printed if --version is in
+        `argv`.
+    options_first : bool (default: False)
+        Set to True to require options precede positional arguments,
+        i.e. to forbid options and positional arguments intermix.
+
+    Returns
+    -------
+    args : dict
+        A dictionary, where keys are names of command-line elements
+        such as e.g. "--verbose" and "<path>", and values are the
+        parsed values of those elements.
+
+    Raises
+    ------
+    DocoptLanguageError
+        If `doc` has no "usage:" section or more than one.
+    DocoptExit
+        If `argv` does not match the usage pattern.
+
+    Example
+    -------
+    >>> from docopt import docopt
+    >>> doc = '''
+    ... Usage:
+    ...     my_program tcp <host> <port> [--timeout=<seconds>]
+    ...     my_program serial <port> [--baud=<n>] [--timeout=<seconds>]
+    ...     my_program (-h | --help | --version)
+    ...
+    ... Options:
+    ...     -h, --help  Show this screen and exit.
+    ...     --baud=<n>  Baudrate [default: 9600]
+    ... '''
+    >>> argv = ['tcp', '127.0.0.1', '80', '--timeout', '30']
+    >>> docopt(doc, argv)
+    {'--baud': '9600',
+     '--help': False,
+     '--timeout': '30',
+     '--version': False,
+     '<host>': '127.0.0.1',
+     '<port>': '80',
+     'serial': False,
+     'tcp': True}
+
+    See also
+    --------
+    * For video introduction see http://docopt.org
+    * Full documentation is available in README.rst as well as online
+      at https://github.com/docopt/docopt#readme
+
+    """
+    argv = sys.argv[1:] if argv is None else argv
+
+    usage_sections = parse_section('usage:', doc)
+    if len(usage_sections) == 0:
+        raise DocoptLanguageError('"usage:" (case-insensitive) not found.')
+    if len(usage_sections) > 1:
+        raise DocoptLanguageError('More than one "usage:" (case-insensitive).')
+    # Stored on the class so that every DocoptExit shows this usage text.
+    DocoptExit.usage = usage_sections[0]
+
+    options = parse_defaults(doc)
+    pattern = parse_pattern(formal_usage(DocoptExit.usage), options)
+    # [default] syntax for argument is disabled
+    # for a in pattern.flat(Argument):
+    #    same_name = [d for d in arguments if d.name == a.name]
+    #    if same_name:
+    #        a.value = same_name[0].value
+    # argv is parsed against a copy of ``options`` so that options
+    # registered while reading argv do not end up in the original list.
+    argv = parse_argv(Tokens(argv), list(options), options_first)
+    pattern_options = set(pattern.flat(Option))
+    for options_shortcut in pattern.flat(OptionsShortcut):
+        # [options] stands for every documented option that is not
+        # already mentioned in the usage pattern.
+        doc_options = parse_defaults(doc)
+        options_shortcut.children = list(set(doc_options) - pattern_options)
+        # if any_options:
+        #    options_shortcut.children += [Option(o.short, o.long, o.argcount)
+        #                    for o in argv if type(o) is Option]
+    extras(help, version, argv, doc)
+    matched, left, collected = pattern.fix().match(argv)
+    if matched and left == []:  # better error message if left?
+        # Pattern leaves supply the defaults; collected elements come
+        # later in the sequence and therefore override them.
+        return Dict((a.name, a.value) for a in (pattern.flat() + collected))
+    raise DocoptExit()
--- a/src/misc/elts_abrev.dat
+++ b/src/misc/elts_abrev.dat
@ -0,0 +1,118 @@
+1 - H - Hydrogen
+2 - He - Helium
+3 - Li - Lithium
+4 - Be - Beryllium
+5 - B - Boron
+6 - C - Carbon
+7 - N - Nitrogen
+8 - O - Oxygen
+9 - F - Fluorine
+10 - Ne - Neon
+11 - Na - Sodium
+12 - Mg - Magnesium
+13 - Al - Aluminum
+14 - Si - Silicon
+15 - P - Phosphorus
+16 - S - Sulfur
+17 - Cl - Chlorine
+18 - Ar - Argon
+19 - K - Potassium
+20 - Ca - Calcium
+21 - Sc - Scandium
+22 - Ti - Titanium
+23 - V - Vanadium
+24 - Cr - Chromium
+25 - Mn - Manganese
+26 - Fe - Iron
+27 - Co - Cobalt
+28 - Ni - Nickel
+29 - Cu - Copper
+30 - Zn - Zinc
+31 - Ga - Gallium
+32 - Ge - Germanium
+33 - As - Arsenic
+34 - Se - Selenium
+35 - Br - Bromine
+36 - Kr - Krypton
+37 - Rb - Rubidium
+38 - Sr - Strontium
+39 - Y - Yttrium
+40 - Zr - Zirconium
+41 - Nb - Niobium
+42 - Mo - Molybdenum
+43 - Tc - Technetium
+44 - Ru - Ruthenium
+45 - Rh - Rhodium
+46 - Pd - Palladium
+47 - Ag - Silver
+48 - Cd - Cadmium
+49 - In - Indium
+50 - Sn - Tin
+51 - Sb - Antimony
+52 - Te - Tellurium
+53 - I - Iodine
+54 - Xe - Xenon
+55 - Cs - Cesium
+56 - Ba - Barium
+57 - La - Lanthanum
+58 - Ce - Cerium
+59 - Pr - Praseodymium
+60 - Nd - Neodymium
+61 - Pm - Promethium
+62 - Sm - Samarium
+63 - Eu - Europium
+64 - Gd - Gadolinium
+65 - Tb - Terbium
+66 - Dy - Dysprosium
+67 - Ho - Holmium
+68 - Er - Erbium
+69 - Tm - Thulium
+70 - Yb - Ytterbium
+71 - Lu - Lutetium
+72 - Hf - Hafnium
+73 - Ta - Tantalum
+74 - W - Tungsten
+75 - Re - Rhenium
+76 - Os - Osmium
+77 - Ir - Iridium
+78 - Pt - Platinum
+79 - Au - Gold
+80 - Hg - Mercury
+81 - Tl - Thallium
+82 - Pb - Lead
+83 - Bi - Bismuth
+84 - Po - Polonium
+85 - At - Astatine
+86 - Rn - Radon
+87 - Fr - Francium
+88 - Ra - Radium
+89 - Ac - Actinium
+90 - Th - Thorium
+91 - Pa - Protactinium
+92 - U - Uranium
+93 - Np - Neptunium
+94 - Pu - Plutonium
+95 - Am - Americium
+96 - Cm - Curium
+97 - Bk - Berkelium
+98 - Cf - Californium
+99 - Es - Einsteinium
+100 - Fm - Fermium
+101 - Md - Mendelevium
+102 - No - Nobelium
+103 - Lr - Lawrencium
+104 - Rf - Rutherfordium
+105 - Db - Dubnium
+106 - Sg - Seaborgium
+107 - Bh - Bohrium
+108 - Hs - Hassium
+109 - Mt - Meitnerium
+110 - Ds - Darmstadtium
+111 - Rg - Roentgenium
+112 - Cn - Copernicium
+113 - Uut - Ununtrium
+114 - Fl - Flerovium
+115 - Uup - Ununpentium
+116 - Lv - Livermorium
+117 - Uus - Ununseptium
+118 - Uuo - Ununoctium
--- a/src/parser/init.py
+++ b/src/parser/init.py
--- a/src/parser/check_validity.py
+++ b/src/parser/check_validity.py
@ -0,0 +1,52 @@
+#  _
+# /  |_   _   _ |        _. | o  _| o _|_
+# \_ | | (/_ (_ |<   \/ (_| | | (_| |  |_ \/
+#                                         /
+# Do this after the L special-case treatment.
+
+import sys
+
+
+def check_gamess(str_type):
+    """Check whether the orbital type is handled by GAMESS-US.
+
+    Returns True for every one-character type.
+    """
+
+    # NOTE(review): this assert restricts str_type to a single character,
+    # so the "SP" branch below cannot be reached; a two-character "SP"
+    # fails the assert (AssertionError) instead of raising BaseException.
+    assert len(str_type) == 1
+
+    if str_type in "S P D".split():
+        return True
+    elif str_type == "SP":
+        raise BaseException
+    else:
+        return True
+
+
+def check_NWChem(str_type):
+    """Check whether the orbital type is handled by NWChem.
+
+    Returns True for S, P, D and for any other single character that does
+    not sort after "I"; raises BaseException otherwise.
+    """
+
+    assert len(str_type) == 1
+
+    if str_type in "S P D".split():
+        return True
+    # The first test is a plain string comparison. "K", "L" and "M" all
+    # compare greater than "I", so the second test never adds a case.
+    elif str_type > "I" or str_type in "K L M".split():
+        raise BaseException
+    else:
+        return True
+
+
+# Maps a program name to the function that validates an orbital type for it.
+d_check = {"GAMESS-US": check_gamess,
+           "NWChem": check_NWChem}
+
+
+def get_check_function(name_program):
+    """
+    Return the check function registered in d_check for name_program.
+
+    If the program is unknown, a message listing the available programs is
+    written to stderr and the process exits with status 1.
+    """
+    try:
+        f = d_check[name_program]
+    except KeyError:
+        str_ = "You need to add a check funtion for your program {0}"
+        print >> sys.stderr, str_.format(name_program)
+        print >> sys.stderr, "This one are avalaible {0}".format(d_check.keys())
+        sys.exit(1)
+    return f
--- a/src/parser/gamess_us.py
+++ b/src/parser/gamess_us.py
@ -0,0 +1,138 @@
+#  __
+# /__  _. ._ _   _   _  _        _
+# \_| (_| | | | (/_ _> _>   |_| _>
+#
+
+from src.parser_handler import get_dict_ele
+import re
+
+
+def parse_basis_data_gamess_us(data, name, des, elts, debug=False):
+    """Parse the basis data raw html of gamess-us to get a nice tuple
+       Return (name, description, [[ele, data_ele],...])"""
+    basis_data = []
+
+    b = data.find("$DATA")
+    e = data.find("$END")
+    if (b == -1 or data.find("$DATA$END") != -1):
+        if debug:
+            print data
+        raise Exception("WARNING not DATA")
+    else:
+        dict_replace = {"PHOSPHOROUS": "PHOSPHORUS",
+                        "D+": "E+",
+                        "D-": "E-"}
+
+        for k, v in dict_replace.iteritems():
+            data = data.replace(k, v)
+
+        data = data[b + 5:e - 1].split('\n\n')
+
+        dict_ele = get_dict_ele()
+
+        for (elt, data_elt) in zip(elts, data):
+
+            elt_long_th = dict_ele[elt.lower()]
+            elt_long_exp = data_elt.split()[0].lower()
+
+            if "$" in data_elt:
+                if debug:
+                    print "Eror",
+                raise Exception("WARNING bad split")
+
+            if elt_long_th == elt_long_exp:
+                basis_data.append([elt, data_elt.strip()])
+            else:
+                if debug:
+                    print "th", elt_long_th
+                    print "exp", elt_long_exp
+                    print "abv", elt
+                raise Exception("WARNING not a good ELEMENT")
+
+    return (name, des, basis_data)
+
+
+# Shell header line: one word character at the start of the line, then
+# whitespace and an integer; the character is captured as group 1.
+symmetry_regex = re.compile(ur'^(\w)\s+\d+\b')
+
+
+def l_symmetry_gamess_us(atom_basis):
+    """
+    Return the begin and the end of all the type of orbital
+    input: atom_basis = [name, S 1, 12 0.12 12212, ...]
+    output: [ [type, begin, end], ...]
+    """
+    # Example
+    # [[u'S', 1, 5], [u'L', 5, 9], [u'L', 9, 12], [u'D', 16, 18]]"
+
+    l = []
+    for i, line in enumerate(atom_basis):
+        # Optimisation for not seaching all the time
+        if len(line) < 10:
+            m = re.search(symmetry_regex, line)
+            if m:
+                # Cause of L !
+                read_symmetry = m.group(1)
+
+                # L is real L or special SP
+                # Just check the number of exponant
+                if all([read_symmetry == "L",
+                        len(atom_basis[i + 1].split()) == 4]):
+                    real_symmetry = "SP"
+                else:
+                    real_symmetry = read_symmetry
+
+                l.append([real_symmetry, i])
+                try:
+                    l[-2].append(i)
+                except IndexError:
+                    pass
+
+    l[-1].append(i + 1)
+    return l
+
+
+def handle_l_gamess_us(l_atom_basis):
+    """
+    Read l_atom_basis and change the SP in L and P
+    """
+
+    l_data = []
+    for atom_basis in l_atom_basis:
+
+        # Split the data in line
+        l_line_raw = atom_basis.split("\n")
+        l_line = [l_line_raw[0]]
+        # l_line_raw[0] containt the name of the Atom
+
+        for symmetry, begin, end in l_symmetry_gamess_us(l_line_raw):
+
+            if symmetry == "SP":
+
+                body_s = []
+                body_p = []
+
+                for i_l in l_line_raw[begin + 1:end]:
+
+                    # one L =>  S & P
+                    a = i_l.split()
+
+                    common = "{:>3}".format(a[0])
+                    common += "{:>15.7f}".format(float(a[1]))
+
+                    tail_s = common + "{:>23.7f}".format(float(a[2]))
+                    body_s.append(tail_s)
+
+                    tail_p = common + "{:>23.7f}".format(float(a[3]))
+                    body_p.append(tail_p)
+
+                l_line += [l_line_raw[begin].replace("L", "S")]
+                l_line += body_s
+
+                l_line += [l_line_raw[begin].replace("L", "P")]
+                l_line += body_p
+            else:
+                l_line += l_line_raw[begin:end]
+
+        l_data.append("\n".join(l_line))
+
+    return l_data
--- a/src/parser/gaussian94.py
+++ b/src/parser/gaussian94.py
@ -0,0 +1,83 @@
+#  __                            _
+# /__  _.      _  _ o  _. ._    (_| |_|_
+# \_| (_| |_| _> _> | (_| | |     |   |
+#
+import sys
+
+
+def parse_basis_data_gaussian94(data, name, description, elements, debug=True):
+    """Parse the Gaussian94 basis data raw html to get a nice tuple.
+
+    The data-pairs item is actually expected to be a 2 item list:
+    [symbol, data]
+
+    e.g. ["Ca", "#BASIS SET..."]
+
+    N.B.: Currently ignores ECP data!
+
+    @param data: raw HTML from BSE
+    @type data : unicode
+    @param name: basis set name
+    @type name : str
+    @param des: basis set description
+    @type des : str
+    @param elements: element symbols e.g. ['H', 'C', 'N', 'O', 'Cl']
+    @type elements : list
+    @return: (name, description, data-pairs)
+    @rtype : tuple
+    """
+
+    # Each basis set block starts and ends with ****. Find the region
+    # containing all the basis blocks using the first and last ****.
+    mark = "****"
+    begin = data.find(mark)
+    end = data.rfind(mark)
+
+    if begin == -1 or end == -1:
+        if debug:
+            print(data)
+        str_ = " No basis set data found while attempting to process {0} ({1})"
+        raise ValueError(str_.format(name, description))
+
+    trimmed = data[begin + len(mark): end - len(mark)].strip()
+    chunks = []
+    lines = []
+
+    # group lines of data delimited by mark into per-element chunks
+    for line in trimmed.split("\n"):
+        if line.startswith(mark):
+            if lines:
+                chunks.append(lines)
+            lines = [line]
+        else:
+            lines.append(line)
+
+    # handle trailing chunk that is not followed by another basis set block
+    # also remove the marker lines from the chunk itself
+    if lines and (not chunks or lines != chunks[-1]):
+        chunks.append(lines)
+
+    # join lines back into solid text blocks
+    chunks = ["\n".join([L for L in c if mark not in L]) for c in chunks]
+
+    # check each block for element and assign symbols to final pairs
+    pairs = []
+    unused_elements = set([e.upper() for e in elements])
+    for chunk in chunks:
+        # get first 3 chars of first line in block
+        symbol = chunk.split("\n")[0][:3].strip()
+        try:
+            unused_elements.remove(symbol.upper())
+        except KeyError:
+            if debug:
+                msg = "Warning: already processed {0}\n".format(symbol)
+                sys.stderr.write(msg)
+        pairs.append([symbol, chunk])
+
+    if unused_elements:
+        msg = "Warning: elements {0} left over for {1}".format(
+            list(unused_elements),
+            name)
+        print(msg)
+
+    return (name, description, pairs)
--- a/src/parser/nwchem.py
+++ b/src/parser/nwchem.py
@ -0,0 +1,228 @@
+#            _
+# |\ |      /  |_   _  ._ _
+# | \| \/\/ \_ | | (/_ | | |
+#
+import json
+
+
+def extract_basis_nwchem(data, name):
+    """Extract atomic orbital, charge density fitting, or exchange
+    correlation functional basis data from a text region passed in as
+    data. The charge density fitting and exchange correlation functional
+    basis set data are employed for density functional calculations.
+
+    The section is the text between a ``BASIS "<name>" PRINT`` header
+    (matched case-insensitively) and the next ``END``. Inside it, every
+    line starting with ``#BASIS SET`` opens a new per-element chunk.
+
+    @param data: text region containing basis set data
+    @type data : str
+    @param name: name of basis type: "ao basis", "cd basis", or "xc basis"
+    @type name : str
+    @return: per-element basis set chunks; empty when the section is
+             missing or contains no text
+    @rtype : list
+    """
+
+    begin_marker = """BASIS "{0}" PRINT""".format(name)
+    end_marker = "END"
+
+    # Search an upper-cased copy because the original data has
+    # inconsistent capitalization; slice the untouched text afterwards.
+    upper_data = data.upper()
+    begin = upper_data.find(begin_marker.upper())
+
+    # No basis data found
+    if begin == -1:
+        return []
+
+    body_start = begin + len(begin_marker)
+    end = upper_data.find(end_marker, body_start)
+    if end == -1:
+        # Unterminated section: keep everything after the header
+        end = len(data)
+
+    # find() returns the index of the first character of the end marker,
+    # so slicing up to "end" keeps the complete section. Subtracting the
+    # marker length here would cut off the last characters of the data.
+    trimmed = data[body_start:end].strip()
+    if not trimmed:
+        return []
+
+    chunks = []
+    lines = []
+
+    # group lines of data delimited by #BASIS SET... into per-element chunks
+    for line in trimmed.split("\n"):
+        if line.upper().startswith("#BASIS SET"):
+            if lines:
+                chunks.append(lines)
+            lines = [line]
+        else:
+            lines.append(line)
+
+    # handle trailing chunk that is not followed by another #BASIS SET...
+    # (a trailing chunk equal to the previous one is not added again)
+    if lines and (not chunks or lines != chunks[-1]):
+        chunks.append(lines)
+
+    # join lines back into solid text blocks
+    return ["\n".join(c) for c in chunks]
+
+
+def extract_ecp_nwchem(data):
+    """Extract the effective core potential basis data from a text region
+    passed in as data.
+
+    The ECP region is the text between the first ``ECP`` line (matched
+    case-insensitively) and the next ``END``. Inside it, every line
+    containing `` nelec `` opens a new per-element chunk.
+
+    @param data: text region containing ECP data
+    @type data : str
+    @return: per-element effective core potential chunks; empty when
+             either marker is missing or the region contains no text
+    @rtype : list
+    """
+
+    ecp_begin_mark = "ECP\n"
+    ecp_end_mark = "END"
+
+    # Search an upper-cased copy; slice the untouched text afterwards.
+    upper_data = data.upper()
+    ecp_begin = upper_data.find(ecp_begin_mark)
+    ecp_end = upper_data.find(ecp_end_mark, ecp_begin)
+
+    # No ECP data, so return empty list
+    if ecp_begin == -1 or ecp_end == -1:
+        return []
+
+    # find() returns the index of the first character of the end marker,
+    # so slicing up to "ecp_end" keeps the complete region. Subtracting
+    # the marker length here would cut off the last characters of the data.
+    ecp_region = data[ecp_begin + len(ecp_begin_mark):ecp_end].strip()
+    if not ecp_region:
+        return []
+
+    chunks = []
+    lines = []
+
+    # group lines of data delimited by XX nelec YY into chunks, e.g.
+    # "Zn nelec 18" begins a zinc ECP
+    for line in ecp_region.split("\n"):
+        if " nelec " in line.lower():
+            if lines:
+                chunks.append(lines)
+            lines = [line]
+        else:
+            lines.append(line)
+
+    # handle trailing chunk that is not followed by another XX nelec YY..
+    # (a trailing chunk equal to the previous one is not added again)
+    if lines and (not chunks or lines != chunks[-1]):
+        chunks.append(lines)
+
+    # join lines back into solid text blocks
+    return ["\n".join(c) for c in chunks]
+
+
+def unpack_nwchem_basis_block(data):
+    """Decode a serialized NWChem basis data block.
+
+    This is the inverse of the JSON serialization applied to each
+    per-element block: the text is handed to json.loads unchanged.
+
+    @param data: a JSON of basis set data, perhaps containing many types
+    @type data : str
+    @return: unpacked data
+    @rtype : dict (when the serialized block is a JSON object)
+    """
+
+    return json.loads(data)
+
+
+def parse_basis_data_nwchem(data, name, description, elements, debug=True):
+    """Parse the NWChem basis data raw html to get a nice list.
+
+    The data-pairs item is actually expected to be a 2 item list:
+    [symbol, data]
+
+    e.g. ["Ca", "{...JSON...}"]
+
+    where data is a JSON object mapping each section present for that
+    element ("ao basis", "cd basis", "xc basis", "ecp") to its text chunk.
+    Pairs are ordered by the first appearance of each element symbol,
+    scanning the sections in the order listed above.
+
+    @param data: raw HTML from BSE
+    @type data : unicode
+    @param name: basis set name
+    @type name : str
+    @param description: basis set description
+    @type description : str
+    @param elements: element symbols e.g. ['H', 'C', 'N', 'O', 'Cl']
+    @type elements : list
+    @param debug: not used by this parser; kept in the signature
+    @type debug : bool
+    @return: [name, description, data-pairs]
+    @rtype : list
+    @raise ValueError: if no section is found, or a chunk contains no
+                       element symbol
+    """
+
+    def extract_symbol(txt):
+        # The symbol is the first token within the first three characters
+        # of the first non-comment line that has one.
+        for sline in txt.split("\n"):
+            if sline.startswith("#"):
+                continue
+            fields = sline[:3].split()
+            if fields:
+                return fields[0]
+
+        raise ValueError("Can't find element symbol in {0}".format(txt))
+
+    groups = [("ao basis", extract_basis_nwchem(data, "ao basis")),
+              ("cd basis", extract_basis_nwchem(data, "cd basis")),
+              ("xc basis", extract_basis_nwchem(data, "xc basis")),
+              ("ecp", extract_ecp_nwchem(data))]
+
+    if not any(cgroup for _, cgroup in groups):
+        str_ = "No basis set data found while attempting to process {0} ({1})"
+        raise ValueError(str_.format(name, description))
+
+    unused_elements = set(e.upper() for e in elements)
+
+    # Collect every section of an element into one dict, remembering the
+    # order in which symbols are first seen. Elements are tagged as used
+    # whether they come from an ordinary basis section or the ECP section.
+    packed = {}
+    order = []
+    for gname, cgroup in groups:
+        for chunk in cgroup:
+            symbol = extract_symbol(chunk)
+            unused_elements.discard(symbol.upper())
+
+            # Create fresh entry, e.g. add Na with initial ao basis
+            if symbol not in packed:
+                packed[symbol] = {}
+                order.append(symbol)
+
+            # Expand entry, e.g. add ecp data for Na after it has ao basis
+            packed[symbol][gname] = chunk
+
+    if unused_elements:
+        msg = "Warning: elements {0} left over for {1}"
+        # Call form of print: valid on both Python 2 and Python 3
+        print(msg.format(list(unused_elements), name))
+
+    # Assign (Symbol, Serialized) to final pairs
+    pairs = [[symbol, json.dumps(packed[symbol])] for symbol in order]
+    return [name, description, pairs]