Source code for amp

import os
import sys
import shutil
import numpy as np
import tempfile
import platform
from getpass import getuser
from socket import gethostname
import subprocess
import warnings

import ase
from ase.calculators.calculator import Calculator, Parameters
try:
    from ase import __version__ as aseversion
except ImportError:
    # We're on ASE 3.9 or older
    from ase.version import version as aseversion

from .utilities import (make_filename, hash_images, Logger, string2dict,
                        logo, now, assign_cores, TrainingConvergenceError)

try:
    from amp import fmodules
except ImportError:
    warnings.warn('Did not find fortran modules.')
else:
    fmodules_version = 9
    wrong_version = fmodules.check_version(version=fmodules_version)
    if wrong_version:
        raise RuntimeError('fortran modules are not updated. Recompile'
                           'with f2py as described in the README. '
                           'Correct version is %i.' % fmodules_version)

_ampversion = '0.5'


[docs]class Amp(Calculator, object):

    """Atomistic Machine-Learning Potential (Amp) ASE calculator

    Parameters
    ----------
    descriptor : object
        Class representing local atomic environment.
    model : object
        Class representing the regression model. Can be only NeuralNetwork for
        now. Input arguments for NeuralNetwork are hiddenlayers, activation,
        weights, and scalings; for more information see docstring for the class
        NeuralNetwork.
    label : str
        Default prefix/location used for all files.
    dblabel : str
        Optional separate prefix/location for database files, including
        fingerprints, fingerprint derivatives, and neighborlists. This file
        location can be shared between calculator instances to avoid
        re-calculating redundant information. If not supplied, just uses the
        value from label.
    cores : int
        Can specify cores to use for parallel training; if None, will determine
        from environment
    envcommand : string
        For parallel processing across nodes, a command can be supplied
        here to load the appropriate environment before starting workers.
    logging : boolean
        Option to turn off logging; e.g., to speed up force calls.
    atoms : object
        ASE atoms objects with positions, symbols, energy, and forces in ASE
        format.
    """
    implemented_properties = ['energy', 'forces']

    def __init__(self, descriptor, model, label='amp', dblabel=None,
                 cores=None, envcommand=None, logging=True, atoms=None):

        self.logging = logging
        Calculator.__init__(self, label=label, atoms=atoms)
        # Note self._log is set and self._printheader is called by above
        # call when it runs self.set_label.

        self._parallel = {'envcommand': envcommand}

        # Note the following are properties: these are setter functions.
        self.descriptor = descriptor
        self.model = model
        self.cores = cores  # Note this calls 'assign_cores'.

        self.dblabel = label if dblabel is None else dblabel

    @property
    def cores(self):
        """
        Get or set the cores for the parallel environment.

        Parameters
        ----------
        cores : int or dictionary
            Parallel configuration. If cores is an integer, parallelizes over
            this many processes on machine localhost. cores can also be
            a dictionary of the type {'node324': 16, 'node325': 16}. If not
            specified, tries to determine from environment, using
            amp.utilities.assign_cores.
        """
        return self._parallel['cores']

    @cores.setter
    def cores(self, cores):
        self._parallel['cores'] = assign_cores(cores, log=self._log)

    @property
    def descriptor(self):
        """
        Get or set the atomic descriptor.

        Parameters
        ----------
        descriptor : object
            Class instance representing the local atomic environment.
        """
        return self._descriptor

    @descriptor.setter
    def descriptor(self, descriptor):
        descriptor.parent = self  # gives the descriptor object a reference to
        # the main Amp instance. Then descriptor can pull parameters directly
        # from Amp without needing them to be passed in each method call.
        self._descriptor = descriptor
        self.reset()  # Clears any old calculations.

    @property
    def model(self):
        """
        Get or set the machine-learning model.

        Parameters
        ----------
        model : object
            Class instance representing the regression model.
        """
        return self._model

    @model.setter
    def model(self, model):
        model.parent = self  # gives the model object a reference to the main
        # Amp instance. Then model can pull parameters directly from Amp
        # without needing them to be passed in each method call.
        self._model = model
        self.reset()  # Clears any old calculations.

    @classmethod
[docs]    def load(Cls, file, Descriptor=None, Model=None, **kwargs):
        """Attempts to load calculators and return a new instance of Amp.

        Only a filename or file-like object is required, in typical cases.

        If using a home-rolled descriptor or model, also supply uninstantiated
        classes to those models, as in Model=MyModel.  (Not as
        Model=MyModel()!)

        Any additional keyword arguments (such as label or dblabel) can be
        fed through to Amp.

        Parameters
        ----------
        file : str
            Name of the file to load data from.
        Descriptor : object
            Class representing local atomic environment.
        Model : object
            Class representing the regression model.
        """
        if hasattr(file, 'read'):
            text = file.read()
        else:
            with open(file) as f:
                text = f.read()

        # Unpack parameter dictionaries.
        p = string2dict(text)
        for key in ['descriptor', 'model']:
            p[key] = string2dict(p[key])

        # If modules are not specified, find them.
        if Descriptor is None:
            Descriptor = importhelper(p['descriptor'].pop('importname'))
        if Model is None:
            Model = importhelper(p['model'].pop('importname'))
        # Key 'importname' and the value removed so that it is not splatted
        # into the keyword arguments used to instantiate in the next line.

        # Instantiate the descriptor and model.
        descriptor = Descriptor(**p['descriptor'])
        # ** sends all the key-value pairs at once.
        model = Model(**p['model'])

        # Instantiate Amp.
        calc = Cls(descriptor=descriptor, model=model, **kwargs)
        calc._log('Loaded file: %s' % file)
        return calc

[docs]    def set(self, **kwargs):
        """Function to set parameters.

        For now, this doesn't do anything as all parameters are within the
        model and descriptor.
        """
        changed_parameters = Calculator.set(self, **kwargs)
        if len(changed_parameters) > 0:
            self.reset()

[docs]    def set_label(self, label):
        """Sets label, ensuring that any needed directories are made.

        Parameters
        ----------
        label : str
            Default prefix/location used for all files.
        """
        Calculator.set_label(self, label)

        # Create directories for output structure if needed.
        # Note ASE doesn't do this for us.
        if self.label:
            if (self.directory != os.curdir and
                    not os.path.isdir(self.directory)):
                os.makedirs(self.directory)

        if self.logging is True:
            self._log = Logger(make_filename(self.label, '-log.txt'))
        else:
            self._log = Logger(None)

        self._printheader(self._log)

[docs]    def calculate(self, atoms, properties, system_changes):
        """Calculation of the energy of system and forces of all atoms.

        """
        # The inherited method below just sets the atoms object,
        # if specified, to self.atoms.
        Calculator.calculate(self, atoms, properties, system_changes)

        log = self._log
        log('Calculation requested.')

        images = hash_images([self.atoms])
        key = images.keys()[0]

        if properties == ['energy']:
            log('Calculating potential energy...', tic='pot-energy')
            self.descriptor.calculate_fingerprints(images=images,
                                                   log=log,
                                                   calculate_derivatives=False)
            energy = self.model.calculate_energy(
                self.descriptor.fingerprints[key])
            self.results['energy'] = energy
            log('...potential energy calculated.', toc='pot-energy')

        if properties == ['forces']:
            log('Calculating forces...', tic='forces')
            self.descriptor.calculate_fingerprints(images=images,
                                                   log=log,
                                                   calculate_derivatives=True)
            forces = \
                self.model.calculate_forces(
                    self.descriptor.fingerprints[key],
                    self.descriptor.fingerprintprimes[key])
            self.results['forces'] = forces
            log('...forces calculated.', toc='forces')

[docs]    def train(self,
              images,
              overwrite=False,
              ):
        """Fits the model to the training images.

        Parameters
        ----------
        images : list or str
            List of ASE atoms objects with positions, symbols, energies, and
            forces in ASE format. This is the training set of data. This can
            also be the path to an ASE trajectory (.traj) or database (.db)
            file. Energies can be obtained from any reference, e.g. DFT
            calculations.
        overwrite : bool
            If an output file with the same name exists, overwrite it.
        """

        log = self._log
        log('\nAmp training started. ' + now() + '\n')
        log('Descriptor: %s\n  (%s)' % (self.descriptor.__class__.__name__,
                                        self.descriptor))
        log('Model: %s\n  (%s)' % (self.model.__class__.__name__, self.model))

        images = hash_images(images, log=log)

        log('\nDescriptor\n==========')
        train_forces = self.model.forcetraining
        # Derivatives of fingerprints need to be calculated if train_forces is
        # True.
        calculate_derivatives = train_forces
        self.descriptor.calculate_fingerprints(
                images=images,
                parallel=self._parallel,
                log=log,
                calculate_derivatives=calculate_derivatives)

        log('\nModel fitting\n=============')
        result = self.model.fit(trainingimages=images,
                                descriptor=self.descriptor,
                                log=log,
                                parallel=self._parallel)

        if result is True:
            log('Amp successfully trained. Saving current parameters.')
            filename = self.label + '.amp'
        else:
            log('Amp not trained successfully. Saving current parameters.')
            filename = make_filename(self.label, '-untrained-parameters.amp')
        filename = self.save(filename, overwrite)
        log('Parameters saved in file "%s".' % filename)
        log("This file can be opened with `calc = Amp.load('%s')`" %
            filename)
        if result is False:
            raise TrainingConvergenceError('Amp did not converge upon '
                                           'training. See log file for'
                                           ' more information.')

[docs]    def save(self, filename, overwrite=False):
        """Saves the calculator in a way that it can be re-opened with
        load.

        Parameters
        ----------
        filename : str
            File object or path to the file to write to.
        overwrite : bool
            If an output file with the same name exists, overwrite it.
        """
        if os.path.exists(filename):
            if overwrite is False:
                oldfilename = filename
                filename = tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                       suffix='.amp').name
                self._log('File "%s" exists. Instead saving to "%s".' %
                          (oldfilename, filename))
            else:
                oldfilename = tempfile.NamedTemporaryFile(mode='w',
                                                          delete=False,
                                                          suffix='.amp').name

                self._log('Overwriting file: "%s". Moving original to "%s".'
                          % (filename, oldfilename))
                shutil.move(filename, oldfilename)
        descriptor = self.descriptor.tostring()
        model = self.model.tostring()
        p = Parameters({'descriptor': descriptor,
                        'model': model})
        p.write(filename)
        return filename

    def _printheader(self, log):
        """Prints header to log file; inspired by that in GPAW.
        """
        log(logo)
        log('Amp: Atomistic Machine-learning Package')
        log('Developed by Andrew Peterson, Alireza Khorshidi, and others,')
        log('Brown University.')
        log('PI Website: http://brown.edu/go/catalyst')
        log('Official repository: http://bitbucket.org/andrewpeterson/amp')
        log('Official documentation: http://amp.readthedocs.org/')
        log('Citation:')
        log('  Alireza Khorshidi & Andrew A. Peterson,')
        log('  Computer Physics Communications 207: 310-324 (2016).')
        log('  http://doi.org/10.1016/j.cpc.2016.05.010')
        log('=' * 70)
        log('User: %s' % getuser())
        log('Hostname: %s' % gethostname())
        log('Date: %s' % now(with_utc=True))
        uname = platform.uname()
        log('Architecture: %s' % uname[4])
        log('PID: %s' % os.getpid())
        log('Amp version: %s' % _ampversion)
        ampdirectory = os.path.dirname(os.path.abspath(__file__))
        log('Amp directory: %s' % ampdirectory)
        commithash, commitdate = get_git_commit(ampdirectory)
        log(' Last commit: %s' % commithash)
        log(' Last commit date: %s' % commitdate)
        log('Python: v{0}.{1}.{2}: %s'.format(*sys.version_info[:3]) %
            sys.executable)
        log('ASE v%s: %s' % (aseversion, os.path.dirname(ase.__file__)))
        log('NumPy v%s: %s' %
            (np.version.version, os.path.dirname(np.__file__)))
        # SciPy is not a strict dependency.
        try:
            import scipy
            log('SciPy v%s: %s' %
                (scipy.version.version, os.path.dirname(scipy.__file__)))
        except ImportError:
            log('SciPy: not available')
        # ZMQ an pxssh are only necessary for parallel calculations.
        try:
            import zmq
            log('ZMQ/PyZMQ v%s/v%s: %s' %
                (zmq.zmq_version(), zmq.pyzmq_version(),
                 os.path.dirname(zmq.__file__)))
        except ImportError:
            log('ZMQ: not available')
        try:
            import pxssh
            log('pxssh: %s' % os.path.dirname(pxssh.__file__))
        except ImportError:
            log('pxssh: Not available from pxssh.')
            try:
                from pexpect import pxssh
            except ImportError:
                log('pxssh: Not available from pexpect.')
            else:
                import pexpect
                log('pxssh (via pexpect v%s): %s' %
                    (pexpect.__version__, pxssh.__file__))
        log('=' * 70)


[docs]def importhelper(importname):
    """Manually compiled list of available modules.

    This is to prevent the execution of arbitrary (potentially malicious) code.

    However, since there is an `eval` statement in string2dict maybe this
    is silly.
    """
    if importname == '.descriptor.gaussian.Gaussian':
        from .descriptor.gaussian import Gaussian as Module
    elif importname == '.descriptor.zernike.Zernike':
        from .descriptor.zernike import Zernike as Module
    elif importname == '.descriptor.bispectrum.Bispectrum':
        from .descriptor.bispectrum import Bispectrum as Module
    elif importname == '.model.neuralnetwork.NeuralNetwork':
        from .model.neuralnetwork import NeuralNetwork as Module
    elif importname == '.model.neuralnetwork.tflow':
        from .model.tflow import NeuralNetwork as Module
    elif importname == '.model.LossFunction':
        from .model import LossFunction as Module
    else:
        raise NotImplementedError(
            'Attempt to import the module %s. Was this intended? '
            'If so, trying manually importing this module and '
            'feeding it to Amp.load. To avoid this error, this '
            'module can be added to amp.importhelper.' %
            importname)

    return Module


[docs]def get_git_commit(ampdirectory):
    """Attempts to get the last git commit from the amp directory.
    """
    pwd = os.getcwd()
    os.chdir(ampdirectory)
    try:
        with open(os.devnull, 'w') as devnull:
            output = subprocess.check_output(['git', 'log', '-1',
                                              '--pretty=%H\t%ci'],
                                             stderr=devnull)
    except:
        output = 'unknown hash\tunknown date'
    output = output.strip()
    commithash, commitdate = output.split('\t')
    os.chdir(pwd)
    return commithash, commitdate