Source code for amp.analysis

#!/usr/bin/env python

from . import Amp
from .utilities import now, hash_images, make_filename
import os
import numpy as np
from matplotlib import pyplot
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})


[docs]def plot_sensitivity(load,
                     images,
                     d=0.0001,
                     label='sensitivity',
                     dblabel=None,
                     plotfile=None,
                     overwrite=False,
                     energy_coefficient=1.0,
                     force_coefficient=0.04):
    """Returns the plot of loss function in terms of perturbed parameters.

    Takes the load file and images. Any other keyword taken by the Amp
    calculator can be fed to this class also.

    Parameters
    ----------
    load : str
        Path for loading an existing ".amp" file. Should be fed like
        'load="filename.amp"'.
    images : list or str
        List of ASE atoms objects with positions, symbols, energies, and forces
        in ASE format. This can also be the path to an ASE trajectory (.traj)
        or database (.db) file.  Energies can be obtained from any reference,
        e.g. DFT calculations.
    d : float
        The amount of perturbation in each parameter.
    label : str
        Default prefix/location used for all files.
    dblabel : str
        Optional separate prefix/location of database files, including
        fingerprints, fingerprint primes, and neighborlists, to avoid
        calculating them. If not supplied, just uses the value from label.
    plotfile : Object
        File for the plot.
    overwrite : bool
        If a plot or an script containing values found overwrite it.
    energy_coefficient : float
        Coefficient of energy loss in the total loss function.
    force_coefficient : float
        Coefficient of force loss in the total loss function.
    """

    from amp.model import LossFunction

    calc = Amp.load(file=load)

    if plotfile is None:
        plotfile = make_filename(label, '-plot.pdf')

    if (not overwrite) and os.path.exists(plotfile):
        raise IOError('File exists: %s.\nIf you want to overwrite,'
                      ' set overwrite=True or manually delete.' % plotfile)

    calc.dblabel = label if dblabel is None else dblabel

    if force_coefficient == 0.:
        calculate_derivatives = False
    else:
        calculate_derivatives = True

    calc._log('\nAmp sensitivity analysis started. ' + now() + '\n')
    calc._log('Descriptor: %s' % calc.descriptor.__class__.__name__)
    calc._log('Model: %s' % calc.model.__class__.__name__)

    images = hash_images(images)

    calc._log('\nDescriptor\n==========')
    calc.descriptor.calculate_fingerprints(
            images=images,
            parallel=calc._parallel,
            log=calc._log,
            calculate_derivatives=calculate_derivatives)

    vector = calc.model.vector.copy()

    lossfunction = LossFunction(energy_coefficient=energy_coefficient,
                                force_coefficient=force_coefficient,
                                parallel=calc._parallel,
                                )
    calc.model.lossfunction = lossfunction

    # Set up local loss function.
    calc.model.lossfunction.attach_model(
                calc.model,
                fingerprints=calc.descriptor.fingerprints,
                fingerprintprimes=calc.descriptor.fingerprintprimes,
                images=images)

    originalloss = calc.model.lossfunction.get_loss(
        vector, lossprime=False)['loss']

    calc._log('\n Perturbing parameters...', tic='perturb')

    allparameters = []
    alllosses = []
    num_parameters = len(vector)

    for count in range(num_parameters):
        calc._log('parameter %i out of %i' % (count + 1, num_parameters))
        parameters = []
        losses = []
        # parameter is perturbed -d and loss function calculated.
        vector[count] -= d
        parameters.append(vector[count])
        perturbedloss = calc.model.lossfunction.get_loss(
            vector, lossprime=False)['loss']
        losses.append(perturbedloss)

        vector[count] += d
        parameters.append(vector[count])
        losses.append(originalloss)
        # parameter is perturbed +d and loss function calculated.
        vector[count] += d
        parameters.append(vector[count])
        perturbedloss = calc.model.lossfunction.get_loss(
            vector, lossprime=False)['loss']
        losses.append(perturbedloss)

        allparameters.append(parameters)
        alllosses.append(losses)
        # returning back to the original value.
        vector[count] -= d

    calc._log('...parameters perturbed and loss functions calculated',
              toc='perturb')

    calc._log('Plotting loss function vs perturbed parameters...',
              tic='plot')

    with PdfPages(plotfile) as pdf:
        count = 0
        for parameter in vector:
            fig = pyplot.figure()
            ax = fig.add_subplot(111)
            ax.plot(allparameters[count],
                    alllosses[count],
                    marker='o', linestyle='--', color='b',)

            xmin = allparameters[count][0] - \
                0.1 * (allparameters[count][-1] - allparameters[count][0])
            xmax = allparameters[count][-1] + \
                0.1 * (allparameters[count][-1] - allparameters[count][0])
            ymin = min(alllosses[count]) - \
                0.1 * (max(alllosses[count]) - min(alllosses[count]))
            ymax = max(alllosses[count]) + \
                0.1 * (max(alllosses[count]) - min(alllosses[count]))
            ax.set_xlim([xmin, xmax])
            ax.set_ylim([ymin, ymax])

            ax.set_xlabel('parameter no %i' % count)
            ax.set_ylabel('loss function')
            pdf.savefig(fig)
            pyplot.close(fig)
            count += 1

    calc._log(' ...loss functions plotted.', toc='plot')


[docs]def plot_parity(load,
                images,
                label='parity',
                dblabel=None,
                plot_forces=True,
                plotfile=None,
                color='b.',
                overwrite=False,
                returndata=False,
                energy_coefficient=1.0,
                force_coefficient=0.04):
    """Makes a parity plot of Amp energies and forces versus real energies and
    forces.

    Parameters
    ----------
    load : str
        Path for loading an existing ".amp" file. Should be fed like
        'load="filename.amp"'.
    images : list or str
        List of ASE atoms objects with positions, symbols, energies, and forces
        in ASE format. This can also be the path to an ASE trajectory (.traj)
        or database (.db) file.  Energies can be obtained from any reference,
        e.g. DFT calculations.
    label : str
        Default prefix/location used for all files.
    dblabel : str
        Optional separate prefix/location of database files, including
        fingerprints, fingerprint primes, and neighborlists, to avoid
        calculating them. If not supplied, just uses the value from label.
    plot_forces : bool
        Determines whether or not forces should be plotted as well.
    plotfile : Object
        File for the plot.
    color : str
        Plot color.
    overwrite : bool
        If a plot or an script containing values found overwrite it.
    returndata : bool
        Whether to return a reference to the figures and their data or not.
    energy_coefficient : float
        Coefficient of energy loss in the total loss function.
    force_coefficient : float
        Coefficient of force loss in the total loss function.
    """

    calc = Amp.load(file=load)

    if plotfile is None:
        plotfile = make_filename(label, '-plot.pdf')

    if (not overwrite) and os.path.exists(plotfile):
        raise IOError('File exists: %s.\nIf you want to overwrite,'
                      ' set overwrite=True or manually delete.'
                      % plotfile)

    calc.dblabel = label if dblabel is None else dblabel

    if (force_coefficient != 0.) or (plot_forces is True):
        calculate_derivatives = True
    else:
        calculate_derivatives = False

    calc._log('\nAmp parity plot started. ' + now() + '\n')
    calc._log('Descriptor: %s' % calc.descriptor.__class__.__name__)
    calc._log('Model: %s' % calc.model.__class__.__name__)

    images = hash_images(images, log=calc._log)

    calc._log('\nDescriptor\n==========')
    calc.descriptor.calculate_fingerprints(
            images=images,
            parallel=calc._parallel,
            log=calc._log,
            calculate_derivatives=calculate_derivatives)

    calc._log('Calculating potential energies...', tic='pot-energy')
    energy_data = {}
    for hash, image in images.iteritems():
        amp_energy = calc.model.calculate_energy(
            calc.descriptor.fingerprints[hash])
        actual_energy = image.get_potential_energy(apply_constraint=False)
        energy_data[hash] = [actual_energy, amp_energy]
    calc._log('...potential energies calculated.', toc='pot-energy')

    min_act_energy = min([energy_data[hash][0]
                         for hash, image in images.iteritems()])
    max_act_energy = max([energy_data[hash][0]
                         for hash, image in images.iteritems()])

    if plot_forces is False:
        fig = pyplot.figure(figsize=(5., 5.))
        ax = fig.add_subplot(111)
    else:
        fig = pyplot.figure(figsize=(5., 10.))
        ax = fig.add_subplot(211)

    calc._log('Plotting energies...', tic='energy-plot')
    for hash, image in images.iteritems():
        ax.plot(energy_data[hash][0], energy_data[hash][1], color)
    # draw line
    ax.plot([min_act_energy, max_act_energy],
            [min_act_energy, max_act_energy],
            'r-',
            lw=0.3,)
    ax.set_xlabel("ab initio energy, eV")
    ax.set_ylabel("Amp energy, eV")
    ax.set_title("Energies")
    calc._log('...energies plotted.', toc='energy-plot')

    if plot_forces is True:
        ax = fig.add_subplot(212)

        calc._log('Calculating forces...', tic='forces')
        force_data = {}
        for hash, image in images.iteritems():
            amp_forces = \
                calc.model.calculate_forces(
                    calc.descriptor.fingerprints[hash],
                    calc.descriptor.fingerprintprimes[hash])
            actual_forces = image.get_forces(apply_constraint=False)
            force_data[hash] = [actual_forces, amp_forces]
        calc._log('...forces calculated.', toc='forces')

        min_act_force = min([force_data[hash][0][index][k]
                            for hash, image in images.iteritems()
                            for index in range(len(image))
                            for k in range(3)])

        max_act_force = max([force_data[hash][0][index][k]
                            for hash, image in images.iteritems()
                            for index in range(len(image))
                            for k in range(3)])

        calc._log('Plotting forces...', tic='force-plot')
        for hash, image in images.iteritems():
            for index in range(len(image)):
                for k in range(3):
                    ax.plot(force_data[hash][0][index][k],
                            force_data[hash][1][index][k], color)
        # draw line
        ax.plot([min_act_force, max_act_force],
                [min_act_force, max_act_force],
                'r-',
                lw=0.3,)
        ax.set_xlabel("ab initio force, eV/Ang")
        ax.set_ylabel("Amp force, eV/Ang")
        ax.set_title("Forces")
        calc._log('...forces plotted.', toc='force-plot')

    fig.savefig(plotfile)

    if returndata:
        if plot_forces is False:
            return fig, energy_data
        else:
            return fig, energy_data, force_data


[docs]def plot_error(load,
               images,
               label='error',
               dblabel=None,
               plot_forces=True,
               plotfile=None,
               color='b.',
               overwrite=False,
               returndata=False,
               energy_coefficient=1.0,
               force_coefficient=0.04):
    """Makes an error plot of Amp energies and forces versus real energies and
    forces.

    Parameters
    ----------
    load : str
        Path for loading an existing ".amp" file. Should be fed like
        'load="filename.amp"'.
    images : list or str
        List of ASE atoms objects with positions, symbols, energies, and forces
        in ASE format. This can also be the path to an ASE trajectory (.traj)
        or database (.db) file.  Energies can be obtained from any reference,
        e.g. DFT calculations.
    label : str
        Default prefix/location used for all files.
    dblabel : str
        Optional separate prefix/location of database files, including
        fingerprints, fingerprint primes, and neighborlists, to avoid
        calculating them. If not supplied, just uses the value from label.
    plot_forces : bool
        Determines whether or not forces should be plotted as well.
    plotfile : Object
        File for the plot.
    color : str
        Plot color.
    overwrite : bool
        If a plot or an script containing values found overwrite it.
    returndata : bool
        Whether to return a reference to the figures and their data or not.
    energy_coefficient : float
        Coefficient of energy loss in the total loss function.
    force_coefficient : float
        Coefficient of force loss in the total loss function.
    """

    calc = Amp.load(file=load)

    if plotfile is None:
        plotfile = make_filename(label, '-plot.pdf')

    if (not overwrite) and os.path.exists(plotfile):
        raise IOError('File exists: %s.\nIf you want to overwrite,'
                      ' set overwrite=True or manually delete.'
                      % plotfile)

    calc.dblabel = label if dblabel is None else dblabel

    if (force_coefficient != 0.) or (plot_forces is True):
        calculate_derivatives = True
    else:
        calculate_derivatives = False

    calc._log('\nAmp error plot started. ' + now() + '\n')
    calc._log('Descriptor: %s' % calc.descriptor.__class__.__name__)
    calc._log('Model: %s' % calc.model.__class__.__name__)

    images = hash_images(images, log=calc._log)

    calc._log('\nDescriptor\n==========')
    calc.descriptor.calculate_fingerprints(
            images=images,
            parallel=calc._parallel,
            log=calc._log,
            calculate_derivatives=calculate_derivatives)

    calc._log('Calculating potential energy errors...', tic='pot-energy')
    energy_data = {}
    for hash, image in images.iteritems():
        no_of_atoms = len(image)
        amp_energy = calc.model.calculate_energy(
            calc.descriptor.fingerprints[hash])
        actual_energy = image.get_potential_energy(apply_constraint=False)
        act_energy_per_atom = actual_energy / no_of_atoms
        energy_error = abs(amp_energy - actual_energy) / no_of_atoms
        energy_data[hash] = [act_energy_per_atom, energy_error]
    calc._log('...potential energy errors calculated.', toc='pot-energy')

    # calculating energy per atom rmse
    energy_square_error = 0.
    for hash, image in images.iteritems():
        energy_square_error += energy_data[hash][1] ** 2.
    energy_per_atom_rmse = np.sqrt(energy_square_error / len(images))

    min_act_energy_per_atom = min([energy_data[hash][0]
                                   for hash, image in images.iteritems()])
    max_act_energy_per_atom = max([energy_data[hash][0]
                                   for hash, image in images.iteritems()])

    if plot_forces is False:
        fig = pyplot.figure(figsize=(5., 5.))
        ax = fig.add_subplot(111)
    else:
        fig = pyplot.figure(figsize=(5., 10.))
        ax = fig.add_subplot(211)

    calc._log('Plotting energy errors...', tic='energy-plot')
    for hash, image in images.iteritems():
        ax.plot(energy_data[hash][0], energy_data[hash][1], color)
    # draw horizontal line for rmse
    ax.plot([min_act_energy_per_atom, max_act_energy_per_atom],
            [energy_per_atom_rmse, energy_per_atom_rmse],
            color='black', linestyle='dashed', lw=1,)
    ax.text(max_act_energy_per_atom,
            energy_per_atom_rmse,
            'energy rmse = %6.5f' % energy_per_atom_rmse,
            ha='right',
            va='bottom',
            color='black')
    ax.set_xlabel("ab initio energy (eV) per atom")
    ax.set_ylabel("$|$ab initio energy - Amp energy$|$ / number of atoms")
    ax.set_title("Energies")
    calc._log('...energy errors plotted.', toc='energy-plot')

    if plot_forces is True:
        ax = fig.add_subplot(212)

        calc._log('Calculating force errors...', tic='forces')
        force_data = {}
        for hash, image in images.iteritems():
            amp_forces = \
                calc.model.calculate_forces(
                    calc.descriptor.fingerprints[hash],
                    calc.descriptor.fingerprintprimes[hash])
            actual_forces = image.get_forces(apply_constraint=False)
            force_data[hash] = [
                actual_forces,
                abs(np.array(amp_forces) - np.array(actual_forces))]
        calc._log('...force errors calculated.', toc='forces')

        # calculating force rmse
        force_square_error = 0.
        for hash, image in images.iteritems():
            no_of_atoms = len(image)
            for index in range(no_of_atoms):
                for k in range(3):
                    force_square_error += \
                        ((1.0 / 3.0) * force_data[hash][1][index][k] ** 2.) / \
                        no_of_atoms
        force_rmse = np.sqrt(force_square_error / len(images))

        min_act_force = min([force_data[hash][0][index][k]
                            for hash, image in images.iteritems()
                            for index in range(len(image))
                            for k in range(3)])

        max_act_force = max([force_data[hash][0][index][k]
                            for hash, image in images.iteritems()
                            for index in range(len(image))
                            for k in range(3)])

        calc._log('Plotting force errors...', tic='force-plot')
        for hash, image in images.iteritems():
            for index in range(len(image)):
                for k in range(3):
                    ax.plot(force_data[hash][0][index][k],
                            force_data[hash][1][index][k], color)
        # draw horizontal line for rmse
        ax.plot([min_act_force, max_act_force],
                [force_rmse, force_rmse],
                color='black',
                linestyle='dashed',
                lw=1,)
        ax.text(max_act_force,
                force_rmse,
                'force rmse = %5.4f' % force_rmse,
                ha='right',
                va='bottom',
                color='black',)
        ax.set_xlabel("ab initio force, eV/Ang")
        ax.set_ylabel("$|$ab initio force - Amp force$|$")
        ax.set_title("Forces")
        calc._log('...force errors plotted.', toc='force-plot')

    fig.savefig(plotfile)

    if returndata:
        if plot_forces is False:
            return fig, energy_data
        else:
            return fig, energy_data, force_data


[docs]def read_trainlog(logfile):
    """Reads the log file from the training process, returning the relevant
    parameters.

    Parameters
    ----------
    logfile : str
        Name or path to the log file.
    """
    data = {}

    with open(logfile, 'r') as f:
        lines = f.read().splitlines()

    # Get number of images.
    for line in lines:
        if 'unique images after hashing.' in line:
            no_images = int(line.split()[0])
            break
    data['no_images'] = no_images
    print('no_images')
    print(no_images)

    # Find where convergence data starts.
    startline = None
    for index, line in enumerate(lines):
        if 'Loss function convergence criteria:' in line:
            startline = index
            data['convergence'] = {}
            d = data['convergence']
            break

    print('convergence')
    print(startline)

    # Get convergence parameters.
    ready = [False, False, False, False, False]
    for index, line in enumerate(lines[startline:]):
        if 'energy_rmse:' in line:
            ready[0] = True
            d['energy_rmse'] = float(line.split(':')[-1])
        elif 'force_rmse:' in line:
            ready[1] = True
            _ = line.split(':')[-1].strip()
            if _ == 'None':
                d['force_rmse'] = None
                trainforces = False
            else:
                d['force_rmse'] = float(line.split(':')[-1])
                trainforces = True
            print('train forces: %s' % trainforces)
        elif 'force_coefficient:' in line:
            ready[2] = True
            d['force_coefficient'] = float(line.split(':')[-1])
        elif 'energy_coefficient:' in line:
            ready[3] = True
            d['energy_coefficient'] = float(line.split(':')[-1])
        elif 'Step' in line and 'Time' in line:
            ready[4] = True
            startline += index + 2
        if ready == [True, True, True, True]:
            break

    E = d['energy_rmse']**2 * no_images
    if trainforces:
        F = d['force_rmse']**2 * no_images
    else:
        F = 0.
    costfxngoal = d['energy_coefficient'] * E + d['force_coefficient'] * F
    d['costfxngoal'] = costfxngoal

    # Extract data.
    steps, es, fs, costfxns = [], [], [], []
    costfxnEs, costfxnFs = [], []
    index = startline
    while index < len(lines):
        line = lines[index]
        if 'Saving checkpoint data.' in line:
            index += 1
            continue
        elif 'Overwriting file' in line:
            index += 1
            continue
        elif 'optimization completed successfully.' in line:  # old version
            break
        elif '...optimization successful.' in line:
            break
        elif 'could not find parameters for the' in line:
            break
        elif '...optimization unsuccessful.' in line:
            break
        print(line)
        if trainforces:
            step, time, costfxn, e, _, _, _, f, _, _, _ = line.split()
            fs.append(float(f))
            F = float(f)**2 * no_images
            costfxnFs.append(d['force_coefficient'] * F / float(costfxn))
        else:
            step, time, costfxn, e, _, _, _ = line.split()
        steps.append(int(step))
        es.append(float(e))
        costfxns.append(costfxn)
        E = float(e)**2 * no_images
        costfxnEs.append(d['energy_coefficient'] * E / float(costfxn))
        index += 1
    d['steps'] = steps
    d['es'] = es
    d['fs'] = fs
    d['costfxns'] = costfxns
    d['costfxnEs'] = costfxnEs
    d['costfxnFs'] = costfxnFs

    return data


[docs]def plot_convergence(logfile, plotfile='convergence.pdf'):
    """Makes a plot of the convergence of the cost function and its energy
    and force components.

    Parameters
    ----------
    logfile : str
        Name or path to the log file.
    plotfile : str
        Name or path to the plot file.
    """

    data = read_trainlog(logfile)

    # Find if multiple runs contained in data set.
    d = data['convergence']
    steps = range(len(d['steps']))
    breaks = []
    for index, step in enumerate(d['steps'][1:]):
        if step < d['steps'][index]:
            breaks.append(index)

    # Make plots.
    fig = pyplot.figure(figsize=(6., 8.))
    # Margins, vertical gap, and top-to-bottom ratio of figure.
    lm, rm, bm, tm, vg, tb = 0.12, 0.05, 0.08, 0.03, 0.08, 4.
    bottomaxheight = (1. - bm - tm - vg) / (tb + 1.)

    ax = fig.add_axes((lm, bm + bottomaxheight + vg,
                       1. - lm - rm, tb * bottomaxheight))
    ax.semilogy(steps, d['es'], 'b', lw=2, label='energy rmse')
    if d['force_rmse']:
        ax.semilogy(steps, d['fs'], 'g', lw=2, label='force rmse')
    ax.semilogy(steps, d['costfxns'], color='0.5', lw=2,
                label='loss function')
    # Targets.
    ax.semilogy([steps[0], steps[-1]], [d['energy_rmse']] * 2,
                color='b', linestyle=':')
    if d['force_rmse']:
        ax.semilogy([steps[0], steps[-1]], [d['force_rmse']] * 2,
                    color='g', linestyle=':')
    ax.semilogy([steps[0], steps[-1]], [d['costfxngoal']] * 2,
                color='0.5', linestyle=':')
    ax.set_ylabel('error')
    ax.legend(loc='best')
    if len(breaks) > 0:
        ylim = ax.get_ylim()
        for b in breaks:
            ax.plot([b] * 2, ylim, '--k')

    if d['force_rmse']:
        axf = fig.add_axes((lm, bm, 1. - lm - rm, bottomaxheight))
        axf.fill_between(x=np.array(steps), y1=d['costfxnEs'],
                         color='blue')
        axf.fill_between(x=np.array(steps), y1=d['costfxnEs'],
                         y2=np.array(d['costfxnEs']) +
                         np.array(d['costfxnFs']),
                         color='green')
        axf.set_ylabel('loss function component')
        axf.set_xlabel('loss function call')
        axf.set_ylim(0, 1)
    else:
        ax.set_xlabel('loss function call')

    fig.savefig(plotfile)
    pyplot.close(fig)