# coding: utf-8
# Copyright (c) Pymatgen Development Team.
# Distributed under the terms of the MIT License.
"""
This module implements classes and methods for processing LAMMPS output
files (log and dump).
"""
import re
import glob
from io import StringIO
import numpy as np
import pandas as pd
from monty.json import MSONable
from monty.io import zopen
from pymatgen.io.lammps.data import LammpsBox
__author__ = "Kiran Mathew, Zhi Deng"
__copyright__ = "Copyright 2018, The Materials Virtual Lab"
__version__ = "1.0"
__maintainer__ = "Zhi Deng"
__email__ = "z4deng@eng.ucsd.edu"
__date__ = "Aug 1, 2018"
class LammpsDump(MSONable):
    """
    Object for representing dump data for a single snapshot.
    """

    def __init__(self, timestep, natoms, box, data):
        """
        Base constructor.

        Args:
            timestep (int): Current timestep.
            natoms (int): Total number of atoms in the box.
            box (LammpsBox): Simulation box.
            data (pd.DataFrame): Dumped atomic data.
        """
        self.timestep = timestep
        self.natoms = natoms
        self.box = box
        self.data = data

    @classmethod
    def from_string(cls, string):
        """
        Constructor from string parsing.

        The string is read positionally (0-based line indices): line 1
        is the timestep, line 3 the number of atoms, line 4 the box
        header, lines 5-7 the three box rows, line 8 the "ITEM: ATOMS"
        header naming the data columns, and everything from line 9 on
        is the whitespace-separated per-atom table.

        Args:
            string (str): Input string.

        Returns:
            LammpsDump
        """
        lines = string.split("\n")
        timestep = int(lines[1])
        natoms = int(lines[3])
        box_arr = np.loadtxt(StringIO("\n".join(lines[5:8])))
        bounds = box_arr[:, :2]
        tilt = None
        if "xy xz yz" in lines[4]:
            # The third column of the box rows carries the tilt factors.
            # In that case the first two columns are shifted by the
            # min/max tilt offsets to recover the box bounds (see the
            # LAMMPS dump documentation on triclinic boxes).
            tilt = box_arr[:, 2]
            x = (0, tilt[0], tilt[1], tilt[0] + tilt[1])
            y = (0, tilt[2])
            bounds -= np.array([[min(x), max(x)], [min(y), max(y)], [0, 0]])
        box = LammpsBox(bounds, tilt)
        data_head = lines[8].replace("ITEM: ATOMS", "").split()
        # sep=r"\s+" is the documented equivalent of the deprecated
        # delim_whitespace=True keyword.
        data = pd.read_csv(StringIO("\n".join(lines[9:])), names=data_head,
                           sep=r"\s+")
        return cls(timestep, natoms, box, data)

    @classmethod
    def from_dict(cls, d):
        """
        Rebuilds a LammpsDump from the dict produced by as_dict.

        Args:
            d (dict): Dict representation. "box" is passed to
                LammpsBox.from_dict and "data" is a JSON string in
                pandas "split" orientation.

        Returns:
            LammpsDump
        """
        # Wrap the JSON text in a file-like object: passing a literal
        # JSON string to pd.read_json is deprecated in recent pandas.
        return cls(
            timestep=d["timestep"],
            natoms=d["natoms"],
            box=LammpsBox.from_dict(d["box"]),
            data=pd.read_json(StringIO(d["data"]), orient="split"),
        )

    def as_dict(self):
        """
        Returns: MSONable dict. The atomic data is serialized as a JSON
            string in pandas "split" orientation.
        """
        return {
            "@module": self.__class__.__module__,
            "@class": self.__class__.__name__,
            "timestep": self.timestep,
            "natoms": self.natoms,
            "box": self.box.as_dict(),
            "data": self.data.to_json(orient="split"),
        }
def parse_lammps_dumps(file_pattern):
    """
    Generator that parses dump file(s).

    Args:
        file_pattern (str): Filename to parse. The timestep wildcard
            (e.g., dump.atom.'*') is supported and the files are parsed
            in the sequence of timestep.

    Yields:
        LammpsDump for each available snapshot. Files that contain no
        lines at all yield nothing.
    """
    files = glob.glob(file_pattern)
    if len(files) > 1:
        # Turn the wildcard into a capture group so files can be ordered
        # by the integer it matched rather than lexicographically.
        pattern = file_pattern.replace("*", "([0-9]+)")
        # Double the backslashes so that path separators in the pattern
        # are matched literally instead of acting as regex escapes.
        pattern = pattern.replace("\\", "\\\\")
        timestep_re = re.compile(pattern)
        files = sorted(files,
                       key=lambda f: int(timestep_re.match(f).group(1)))

    for fname in files:
        dump_cache = []
        with zopen(fname, "rt") as f:
            for line in f:
                if line.startswith("ITEM: TIMESTEP"):
                    # A new snapshot starts: flush the previous one.
                    if dump_cache:
                        yield LammpsDump.from_string("".join(dump_cache))
                    dump_cache = [line]
                else:
                    dump_cache.append(line)
        # Flush the last snapshot of the file; skipped for empty files,
        # which would otherwise fail inside LammpsDump.from_string.
        if dump_cache:
            yield LammpsDump.from_string("".join(dump_cache))
def parse_lammps_log(filename="log.lammps"):
    """
    Parses log file with focus on thermo data. Both one and multi line
    formats are supported. Any incomplete runs (no "Loop time" marker)
    will not be parsed.

    Notes:
        SHAKE stats printed with thermo data are not supported yet.
        They are ignored in multi line format, while they may cause
        issues with dataframe parsing in one line format.

    Args:
        filename (str): Filename to parse.

    Returns:
        [pd.DataFrame] containing thermo data for each completed run.
    """
    with open(filename) as f:
        lines = f.readlines()

    begin_flag = ("Memory usage per processor =",
                  "Per MPI rank memory allocation (min/avg/max) =")
    end_flag = "Loop time of"

    multi_re = re.compile(r"-+\s+Step\s+([0-9]+)\s+-+")
    kv_re = re.compile(r"([0-9A-Za-z_\[\]]+)\s+=\s+([0-9eE\.+-]+)")

    def _parse_thermo(thermo_lines):
        """Builds a DataFrame from the lines of one thermo block."""
        # one line thermo data: a header row followed by value rows
        if not multi_re.match(thermo_lines[0]):
            return pd.read_csv(StringIO("".join(thermo_lines)), sep=r"\s+")

        # multi line thermo data: every "---- Step N ----" line opens a
        # chunk of "key = value" pairs that lasts until the next marker
        marks = [i for i, line in enumerate(thermo_lines)
                 if multi_re.match(line)]
        chunks = [thermo_lines[start:stop] for start, stop
                  in zip(marks, marks[1:] + [len(thermo_lines)])]
        records = []
        for chunk in chunks:
            record = {"Step": int(multi_re.match(chunk[0]).group(1))}
            record.update({k: float(v) for k, v
                           in kv_re.findall("".join(chunk[1:]))})
            records.append(record)
        df = pd.DataFrame(records)
        # rearrange the sequence of columns to follow the first chunk
        columns = ["Step"] + [k for k, _
                              in kv_re.findall("".join(chunks[0][1:]))]
        return df[columns]

    # Pair every end marker with the latest begin marker that has not
    # been closed yet. A begin marker that is followed by another begin
    # marker belongs to an incomplete run and is dropped, and an end
    # marker without an open begin marker is ignored.
    spans = []
    begin = None
    for i, line in enumerate(lines):
        if line.startswith(begin_flag):
            begin = i
        elif begin is not None and line.startswith(end_flag):
            spans.append((begin, i))
            begin = None

    return [_parse_thermo(lines[b + 1:e]) for b, e in spans]