Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,33 @@ geometry. To override this for all groups:
somd2 perturbable_system.bss --terminal-flip-frequency "1 ps" --terminal-flip-angle "180 degrees"
```

## Debugging with energy components

To help diagnose simulation instabilities, `SOMD2` can record the potential
energy contribution from each OpenMM force group. This is enabled with the
`--save-energy-components` flag:

```
somd2 perturbable_system.bss --save-energy-components
```

One Parquet file per λ window is written to the output directory, named
`energy_components_<lambda>.parquet`. Times are in nanoseconds and energies in
kcal/mol; both are stored as schema metadata in the file.

The recording interval depends on the runner and active samplers:

- **Replica exchange**: always `energy-frequency`
- **Standard runner, no MC**: `energy-frequency`
- **Standard runner, with MC**: the shortest active MC frequency, i.e.
`gcmc-frequency`, `terminal-flip-frequency`, or the smaller of the two
when both are active

> [!NOTE]
> Energy components are written more frequently than checkpoint files and are
> not guarded by the file lock, so they may lead the checkpoint files by up
> to one `checkpoint-frequency` interval when copying output mid-simulation.

## Copying output files during a simulation

When `SOMD2` writes checkpoint files it acquires an exclusive
Expand Down
6 changes: 5 additions & 1 deletion src/somd2/config/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,11 @@ def __init__(
Whether to save a crash report if the simulation crashes.

save_energy_components: bool
Whether to save the energy contribution for each force when checkpointing.
Whether to save per-force-group energy contributions to a Parquet file
in the output directory. Energies are recorded at every 'energy_frequency'
interval. When not running replica exchange, the interval is instead the
shortest active MC frequency when running with GCMC or terminal flip moves.
Intended for debugging purposes.

save_xml: bool
Whether to write an XML file for the OpenMM system to the output
Expand Down
69 changes: 54 additions & 15 deletions src/somd2/runner/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,10 @@ def __init__(self, system, config):
# Check the output directories and create names of output files.
self._filenames = self._prepare_output()

# Per-window cache of the last saved energy-components time (ns),
# used to skip duplicate rows on restart.
self._last_ec_time = {}

# Store the current system as a reference.
self._reference_system = self._system.clone()

Expand Down Expand Up @@ -1194,7 +1198,7 @@ def increment_filename(base_filename, suffix):
filenames["trajectory"] = str(output_directory / f"traj_{lam}.dcd")
filenames["trajectory_chunk"] = str(output_directory / f"traj_{lam}_")
filenames["energy_components"] = str(
output_directory / f"energy_components_{lam}.csv"
output_directory / f"energy_components_{lam}.parquet"
)
filenames["gcmc_ghosts"] = str(output_directory / f"gcmc_ghosts_{lam}.txt")
filenames["sampler_stats"] = str(output_directory / f"sampler_stats_{lam}.pkl")
Expand Down Expand Up @@ -2020,12 +2024,23 @@ def _backup_checkpoint(self, index):
except Exception as e:
return index, e

try:
# Backup the existing energy components file, if it exists.
path = _Path(self._filenames[index]["energy_components"])
if path.exists() and path.stat().st_size > 0:
_copyfile(
self._filenames[index]["energy_components"],
str(self._filenames[index]["energy_components"]) + ".bak",
)
except Exception as e:
return index, e

return index, None

def _save_energy_components(self, index, context, time_ns):
"""
Internal function to save the energy components for each force group to a
CSV file.
Parquet file.

Parameters
----------
Expand All @@ -2040,11 +2055,28 @@ def _save_energy_components(self, index, context, time_ns):
The current simulation time in nanoseconds.
"""

import csv as _csv
import json as _json
import openmm
import pandas as _pd
import pyarrow as _pa
import pyarrow.parquet as _pq_local

filepath = self._filenames[index]["energy_components"]
file_exists = _Path(filepath).exists()

# Lazy-initialise the last saved time for restart deduplication.
# On the first call for this window, read the existing file (if any)
# to find the maximum time already written.
if index not in self._last_ec_time:
path = _Path(filepath)
if path.exists() and path.stat().st_size > 0:
existing = _pq_local.read_table(filepath).to_pandas()
self._last_ec_time[index] = float(existing["time"].max())
else:
self._last_ec_time[index] = -1.0

# Skip rows that have already been written (restart deduplication).
if time_ns <= self._last_ec_time[index]:
return

# Use the named force groups already assigned by sire_to_openmm_system,
# sorted alphabetically for a consistent column order across runs.
Expand All @@ -2055,18 +2087,25 @@ def _save_energy_components(self, index, context, time_ns):
openmm.unit.kilocalories_per_mole
)

columns = ["time"] + list(energies.keys())
row = {"time": round(time_ns, 6)} | {
name: round(nrg, 4) for name, nrg in energies.items()
}
row = {"time": round(time_ns, 6)} | energies
df = _pd.DataFrame([row])

path = _Path(filepath)
if path.exists() and path.stat().st_size > 0:
_parquet_append(filepath, df)
else:
# First write: embed units as schema metadata under the "somd2" key,
# consistent with how the energy trajectory parquet files are written.
table = _pa.Table.from_pandas(df)
meta = _json.dumps(
{"time_units": "ns", "energy_units": "kcal/mol"}
).encode()
table = table.replace_schema_metadata(
{b"somd2": meta, **table.schema.metadata}
)
_pq_local.write_table(table, filepath)

with open(filepath, "a", newline="") as f:
writer = _csv.DictWriter(f, fieldnames=columns)
if not file_exists:
# Write a comment line with units before the header.
f.write("# time: ns, energy: kcal/mol\n")
writer.writeheader()
writer.writerow(row)
self._last_ec_time[index] = time_ns

def _restore_backup_files(self):
"""
Expand Down
26 changes: 20 additions & 6 deletions src/somd2/runner/_repex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,16 @@ def run(self):
# Whether a frame is saved at the end of the cycle.
write_gcmc_ghosts = (i + 1) % cycles_per_frame == 0

# Current simulation time in ns for energy components saving.
time_ns = (
(
self._start_block * checkpoint_frequency
+ (i + 1) * self._config.energy_frequency
).to("ns")
if self._config.save_energy_components
else None
)

# Run a dynamics block for each replica, making sure only each GPU is only
# oversubscribed by a factor of self._config.oversubscription_factor.
for j in range(num_batches):
Expand All @@ -1121,6 +1131,7 @@ def run(self):
repeat(is_gcmc),
repeat(write_gcmc_ghosts),
repeat(is_terminal_flip),
repeat(time_ns),
):
if not result:
_logger.error(
Expand Down Expand Up @@ -1294,6 +1305,7 @@ def _run_block(
is_gcmc=False,
write_gcmc_ghosts=False,
is_terminal_flip=False,
time_ns=None,
):
"""
Run a dynamics block for a given replica.
Expand Down Expand Up @@ -1321,6 +1333,10 @@ def _run_block(
Whether a terminal flip MC move should be performed before the
dynamics block.

time_ns: float or None
The current simulation time in nanoseconds, used when saving energy
components. If None, energy components are not saved.

Returns
-------

Expand Down Expand Up @@ -1417,6 +1433,10 @@ def _run_block(
# Save the OpenMM state.
self._dynamics_cache.save_openmm_state(index)

# Save the energy contribution for each force.
if self._config.save_energy_components and time_ns is not None:
self._save_energy_components(index, dynamics.context(), time_ns)

# Get the energy at each lambda value.
energies = dynamics._current_energy_array()

Expand Down Expand Up @@ -1781,12 +1801,6 @@ def _checkpoint(self, index, lambdas, block, num_blocks, is_final_block=False):
# Commit the current system.
system = dynamics.commit()

# Save the energy contribution for each force.
if self._config.save_energy_components:
self._save_energy_components(
index, dynamics.context(), system.time().to("ns")
)

# If performing GCMC, then we need to flag the ghost waters.
if gcmc_sampler is not None:
system = gcmc_sampler._flag_ghost_waters(system)
Expand Down
Loading
Loading