diff --git a/dpdata/__init__.py b/dpdata/__init__.py index f2cd233ff..69442385e 100644 --- a/dpdata/__init__.py +++ b/dpdata/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from . import lammps, md, vasp +from . import md from .bond_order_system import BondOrderSystem from .system import LabeledSystem, MultiSystems, System @@ -9,11 +9,10 @@ except ImportError: from .__about__ import __version__ + __all__ = [ "__version__", - "lammps", "md", - "vasp", "System", "LabeledSystem", "MultiSystems", diff --git a/dpdata/abacus/__init__.py b/dpdata/abacus/__init__.py index e69de29bb..c917d8d4c 100644 --- a/dpdata/abacus/__init__.py +++ b/dpdata/abacus/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.abacus import * # noqa: F403 diff --git a/dpdata/abacus/md.py b/dpdata/abacus/md.py index 8df156c94..2474d87db 100644 --- a/dpdata/abacus/md.py +++ b/dpdata/abacus/md.py @@ -1,224 +1,3 @@ from __future__ import annotations -import os -import warnings - -import numpy as np - -from dpdata.utils import open_file - -from .scf import ( - bohr2ang, - get_geometry_in, - get_mag_force, - kbar2evperang3, -) -from .stru import get_frame_from_stru - -# Read in geometries from an ABACUS MD trajectory. -# The atomic coordinates are read in from generated files in OUT.XXXX. -# Energies, forces -# IMPORTANT: the program defaultly takes STRU input file as standard cell information, -# therefore the direct and cartesan coordinates read could be different from the ones in -# the output cif files!!! -# It is highly recommanded to use ORTHOGANAL coordinates in STRU file if you wish to get -# same coordinates in both dpdata and output cif files. - - -def get_path_out(fname, inlines): - # This function is different from the same-name function in scf.py. - # This function returns OUT.XXXX's base directory. 
- path_out = os.path.join(fname, "OUT.ABACUS/") - for line in inlines: - if len(line) > 0 and "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - path_out = os.path.join(fname, f"OUT.{suffix}/") - break - return path_out - - -def get_coord_dump_freq(inlines): - for line in inlines: - if len(line) > 0 and "md_dumpfreq" in line and "md_dumpfreq" == line.split()[0]: - return int(line.split()[1]) - return 1 - - -def get_coords_from_dump(dumplines, natoms): - nlines = len(dumplines) - total_natoms = sum(natoms) - # The output of VIRIAL, FORCE, and VELOCITY are controlled by INPUT parameters dump_virial, dump_force, and dump_vel, respectively. - # So the search of keywords can determine whether these datas are printed into MD_dump. - calc_stress = False - calc_force = False - check_line = 6 - if "VIRIAL" in dumplines[6]: - calc_stress = True - check_line = 10 - assert "POSITION" in dumplines[check_line], ( - "keywords 'POSITION' cannot be found in the 6th line. Please check." - ) - if "FORCE" in dumplines[check_line]: - calc_force = True - - nframes_dump = -1 - if calc_stress: - nframes_dump = int(nlines / (total_natoms + 13)) - else: - nframes_dump = int(nlines / (total_natoms + 9)) - assert nframes_dump > 0, ( - "Number of lines in MD_dump file = %d. Number of atoms = %d. The MD_dump file is incomplete." 
# noqa: UP031 - % (nlines, total_natoms) - ) - cells = np.zeros([nframes_dump, 3, 3]) - stresses = np.zeros([nframes_dump, 3, 3]) - forces = np.zeros([nframes_dump, total_natoms, 3]) - coords = np.zeros([nframes_dump, total_natoms, 3]) - iframe = 0 - for iline in range(nlines): - if "MDSTEP" in dumplines[iline]: - # read in LATTICE_CONSTANT - # for abacus version >= v3.1.4, the unit is angstrom, and "ANGSTROM" is added at the end - # for abacus version < v3.1.4, the unit is bohr - celldm = float(dumplines[iline + 1].split()[1]) - newversion = True - if "Angstrom" not in dumplines[iline + 1]: - celldm *= bohr2ang # transfer unit to ANGSTROM - newversion = False - - # read in LATTICE_VECTORS - for ix in range(3): - cells[iframe, ix] = ( - np.array([float(i) for i in dumplines[iline + 3 + ix].split()[0:3]]) - * celldm - ) - if calc_stress: - stresses[iframe, ix] = np.array( - [float(i) for i in dumplines[iline + 7 + ix].split()[0:3]] - ) - - if calc_stress: - skipline = 11 - else: - skipline = 7 - - for iat in range(total_natoms): - # INDEX LABEL POSITION (Angstrom) FORCE (eV/Angstrom) VELOCITY (Angstrom/fs) - # 0 Sn 0.000000000000 0.000000000000 0.000000000000 -0.000000000000 -0.000000000001 -0.000000000001 0.001244557166 -0.000346684288 0.000768457739 - # 1 Sn 0.000000000000 3.102800034079 3.102800034079 -0.000186795145 -0.000453823768 -0.000453823768 0.000550996187 -0.000886442775 0.001579501983 - # for abacus version >= v3.1.4, the value of POSITION is the real cartessian position, and unit is angstrom, and if cal_force the VELOCITY is added at the end. 
- # for abacus version < v3.1.4, the real position = POSITION * celldm - coords[iframe, iat] = np.array( - [float(i) for i in dumplines[iline + skipline + iat].split()[2:5]] - ) - - if not newversion: - coords[iframe, iat] *= celldm - - if calc_force: - forces[iframe, iat] = np.array( - [ - float(i) - for i in dumplines[iline + skipline + iat].split()[5:8] - ] - ) - iframe += 1 - assert iframe == nframes_dump, ( - "iframe=%d, nframe_dump=%d. Number of frames does not match number of lines in MD_dump." # noqa: UP031 - % (iframe, nframes_dump) - ) - stresses *= kbar2evperang3 - return coords, cells, forces, stresses - - -def get_energy(outlines, ndump, dump_freq): - energy = [] - nenergy = 0 - for line_idx, line in enumerate(outlines): - if "final etot is" in line or "#TOTAL ENERGY#" in line: - if nenergy % dump_freq == 0: - energy.append(float(line.split()[-2])) - nenergy += 1 - elif "!! convergence has not been achieved" in line: - if nenergy % dump_freq == 0: - energy.append(np.nan) - nenergy += 1 - assert ndump == len(energy), ( - "Number of total energies in running_md.log = %d. Number of frames in MD_dump = %d. Please check." # noqa: UP031 - % (len(energy), ndump) - ) - energy = np.array(energy) - return energy - - -def get_frame(fname): - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU - path_out = get_path_out(fname, inlines) - - data = get_frame_from_stru(geometry_path_in) - natoms = data["atom_numbs"] - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - - # This coords is not to be used. 
- dump_freq = get_coord_dump_freq(inlines=inlines) - # ndump = int(os.popen("ls -l %s | grep 'md_pos_' | wc -l" %path_out).readlines()[0]) - # number of dumped geometry files - # coords = get_coords_from_cif(ndump, dump_freq, atom_names, natoms, types, path_out, cell) - with open_file(os.path.join(path_out, "MD_dump")) as fp: - dumplines = fp.read().split("\n") - coords, cells, force, stress = get_coords_from_dump(dumplines, natoms) - ndump = np.shape(coords)[0] - with open_file(os.path.join(path_out, "running_md.log")) as fp: - outlines = fp.read().split("\n") - energy = get_energy(outlines, ndump, dump_freq) - - unconv_stru = "" - for i, iene in enumerate(energy): - if np.isnan(iene): - coords = np.delete(coords, i - ndump, axis=0) - cells = np.delete(cells, i - ndump, axis=0) - force = np.delete(force, i - ndump, axis=0) - stress = np.delete(stress, i - ndump, axis=0) - energy = np.delete(energy, i - ndump, axis=0) - unconv_stru += "%d " % i # noqa: UP031 - ndump = len(energy) - if unconv_stru != "": - warnings.warn(f"Structure {unconv_stru} are unconverged and not collected!") - - for iframe in range(ndump): - stress[iframe] *= np.linalg.det(cells[iframe, :, :].reshape([3, 3])) - if np.sum(np.abs(stress[0])) < 1e-10: - stress = None - - magmom, magforce = get_mag_force(outlines) - - data["cells"] = cells - # for idx in range(ndump): - # data['cells'][:, :, :] = cell - data["coords"] = coords - data["energies"] = energy - data["forces"] = force - data["virials"] = stress - if not isinstance(data["virials"], np.ndarray): - del data["virials"] - data["orig"] = np.zeros(3) - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce - - # need to expand the move. 
- if "move" in data: - data["move"] = [data["move"][0] for i in range(ndump)] - - return data +from dpdata.formats.abacus.md import * # noqa: F403 diff --git a/dpdata/abacus/relax.py b/dpdata/abacus/relax.py index db60412b8..3d7a40315 100644 --- a/dpdata/abacus/relax.py +++ b/dpdata/abacus/relax.py @@ -1,265 +1,3 @@ from __future__ import annotations -import glob -import os - -import numpy as np - -from dpdata.utils import open_file - -from .scf import ( - bohr2ang, - collect_force, - collect_stress, - get_geometry_in, - get_mag_force, - kbar2evperang3, -) -from .stru import get_frame_from_stru - -# Read in geometries from an ABACUS RELAX(CELL-RELAX) trajectory in OUT.XXXX/runnning_relax/cell-relax.log. - - -def get_log_file(fname, inlines): - suffix = "ABACUS" - calculation = "scf" - for line in inlines: - if "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - elif "calculation" in line and "calculation" == line.split()[0]: - calculation = line.split()[1] - logf = os.path.join(fname, f"OUT.{suffix}/running_{calculation}.log") - return logf - - -def get_relax_stru_files(output_dir): - """Find the STRU files in the output directory. - - Args: - output_dir (str): output directory - - Returns - ------- - strus: list of STRU files - example: - ["STRU_ION1_D", "STRU_ION2_D"] - """ - return glob.glob(os.path.join(output_dir, "STRU_ION*_D")) - - -def get_coords_from_log(loglines, natoms, stru_files=None): - """NOTICE: unit of coords and cells is Angstrom - order: - coordinate - cell (no output if cell is not changed) - energy (no output, if SCF is not converged) - force (no output, if cal_force is not setted or abnormal ending) - stress (no output, if set cal_stress is not setted or abnormal ending). 
- """ - natoms_log = 0 - for line in loglines: - if line[13:41] == "number of atom for this type": - natoms_log += int(line.split()[-1]) - - assert natoms_log > 0 and natoms_log == natoms, ( - f"ERROR: detected atom number in log file is {natoms_log}, while the atom number in STRU file is {natoms}" - ) - - energy = [] - cells = [] - coords = [] - coord_direct = [] # if the coordinate is direct type or not - - for i in range(len(loglines)): - line = loglines[i] - if line[18:41] == "lattice constant (Bohr)": - a0 = float(line.split()[-1]) - elif len(loglines[i].split()) >= 2 and loglines[i].split()[1] == "COORDINATES": - # read coordinate information - coords.append([]) - direct_coord = False - if loglines[i].split()[0] == "DIRECT": - coord_direct.append(True) - for k in range(2, 2 + natoms): - coords[-1].append( - list(map(lambda x: float(x), loglines[i + k].split()[1:4])) - ) - elif loglines[i].split()[0] == "CARTESIAN": - coord_direct.append(False) - for k in range(2, 2 + natoms): - coords[-1].append( - list( - map( - lambda x: float(x) * a0 * bohr2ang, - loglines[i + k].split()[1:4], - ) - ) - ) - else: - assert False, "Unrecongnized coordinate type, %s, line:%d" % ( # noqa: UP031 - loglines[i].split()[0], - i, - ) - - elif ( - loglines[i][1:56] - == "Lattice vectors: (Cartesian coordinate: in unit of a_0)" - ): - # add the cell information for previous structures - while len(cells) < len(coords) - 1: - cells.append(cells[-1]) - # get current cell information - cells.append([]) - for k in range(1, 4): - cells[-1].append( - list( - map( - lambda x: float(x) * a0 * bohr2ang, - loglines[i + k].split()[0:3], - ) - ) - ) - - elif line[1:14] == "final etot is" or "#TOTAL ENERGY#" in line: - # add the energy for previous structures whose SCF is not converged - while len(energy) < len(coords) - 1: - energy.append(np.nan) - # get the energy of current structure - energy.append(float(line.split()[-2])) - - # in some relax method (like: bfgs_trad), the coordinate is not 
outputed in running_relax.log - # but if out_stru is true, then STRU_ION*_D will be outputed in OUT.ABACUS - # we should read cell and coord from STRU_ION*_D files - if len(energy) > 1 and len(coords) == 1: - # the energies of all structrues are collected, but coords have only the first structure - if ( - stru_files is not None and len(stru_files) > 1 - ): # if stru_files are not only STRU_ION_D - stru_file_name = [os.path.basename(i) for i in stru_files] - coords = coords[:1] + [np.nan for i in range(len(energy) - 1)] - coord_direct = coord_direct[:1] + [False for i in range(len(energy) - 1)] - cells = cells[:1] + [np.nan for i in range(len(energy) - 1)] - for iframe in range(1, len(energy)): - if f"STRU_ION{iframe}_D" in stru_file_name: - # read the structure from STRU_ION*_D - stru_data = get_frame_from_stru( - stru_files[stru_file_name.index(f"STRU_ION{iframe}_D")] - ) - coords[iframe] = stru_data["coords"][0] - cells[iframe] = stru_data["cells"][0] - - force = collect_force(loglines) - stress = collect_stress(loglines) - - # delete last structures which has no energy - while len(energy) < len(coords): - del coords[-1] - del coord_direct[-1] - - # add cells for last structures whose cell is not changed - while len(cells) < len(coords): - cells.append(cells[-1]) - - # only keep structures that have all of coord, force and stress - if len(stress) == 0 and len(force) == 0: - minl = len(coords) - elif len(stress) == 0: - minl = min(len(coords), len(force)) - force = force[:minl] - elif len(force) == 0: - minl = min(len(coords), len(stress)) - stress = stress[:minl] - else: - minl = min(len(coords), len(force), len(stress)) - force = force[:minl] - stress = stress[:minl] - - coords = coords[:minl] - energy = energy[:minl] - cells = cells[:minl] - - # delete structures whose energy is np.nan - for i in range(minl): - if ( - np.isnan(energy[i - minl]) - or np.any(np.isnan(coords[i - minl])) - or np.any(np.isnan(cells[i - minl])) - ): - del energy[i - minl] - del 
coords[i - minl] - del cells[i - minl] - del coord_direct[i - minl] - if len(force) > 0: - del force[i - minl] - if len(stress) > 0: - del stress[i - minl] - - energy = np.array(energy) - cells = np.array(cells) - coords = np.array(coords) - stress = np.array(stress) - force = np.array(force) - - # transfer direct coordinate to cartessian type - for i in range(len(coords)): - if coord_direct[i]: - coords[i] = coords[i].dot(cells[i]) - - if len(stress) > 0: - virial = np.zeros([len(cells), 3, 3]) - for i in range(len(cells)): - volume = np.linalg.det(cells[i, :, :].reshape([3, 3])) - virial[i] = stress[i] * kbar2evperang3 * volume - else: - virial = None - - return energy, cells, coords, force, stress, virial - - -def get_frame(fname): - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU - - data = get_frame_from_stru(geometry_path_in) - natoms = sum(data["atom_numbs"]) - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - - logf = get_log_file(fname, inlines) - assert os.path.isfile(logf), f"Error: can not find {logf}" - with open_file(logf) as f1: - lines = f1.readlines() - - relax_stru_files = get_relax_stru_files(os.path.dirname(logf)) - - energy, cells, coords, force, stress, virial = get_coords_from_log( - lines, natoms, stru_files=relax_stru_files - ) - - magmom, magforce = get_mag_force(lines) - - data["cells"] = cells - data["coords"] = coords - data["energies"] = energy - data["forces"] = force - if isinstance(virial, np.ndarray): - data["virials"] = virial - data["stress"] = stress - data["orig"] = np.zeros(3) - - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce 
- if "move" in data: - data["move"] = [data["move"][0] for i in range(len(data["energies"]))] - - return data +from dpdata.formats.abacus.relax import * # noqa: F403 diff --git a/dpdata/abacus/scf.py b/dpdata/abacus/scf.py index 167d3067f..2a2479998 100644 --- a/dpdata/abacus/scf.py +++ b/dpdata/abacus/scf.py @@ -1,255 +1,3 @@ from __future__ import annotations -import os -import re -import warnings - -import numpy as np - -from dpdata.utils import open_file - -from ..unit import LengthConversion, PressureConversion -from .stru import get_frame_from_stru - -bohr2ang = LengthConversion("bohr", "angstrom").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() - - -def CheckFile(ifile): - if not os.path.isfile(ifile): - print(f"Can not find file {ifile}") - return False - return True - - -def get_geometry_in(fname, inlines): - geometry_path_in = os.path.join(fname, "STRU") - for line in inlines: - if "stru_file" in line and "stru_file" == line.split()[0]: - atom_file = line.split()[1] - geometry_path_in = os.path.join(fname, atom_file) - break - return geometry_path_in - - -def get_path_out(fname, inlines): - path_out = os.path.join(fname, "OUT.ABACUS/running_scf.log") - for line in inlines: - if "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - path_out = os.path.join(fname, f"OUT.{suffix}/running_scf.log") - break - return path_out - - -def get_energy(outlines): - Etot = None - for line in reversed(outlines): - if "final etot is" in line: # for LTS - Etot = float(line.split()[-2]) # in eV - return Etot, True - elif "TOTAL ENERGY" in line: # for develop - Etot = float(line.split()[-2]) # in eV - return Etot, True - elif "convergence has NOT been achieved!" 
in line: - return Etot, False - elif "convergence has not been achieved" in line: - return Etot, False - - return Etot, False - - -def collect_force(outlines): - force = [] - for i, line in enumerate(outlines): - # if "TOTAL-FORCE (eV/Angstrom)" in line: - if "TOTAL-FORCE" in line: - value_pattern = re.compile( - r"^\s*[A-Z][a-z]?[1-9][0-9]*\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" - ) - j = i - # find the first line of force - noforce = False - while not value_pattern.match(outlines[j]): - j += 1 - if ( - j >= i + 10 - ): # if can not find the first line of force in 10 lines, then stop - warnings.warn("Warning: can not find the first line of force") - noforce = True - break - if noforce: - break - - force.append([]) - while value_pattern.match(outlines[j]): - force[-1].append([float(ii) for ii in outlines[j].split()[1:4]]) - j += 1 - return force # only return the last force - - -def get_force(outlines, natoms): - force = collect_force(outlines) - if len(force) == 0: - return None - else: - return np.array(force[-1]) # only return the last force - - -def collect_stress(outlines): - stress = [] - for i, line in enumerate(outlines): - # if "TOTAL-STRESS (KBAR)" in line: - if "TOTAL-STRESS" in line: - value_pattern = re.compile( - r"^\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" - ) - j = i - nostress = False - while not value_pattern.match(outlines[j]): - j += 1 - if ( - j >= i + 10 - ): # if can not find the first line of stress in 10 lines, then stop - warnings.warn("Warning: can not find the first line of stress") - nostress = True - break - if nostress: - break - - stress.append([]) - while value_pattern.match(outlines[j]): - stress[-1].append( - list(map(lambda x: float(x), outlines[j].split()[0:3])) - ) - j += 1 - return stress - - -def get_stress(outlines): - stress = collect_stress(outlines) 
- if len(stress) == 0: - return None - else: - return np.array(stress[-1]) * kbar2evperang3 # only return the last stress - - -def get_mag_force(outlines): - """Read atomic magmom and magnetic force from OUT.ABACUS/running_scf.log. - - Returns - ------- - magmom: list of list of atomic magnetic moments (three dimensions: ION_STEP * NATOMS * 1/3) - magforce: list of list of atomic magnetic forces (three dimensions: ION_STEP * NATOMS * 1/3) - e.g.: - ------------------------------------------------------------------------------------------- - Total Magnetism (uB) - ------------------------------------------------------------------------------------------- - Fe 0.0000000001 0.0000000000 3.0000000307 - Fe -0.0000000000 -0.0000000000 3.0000001151 - ------------------------------------------------------------------------------------------- - ------------------------------------------------------------------------------------------- - Magnetic force (eV/uB) - ------------------------------------------------------------------------------------------- - Fe 0.0000000000 0.0000000000 -1.2117698671 - Fe 0.0000000000 0.0000000000 -1.2117928796 - ------------------------------------------------------------------------------------------- - - """ - mags = [] - magforces = [] - for i, line in enumerate(outlines): - if "Total Magnetism (uB)" in line: - j = i + 2 - mag = [] - while "-------------------------" not in outlines[j]: - imag = [float(ii) for ii in outlines[j].split()[1:]] - if len(imag) == 1: - imag = [0, 0, imag[0]] - mag.append(imag) - j += 1 - mags.append(mag) - if "Magnetic force (eV/uB)" in line: - j = i + 2 - magforce = [] - while "-------------------------" not in outlines[j]: - imagforce = [float(ii) for ii in outlines[j].split()[1:]] - if len(imagforce) == 1: - imagforce = [0, 0, imagforce[0]] - magforce.append(imagforce) - j += 1 - magforces.append(magforce) - return np.array(mags), np.array(magforces) - - -def get_frame(fname): - data = { - "atom_names": [], - 
"atom_numbs": [], - "atom_types": [], - "cells": np.array([]), - "coords": np.array([]), - "energies": np.array([]), - "forces": np.array([]), - } - - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - - if not CheckFile(path_in): - return data - - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - - geometry_path_in = get_geometry_in(fname, inlines) - - # get OUT.ABACUS/running_scf.log - path_out = get_path_out(fname, inlines) - if not (CheckFile(geometry_path_in) and CheckFile(path_out)): - return data - with open_file(path_out) as fp: - outlines = fp.read().split("\n") - - # get energy - energy, converge = get_energy(outlines) - if not converge: - return data - - # read STRU file - data = get_frame_from_stru(geometry_path_in) - natoms = sum(data["atom_numbs"]) - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - move = data.pop("move", None) - - # get magmom and magforce, force and stress - magmom, magforce = get_mag_force(outlines) - if len(magmom) > 0: - magmom = magmom[-1:] - if len(magforce) > 0: - magforce = magforce[-1:] - - force = get_force(outlines, natoms) - stress = get_stress(outlines) - - data["energies"] = np.array(energy)[np.newaxis] - data["forces"] = np.empty((0,)) if force is None else force[np.newaxis, :, :] - data["orig"] = np.zeros(3) - if stress is not None: - cell = data["cells"][0] - stress *= np.abs(np.linalg.det(cell)) - data["virials"] = stress[np.newaxis, :, :] - - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce - if move is not None: - data["move"] = move - return data +from dpdata.formats.abacus.scf import * # noqa: F403 diff --git a/dpdata/abacus/stru.py b/dpdata/abacus/stru.py index 50ec2cb72..3ea68177d 100644 --- a/dpdata/abacus/stru.py +++ 
b/dpdata/abacus/stru.py @@ -1,820 +1,3 @@ from __future__ import annotations -import os -import re -import warnings - -import numpy as np - -from ..unit import LengthConversion - -bohr2ang = LengthConversion("bohr", "angstrom").value() - - -def split_stru_block(lines): - """Split the ABACUS STRU file into blocks by keyword. - - Args: - lines (list): list of lines in the ABACUS STRU file. - - Returns - ------- - dict: dictionary of blocks. - """ - - def clean_comment(line): - return re.split("[#]", line)[0] - - ABACUS_STRU_KEYS = [ - "ATOMIC_SPECIES", - "NUMERICAL_ORBITAL", - "LATTICE_CONSTANT", - "LATTICE_VECTORS", - "ATOMIC_POSITIONS", - "NUMERICAL_DESCRIPTOR", - "PAW_FILES", - ] - blocks = {i: [] for i in ABACUS_STRU_KEYS} - i = 0 - while i < len(lines): - line = clean_comment(lines[i]).strip() - if line in ABACUS_STRU_KEYS: - key = line - for j in range(i + 1, len(lines)): - if clean_comment(lines[j]).strip() == "": - continue - elif clean_comment(lines[j]).strip() in ABACUS_STRU_KEYS: - break - else: - blocks[key].append(clean_comment(lines[j])) - i = j - else: - i += 1 - - return blocks - - -def parse_atomic_species_block(lines): - """Parse the ATOMIC_SPECIES block. - - Args: - lines (list): list of lines in the ATOMIC_SPECIES block. - - Returns - ------- - tuple: tuple of atom_names, masses, and pp_files. - - """ - atom_names, masses, pp_files = [], [], [] - for line in lines: - line = line.split() - atom_names.append(line[0]) - masses.append(float(line[1])) - - # for standard STRU, the pseudo potential file is required, - # but it is not required for dpdata. - if len(line) > 2: - pp_files.append(line[2]) - else: - pp_files.append(None) - - return atom_names, masses, pp_files - - -def parse_numerical_orbital_block(lines): - """Parse the NUMERICAL_ORBITAL block. - - Args: - lines (list): list of lines in the NUMERICAL_ORBITAL block. - - Returns - ------- - list: list of orbital files. 
- """ - return [line.strip() for line in lines] - - -def parse_lattice_constant_block(lines): - """Parse the LATTICE_CONSTANT block. - - Args: - lines (list): list of lines in the LATTICE_CONSTANT block. - - Returns - ------- - float: the lattice constant. - """ - return float(lines[0]) - - -def parse_lattice_vectors_block(lines): - """Parse the LATTICE_VECTORS block. - - Args: - lines (list): list of lines in the LATTICE_VECTORS block. - - Returns - ------- - np.ndarray: the cell vectors. - """ - cell = np.zeros((3, 3)) - for i, line in enumerate(lines): - cell[i] = [float(x) for x in line.split()] - return cell - - -def parse_pos_oneline(pos_line): - """Parses a line from the atom position block in a structure file. - - The content in atom position block can include: - - `m` or NO key word: Three numbers (0 or 1) controlling atom movement in geometry relaxation calculations. - - `v`, `vel`, or `velocity`: Three components of initial velocity of atoms in geometry relaxation calculations. - - `mag` or `magmom`: Start magnetization for each atom. Can be one number (colinear) or three numbers (non-colinear). - - `angle1`: In non-colinear case, angle between c-axis and real spin (in degrees). - - `angle2`: In non-colinear case, angle between a-axis and real spin projection in ab-plane (in degrees). - - `cs` or `constrain`: Three numbers (0 or 1) controlling the spin constraint of the atom. - - `lambda`: Three numbers controlling the lambda of the atom. - - Parameters - ---------- - pos_line : A line from the atom position block. - - Returns - ------- - tuple: A tuple containing: - - pos (list of float): The position coordinates. - - move (list of int or None): Movement control values. - - velocity (list of float or None): Initial velocity components. - - magmom (float, list of float, or None): Magnetization values. - - angle1 (float or None): Angle1 value. - - angle2 (float or None): Angle2 value. - - constrain (list of bool or None): Spin constraint values. 
- - lambda1 (float, list of float, or None): Lambda values. - - e.g.: - ``` - Fe - 1.0 - 2 - 0.0 0.0 0.0 m 0 0 0 mag 1.0 angle1 90 angle2 0 cs 0 0 0 - 0.5 0.5 0.5 m 1 1 1 mag 1.0 angle1 90 angle2 180 - ``` - """ - pos_line = pos_line.split("#")[0] # remove comments - sline = pos_line.split() - pos = [float(i) for i in sline[:3]] - move = None - velocity = None - magmom = None - angle1 = None - angle2 = None - constrain = None - lambda1 = None - if len(sline) > 3: - mag_list = None - velocity_list = None - move_list = [] - angle1_list = None - angle2_list = None - constrain_list = None - lambda_list = None - label = "move" - for i in range(3, len(sline)): - # firstly read the label - if sline[i] == "m": - label = "move" - elif sline[i] in ["v", "vel", "velocity"]: - label = "velocity" - velocity_list = [] - elif sline[i] in ["mag", "magmom"]: - label = "magmom" - mag_list = [] - elif sline[i] == "angle1": - label = "angle1" - angle1_list = [] - elif sline[i] == "angle2": - label = "angle2" - angle2_list = [] - elif sline[i] in ["constrain", "sc"]: - label = "constrain" - constrain_list = [] - elif sline[i] in ["lambda"]: - label = "lambda" - lambda_list = [] - - # the read the value to the list - elif label == "move": - move_list.append(int(sline[i])) - elif label == "velocity": - velocity_list.append(float(sline[i])) - elif label == "magmom": - mag_list.append(float(sline[i])) - elif label == "angle1": - angle1_list.append(float(sline[i])) - elif label == "angle2": - angle2_list.append(float(sline[i])) - elif label == "constrain": - constrain_list.append(bool(int(sline[i]))) - elif label == "lambda": - lambda_list.append(float(sline[i])) - - if move_list is not None and len(move_list) > 0: - if len(move_list) == 3: - move = move_list - else: - raise RuntimeError(f"Invalid setting of move: {pos_line}") - - if velocity_list is not None: - if len(velocity_list) == 3: - velocity = velocity_list - else: - raise RuntimeError(f"Invalid setting of velocity: {pos_line}") - 
- if mag_list is not None: - if len(mag_list) == 3: - magmom = mag_list - elif len(mag_list) == 1: - magmom = mag_list[0] - else: - raise RuntimeError(f"Invalid magnetic moment {pos_line}") - - if angle1_list is not None: - if len(angle1_list) == 1: - angle1 = angle1_list[0] - else: - raise RuntimeError(f"Invalid angle1 {pos_line}") - - if angle2_list is not None: - if len(angle2_list) == 1: - angle2 = angle2_list[0] - else: - raise RuntimeError(f"Invalid angle2 {pos_line}") - - if constrain_list is not None: - if len(constrain_list) == 3: - constrain = constrain_list - elif len(constrain_list) == 1: - constrain = constrain_list[0] - else: - raise RuntimeError(f"Invalid constrain {pos_line}") - - if lambda_list is not None: - if len(lambda_list) == 3: - lambda1 = lambda_list - elif len(lambda_list) == 1: - lambda1 = lambda_list[0] - else: - raise RuntimeError(f"Invalid lambda {pos_line}") - - return pos, move, velocity, magmom, angle1, angle2, constrain, lambda1 - - -def get_atom_mag_cartesian(atommag, angle1, angle2): - """Transform atommag, angle1, angle2 to magmom in cartesian coordinates. - - Parameters - ---------- - atommag : float/list of float/None - Atom magnetic moment. - angle1 : float/None - value of angle1. - angle2 : float/None - value of angle2. - ABACUS support defining mag, angle1, angle2 at the same time. - angle1 is the angle between z-axis and real spin (in degrees). - angle2 is the angle between x-axis and real spin projection in xy-plane (in degrees). - If only mag is defined, then transfer it to magmom directly. - And if mag, angle1, angle2 are defined, then mag is only the norm of magmom, and the direction is defined by angle1 and angle2. 
- """ - if atommag is None: - return None - if not (isinstance(atommag, list) or isinstance(atommag, float)): - raise RuntimeError(f"Invalid atommag: {atommag}") - - if angle1 is None and angle2 is None: - if isinstance(atommag, list): - return atommag - else: - return [0, 0, atommag] - else: - a1 = 0 - a2 = 0 - if angle1 is not None: - a1 = angle1 - if angle2 is not None: - a2 = angle2 - if isinstance(atommag, list): - mag_norm = np.linalg.norm(atommag) - else: - mag_norm = atommag - return [ - mag_norm * np.sin(np.radians(a1)) * np.cos(np.radians(a2)), - mag_norm * np.sin(np.radians(a1)) * np.sin(np.radians(a2)), - mag_norm * np.cos(np.radians(a1)), - ] - - -def get_cartesian_coords(coords, coord_type, celldm, cell): - """Transform the atomic coordinates to cartesian coordinates. - - Args: - coords (np.ndarray): atomic coordinates read from the STRU file. - coord_type (str): the coordination type, either "cartesian" or "direct". - celldm (float): the lattice constant. - cell (np.ndarray): the cell vectors in angstrom. - - Returns - ------- - np.ndarray: the cartesian coordinates in angstrom. - """ - if coord_type == "cartesian": - return coords * celldm * bohr2ang - elif coord_type == "direct": - return np.matmul(coords, cell) - else: - raise RuntimeError(f"Invalid coordination type: {coord_type}") - - -def parse_pos(coords_lines, atom_names, celldm, cell): - """Read the atomic positions block in the ABACUS STRU file. - - Args: - coords_lines (list): list of lines in the atomic positions block. - atom_names (list): list of atom names. - celldm (float): the lattice constant. - cell (np.ndarray): the cell vectors in angstrom, and has multipy celldm. - - Returns - ------- - tuple: tuple of atom_numbs, coords, move, mags, velocity, sc, lambda_ - Note: for atomic magnetic moment, we finnaly transform it to non-collinear magnetic moment in cartesian coordinates, - and do not return the angle1 and angle2, and the magnetic moment of each atom type. 
- - """ - coord_type = coords_lines[0].split()[0].lower() # cartisan or direct - atom_numbs = [] # the number of each atom type - coords = [] # coordinations of atoms - move = [] # move flag of each atom - velocity = [] # velocity of each atom - mags = [] # magnetic moment of each atom - sc = [] # spin constraint flag of each atom - lambda_ = [] # lambda of each atom - - ntype = len(atom_names) - line_idx = 1 # starting line of first element - define_atom_mag = False - for it in range(ntype): - atom_name = coords_lines[line_idx].split()[0] - if atom_name != atom_names[it]: - raise RuntimeError( - f"Read atom name '{atom_name}' is not equal to the expected atom name '{atom_names[it]}'" - ) - atom_type_mag = float(coords_lines[line_idx + 1].split()[0]) - line_idx += 2 - atom_numbs.append(int(coords_lines[line_idx].split()[0])) - line_idx += 1 - for iline in range(atom_numbs[it]): - pos, imove, ivelocity, imagmom, iangle1, iangle2, iconstrain, ilambda1 = ( - parse_pos_oneline(coords_lines[line_idx]) - ) - - coords.append(get_cartesian_coords(np.array(pos), coord_type, celldm, cell)) - - move.append(imove) - velocity.append(ivelocity) - sc.append(iconstrain) - lambda_.append(ilambda1) - - # calculate the magnetic moment in cartesian coordinates - mag = get_atom_mag_cartesian(imagmom, iangle1, iangle2) - if mag is None: - mag = [0, 0, atom_type_mag] - mags.append(mag) - - if imagmom is not None: - define_atom_mag = True - - line_idx += 1 - coords = np.array(coords) # need transformation!!! - - if all([i is None for i in move]): - move = [] - else: - move = np.array(move, dtype=bool) - - if all([i is None for i in velocity]): - velocity = [] - else: - velocity = np.array(velocity) - - if all([i is None for i in sc]): - sc = [] - - if all([i is None for i in lambda_]): - lambda_ = [] - - # here return the magnetic moment only when the atom magnetic moment is specified. 
- if not define_atom_mag: - mags = [] - else: - mags = np.array(mags) - - return atom_numbs, coords, move, mags, velocity, sc, lambda_ - - -def right_hand_rule( - cell: np.ndarray, coord: np.ndarray -) -> tuple[np.ndarray, np.ndarray]: - """Rotate the cell and coord to make the cell fit the right-hand rule. - - Args: - cell (np.ndarray): the cell vectors. - coord (np.ndarray): the atomic coordinates in cartesian. - - Returns - ------- - tuple: the rotated cell and coord. - """ - if np.linalg.det(cell) < 0: - cell = -cell - coord = -coord - return cell, coord - - -def get_frame_from_stru(stru): - """Read the ABACUS STRU file and return the dpdata frame. - - The description of ABACUS STRU can be found in https://abacus.deepmodeling.com/en/latest/advanced/input_files/stru.html - - Args: - stru (str): path to the ABACUS STRU file. - - Returns - ------- - data: the parsed stru information in dictionary. - { - "atom_names": list of atom names, - "atom_numbs": list of atom numbers, - "atom_types": list of atom types, - "masses": list of atomic masses, - "pp_files", list of pseudo potential files, - "orb_files", list of orbital files, - "dpks_descriptor": the deepks descriptor file, - - # below are the information in each frame - - "cells": list of cell vectors, - "coords": list of atomic coordinates, - "spins": list of magnetic moments, # return only when set "mag xxx" for each atom in STRU file - "moves": list of move flags, - } - For some keys, if the information is not provided in the STRU file, then it will not be included in the dictionary. - "spins" is designed for delta spin calculation, and when dpdata.System is write to lmp format, the spin will be written as magmom. - But we should note that this file format is valid only for a spin lammps job, not for a normal job. - If you want to use dpgen to run the non-spin job, then you should not define "mag x x x" in the STRU file. 
- """ - if not os.path.isfile(stru): - raise FileNotFoundError(f"ABACUS STRU file {stru} not found!!!") - - # 1. read the file and split the lines to blocks - with open(stru) as f: - lines = f.readlines() - blocks = split_stru_block(lines) - - # 2. parse the blocks - atom_names, masses, pp_files = parse_atomic_species_block(blocks["ATOMIC_SPECIES"]) - orb_files = parse_numerical_orbital_block(blocks.get("NUMERICAL_ORBITAL", [])) - dpks_descriptor = blocks.get("NUMERICAL_DESCRIPTOR", []) - celldm = parse_lattice_constant_block(blocks["LATTICE_CONSTANT"]) - cell = parse_lattice_vectors_block(blocks["LATTICE_VECTORS"]) - cell = np.array(cell) * celldm * bohr2ang - atom_numbs, coords, move, mags, velocity, sc, lambda_ = parse_pos( - blocks["ATOMIC_POSITIONS"], atom_names, celldm, cell - ) - - cell, coords = right_hand_rule(cell, coords) - data = { - "atom_names": atom_names, - "atom_numbs": atom_numbs, - "atom_types": np.array( - [i for i in range(len(atom_numbs)) for j in range(atom_numbs[i])] - ), - "masses": np.array(masses), - "pp_files": pp_files, - "cells": np.array([cell]), - "coords": np.array([coords]), - } - if len(mags) > 0: - data["spins"] = np.array([mags]) - if len(orb_files) > 0: - data["orb_files"] = orb_files - if len(dpks_descriptor) > 0: - data["dpks_descriptor"] = dpks_descriptor[0].strip() - if len(move) > 0: - data["move"] = np.array([move]) - - return data - - -def make_unlabeled_stru( - data, - frame_idx, - pp_file=None, - numerical_orbital=None, - numerical_descriptor=None, - mass=None, - move=None, - velocity=None, - mag=None, - angle1=None, - angle2=None, - sc=None, - lambda_=None, - link_file=False, - dest_dir=None, - **kwargs, -): - """Make an unlabeled STRU file from a dictionary. 
- - Parameters - ---------- - data : dict - System data - frame_idx : int - The index of the frame to dump - pp_file : list of string or dict - List of pseudo potential files, or a dictionary of pseudo potential files for each atomnames - numerical_orbital : list of string or dict, optional - List of orbital files, or a dictionary of orbital files for each atomnames - numerical_descriptor : str, optional - numerical descriptor file - mass : list of float, optional - List of atomic masses - move : list of (list of list of bool), optional - List of the move flag of each xyz direction of each atom for each frame - velocity : list of list of float, optional - List of the velocity of each xyz direction of each atom - mag : list of (list of float or float), optional - List of the magnetic moment of each atom, can be a list of three floats or one float - For noncollinear, three floats are the xyz component of the magnetic moment. - For collinear, one float is the norm of the magnetic moment. - angle1 : list of float, optional - List of the angle1 of each atom. For noncollinear calculation, it is the angle between the magnetic moment and the z-axis. - angle2 : list of float, optional - List of the angle2 of each atom. For noncollinear calculation, it is the angle between the projection of magnetic moment on xy plane and the x-axis. - sc : list of (bool or list of 3 bool), optional - List of the spin constraint flag of each atom. Each element can be a bool or a list of three bools or None. - lambda_ : list of (float or list of 3 float), optional - List of the lambda of each atom. Each element can be a float or a list of three floats. - link_file : bool, optional - Whether to link the pseudo potential files and orbital files in the STRU file. - If True, then only filename will be written in the STRU file, and make a soft link to the real file. - dest_dir : str, optional - The destination directory to make the soft link of the pseudo potential files and orbital files. 
- For velocity, mag, angle1, angle2, sc, and lambda_, if the value is None, then the corresponding information will not be written. - ABACUS support defining "mag" and "angle1"/"angle2" at the same time, and in this case, the "mag" only define the norm of the magnetic moment, and "angle1" and "angle2" define the direction of the magnetic moment. - If data has spins, then it will be written as mag to STRU file; while if mag is passed at the same time, then mag will be used. - """ - - def _link_file(dest_dir, src_file): - if not os.path.isfile(src_file): - print(f"ERROR: link_file: {src_file} is not a file.") - return False - src_file = os.path.abspath(src_file) - if not os.path.isdir(dest_dir): - os.makedirs(dest_dir) - dest_file = os.path.join(dest_dir, os.path.basename(src_file)) - if os.path.isfile(dest_file): - if os.path.samefile(src_file, dest_file): - return True - else: - os.remove(dest_file) - os.symlink(src_file, dest_file) - return True - - def ndarray2list(i): - if isinstance(i, np.ndarray): - return i.tolist() - else: - return i - - def process_file_input(file_input, atom_names, input_name): - # For pp_file and numerical_orbital, process the file input, and return a list of file names - # file_input can be a list of file names, or a dictionary of file names for each atom names - if isinstance(file_input, (list, tuple)): - if len(file_input) != len(atom_names): - raise ValueError( - f"{input_name} length is not equal to the number of atom types" - ) - return file_input - elif isinstance(file_input, dict): - for element in atom_names: - if element not in file_input: - raise KeyError(f"{input_name} does not contain {element}") - return [file_input[element] for element in atom_names] - else: - raise ValueError(f"Invalid {input_name}: {file_input}") - - if link_file and dest_dir is None: - print( - "WARNING: make_unlabeled_stru: link_file is True, but dest_dir is None. Will write the filename to STRU but not making soft link." 
- ) - if dest_dir is not None and dest_dir.strip() == "": - dest_dir = "." - - # check the input data - if mass is None and data.get("masses") is not None and len(data["masses"]) > 0: - mass = data["masses"] - - if ( - pp_file is None - and data.get("pp_files") is not None - and len(data["pp_files"]) > 0 - ): - pp_file = data["pp_files"] - - if ( - numerical_orbital is None - and data.get("orb_files") is not None - and len(data["orb_files"]) > 0 - ): - numerical_orbital = data["orb_files"] - - if numerical_descriptor is None and data.get("dpks_descriptor") is not None: - numerical_descriptor = data["dpks_descriptor"] - - if mag is None and data.get("spins") is not None and len(data["spins"]) > 0: - mag = data["spins"][frame_idx] - - if move is None and data.get("move", None) is not None and len(data["move"]) > 0: - move = data["move"][frame_idx] - - # check the length of the input data - atom_numbs = sum(data["atom_numbs"]) - for key in [move, velocity, mag, angle1, angle2, sc, lambda_]: - if key is not None: - if ( - not isinstance(ndarray2list(key), (list, tuple)) - and len(key) != atom_numbs - ): - key_name = [name for name, value in locals().items() if value is key][0] - print( - f"ERROR: make_unlabeled_stru: the length of '{key_name}' ({len(key)}) should be equal to the number of atom number ({atom_numbs})." - ) - return "" - - # ATOMIC_SPECIES block - out = "ATOMIC_SPECIES\n" - if pp_file is not None: - ppfiles = process_file_input( - ndarray2list(pp_file), data["atom_names"], "pp_file" - ) - else: - warnings.warn( - "pp_file is not provided, will use empty string for pseudo potential file." 
- ) - ppfiles = [""] * len(data["atom_names"]) - - for iele in range(len(data["atom_names"])): - if data["atom_numbs"][iele] == 0: - continue - out += data["atom_names"][iele] + " " - if mass is not None: - out += f"{mass[iele]:.3f} " - else: - out += "1 " - - ipp_file = ppfiles[iele] - if ipp_file != "": - if not link_file: - out += ipp_file - else: - out += os.path.basename(ipp_file.rstrip("/")) - if dest_dir is not None: - _link_file(dest_dir, ipp_file) - out += "\n" - out += "\n" - - # NUMERICAL_ORBITAL block - if numerical_orbital is not None: - numerical_orbital = ndarray2list(numerical_orbital) - orbfiles = process_file_input( - numerical_orbital, data["atom_names"], "numerical_orbital" - ) - orbfiles = [ - orbfiles[i] - for i in range(len(data["atom_names"])) - if data["atom_numbs"][i] != 0 - ] - out += "NUMERICAL_ORBITAL\n" - for iorb in orbfiles: - if not link_file: - out += iorb - else: - out += os.path.basename(iorb.rstrip("/")) - if dest_dir is not None: - _link_file(dest_dir, iorb) - out += "\n" - out += "\n" - - # deepks block - if numerical_descriptor is not None: - assert isinstance(numerical_descriptor, str) - if not link_file: - out += f"NUMERICAL_DESCRIPTOR\n{numerical_descriptor}\n" - else: - out += f"NUMERICAL_DESCRIPTOR\n{os.path.basename(numerical_descriptor)}\n" - if dest_dir is not None: - _link_file(dest_dir, numerical_descriptor) - out += "\n" - - # LATTICE_CONSTANT and LATTICE_VECTORS block - out += "LATTICE_CONSTANT\n" - out += str(1 / bohr2ang) + "\n\n" - - out += "LATTICE_VECTORS\n" - for ix in range(3): - for iy in range(3): - out += str(data["cells"][frame_idx][ix][iy]) + " " - out += "\n" - out += "\n" - - # ATOMIC_POSITIONS block - out += "ATOMIC_POSITIONS\n" - out += "Cartesian # Cartesian(Unit is LATTICE_CONSTANT)\n" - # ret += "\n" - natom_tot = 0 # in for loop, it is also the atom index - for iele in range(len(data["atom_names"])): - if data["atom_numbs"][iele] == 0: - continue - out += data["atom_names"][iele] + "\n" - out 
+= "0.0\n" - out += str(data["atom_numbs"][iele]) + "\n" - for iatom in range(data["atom_numbs"][iele]): - iatomtype = np.nonzero(data["atom_types"] == iele)[0][ - iatom - ] # it is the atom index - iout = f"{data['coords'][frame_idx][iatomtype, 0]:.12f} {data['coords'][frame_idx][iatomtype, 1]:.12f} {data['coords'][frame_idx][iatomtype, 2]:.12f}" - # add flags for move, velocity, mag, angle1, angle2, and sc - if move is not None: - if ( - isinstance(ndarray2list(move[iatomtype]), (list, tuple)) - and len(move[iatomtype]) == 3 - ): - iout += " " + " ".join( - ["1" if ii else "0" for ii in move[iatomtype]] - ) - elif isinstance(ndarray2list(move[iatomtype]), (int, float, bool)): - iout += " 1 1 1" if move[iatomtype] else " 0 0 0" - else: - iout += " 1 1 1" - - if ( - velocity is not None - and isinstance(ndarray2list(velocity[iatomtype]), (list, tuple)) - and len(velocity[iatomtype]) == 3 - ): - iout += " v " + " ".join([f"{ii:.12f}" for ii in velocity[iatomtype]]) - - if mag is not None: - if isinstance(ndarray2list(mag[iatomtype]), (list, tuple)) and len( - mag[iatomtype] - ) in [1, 3]: - iout += " mag " + " ".join([f"{ii:.12f}" for ii in mag[iatomtype]]) - elif isinstance(ndarray2list(mag[iatomtype]), (int, float)): - iout += " mag " + f"{mag[iatomtype]:.12f}" - - if angle1 is not None and isinstance( - ndarray2list(angle1[iatomtype]), (int, float) - ): - iout += " angle1 " + f"{angle1[iatomtype]:.12f}" - - if angle2 is not None and isinstance( - ndarray2list(angle2[iatomtype]), (int, float) - ): - iout += " angle2 " + f"{angle2[iatomtype]:.12f}" - - if sc is not None: - if isinstance(ndarray2list(sc[iatomtype]), (list, tuple)) and len( - sc[iatomtype] - ) in [1, 3]: - iout += " sc " + " ".join( - ["1" if ii else "0" for ii in sc[iatomtype]] - ) - elif isinstance(ndarray2list(sc[iatomtype]), (int, float, bool)): - iout += " sc " + "1" if sc[iatomtype] else "0" - - if lambda_ is not None: - if isinstance(ndarray2list(lambda_[iatomtype]), (list, tuple)) and len( - 
lambda_[iatomtype] - ) in [1, 3]: - iout += " lambda " + " ".join( - [f"{ii:.12f}" for ii in lambda_[iatomtype]] - ) - elif isinstance(ndarray2list(lambda_[iatomtype]), (int, float)): - iout += " lambda " + f"{lambda_[iatomtype]:.12f}" - - out += iout + "\n" - natom_tot += 1 - assert natom_tot == sum(data["atom_numbs"]) - return out +from dpdata.formats.abacus.stru import * # noqa: F403 diff --git a/dpdata/amber/__init__.py b/dpdata/amber/__init__.py index e69de29bb..6d20c397a 100644 --- a/dpdata/amber/__init__.py +++ b/dpdata/amber/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.amber import * # noqa: F403 diff --git a/dpdata/amber/mask.py b/dpdata/amber/mask.py index 155e2a7be..177b34788 100644 --- a/dpdata/amber/mask.py +++ b/dpdata/amber/mask.py @@ -1,42 +1,3 @@ -"""Amber mask.""" - from __future__ import annotations -try: - import parmed -except ImportError: - pass - - -def pick_by_amber_mask(param, maskstr, coords=None): - """Pick atoms by amber masks. 
- - Parameters - ---------- - param : str or parmed.Structure - filename of Amber param file or parmed.Structure - maskstr : str - Amber masks - coords : np.ndarray (optional) - frame coordinates, shape: N*3 - """ - parm = load_param_file(param) - if coords is not None: - parm.initialize_topology(xyz=coords) - sele = [] - if len(maskstr) > 0: - newmaskstr = maskstr.replace("@0", "!@*") - sele = [ - parm.atoms[i].idx - for i in parmed.amber.mask.AmberMask(parm, newmaskstr).Selected() - ] - return sele - - -def load_param_file(param_file): - if isinstance(param_file, str): - return parmed.load_file(param_file) - elif isinstance(param_file, parmed.Structure): - return param_file - else: - raise RuntimeError("Unsupported structure") +from dpdata.formats.amber.mask import * # noqa: F403 diff --git a/dpdata/amber/md.py b/dpdata/amber/md.py index 06d9e2032..37a1186af 100644 --- a/dpdata/amber/md.py +++ b/dpdata/amber/md.py @@ -1,190 +1,3 @@ from __future__ import annotations -import os -import re - -import numpy as np - -from dpdata.amber.mask import pick_by_amber_mask -from dpdata.unit import EnergyConversion -from dpdata.utils import open_file - -from ..periodic_table import ELEMENTS - -kcalmol2eV = EnergyConversion("kcal_mol", "eV").value() -symbols = ["X"] + ELEMENTS - -energy_convert = kcalmol2eV -force_convert = energy_convert - - -def cell_lengths_angles_to_cell( - cell_lengths: np.ndarray, cell_angles: np.ndarray -) -> np.ndarray: - """Convert cell lengths and angles to cell vectors. - - Parameters - ---------- - cell_lengths - Cell lengths with shape ``(..., 3)`` where the last dimension is - ``a, b, c``. - cell_angles - Cell angles in degrees with shape ``(..., 3)`` where the last dimension - is ``alpha, beta, gamma``. - - Returns - ------- - np.ndarray - Cell vectors with shape ``(..., 3, 3)``. 
- """ - alpha = np.deg2rad(cell_angles[..., 0]) - beta = np.deg2rad(cell_angles[..., 1]) - gamma = np.deg2rad(cell_angles[..., 2]) - - a = cell_lengths[..., 0] - b = cell_lengths[..., 1] - c = cell_lengths[..., 2] - - if np.any(cell_lengths <= 0.0): - raise RuntimeError("Invalid AMBER cell lengths") - if np.any((cell_angles <= 0.0) | (cell_angles >= 180.0)): - raise RuntimeError("Invalid AMBER cell angles") - - cos_alpha = np.cos(alpha) - cos_beta = np.cos(beta) - cos_gamma = np.cos(gamma) - sin_gamma = np.sin(gamma) - ly = b * sin_gamma - if np.any(ly <= 1e-8): - raise RuntimeError("Invalid AMBER cell angles") - - z_factor = ( - 1 - - cos_alpha**2 - - cos_beta**2 - - cos_gamma**2 - + 2 * cos_alpha * cos_beta * cos_gamma - ) - lz2 = c**2 * z_factor / sin_gamma**2 - if np.any(lz2 <= 1e-8): - raise RuntimeError("Invalid AMBER cell angles") - - z = np.sqrt(z_factor) / sin_gamma - - shape = (*cell_lengths.shape[:-1], 3, 3) - cells = np.zeros(shape) - cells[..., 0, 0] = a - cells[..., 1, 0] = b * cos_gamma - cells[..., 1, 1] = b * sin_gamma - cells[..., 2, 0] = c * cos_beta - cells[..., 2, 1] = c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma - cells[..., 2, 2] = c * z - return cells - - -def read_amber_traj( - parm7_file, - nc_file, - mdfrc_file=None, - mden_file=None, - mdout_file=None, - use_element_symbols=None, - labeled=True, -): - """The amber trajectory includes: - * nc, NetCDF format, stores coordinates - * mdfrc, NetCDF format, stores forces - * mden (optional), text format, stores energies - * mdout (optional), text format, may store energies if there is no mden_file - * parm7, text format, stores types. - - Parameters - ---------- - parm7_file, nc_file, mdfrc_file, mden_file, mdout_file: - filenames - use_element_symbols : None or list or str - If use_element_symbols is a list of atom indexes, these atoms will use element symbols - instead of amber types. For example, a ligand will use C, H, O, N, and so on - instead of h1, hc, o, os, and so on. 
- IF use_element_symbols is str, it will be considered as Amber mask. - labeled : bool - Whether to return labeled data - """ - from scipy.io import netcdf_file - - flag_atom_type = False - flag_atom_numb = False - amber_types = [] - atomic_number = [] - with open_file(parm7_file) as f: - for line in f: - if line.startswith("%FLAG"): - flag_atom_type = line.startswith("%FLAG AMBER_ATOM_TYPE") - flag_atom_numb = (use_element_symbols is not None) and line.startswith( - "%FLAG ATOMIC_NUMBER" - ) - elif flag_atom_type or flag_atom_numb: - if line.startswith("%FORMAT"): - fmt = re.findall(r"\d+", line) - fmt0 = int(fmt[0]) - fmt1 = int(fmt[1]) - else: - for ii in range(fmt0): - start_index = ii * fmt1 - end_index = (ii + 1) * fmt1 - if end_index >= len(line): - continue - content = line[start_index:end_index].strip() - if flag_atom_type: - amber_types.append(content) - elif flag_atom_numb: - atomic_number.append(int(content)) - if use_element_symbols is not None: - if isinstance(use_element_symbols, str): - use_element_symbols = pick_by_amber_mask(parm7_file, use_element_symbols) - for ii in use_element_symbols: - amber_types[ii] = symbols[atomic_number[ii]] - - with netcdf_file(nc_file, "r") as f: - coords = np.array(f.variables["coordinates"][:]) - cell_lengths = np.array(f.variables["cell_lengths"][:]) - cell_angles = np.array(f.variables["cell_angles"][:]) - cells = cell_lengths_angles_to_cell(cell_lengths, cell_angles) - - if labeled: - with netcdf_file(mdfrc_file, "r") as f: - forces = np.array(f.variables["forces"][:]) - - # load energy from mden_file or mdout_file - energies = [] - if mden_file is not None and os.path.isfile(mden_file): - with open_file(mden_file) as f: - for line in f: - if line.startswith("L6"): - s = line.split() - if s[2] != "E_pot": - energies.append(float(s[2])) - elif mdout_file is not None and os.path.isfile(mdout_file): - with open_file(mdout_file) as f: - for line in f: - if "EPtot" in line: - s = line.split() - 
energies.append(float(s[-1])) - else: - raise RuntimeError("Please provide one of mden_file and mdout_file") - - atom_names, atom_types, atom_numbs = np.unique( - amber_types, return_inverse=True, return_counts=True - ) - - data = {} - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["atom_types"] = atom_types - if labeled: - data["forces"] = forces * force_convert - data["energies"] = np.array(energies) * energy_convert - data["coords"] = coords - data["cells"] = cells - data["orig"] = np.array([0, 0, 0]) - return data +from dpdata.formats.amber.md import * # noqa: F403 diff --git a/dpdata/amber/sqm.py b/dpdata/amber/sqm.py index 93e41f9aa..81db3ee9a 100644 --- a/dpdata/amber/sqm.py +++ b/dpdata/amber/sqm.py @@ -1,120 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.periodic_table import ELEMENTS -from dpdata.unit import EnergyConversion -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -kcal2ev = EnergyConversion("kcal_mol", "eV").value() - -START = 0 -READ_CHARGE = 2 -READ_COORDS_START = 3 -READ_COORDS = 6 -READ_FORCES = 7 - - -def parse_sqm_out(fname: FileType): - """Read atom symbols, charges and coordinates from ambertools sqm.out file.""" - atom_symbols = [] - coords = [] - charges = [] - forces = [] - energies = [] - - with open_file(fname) as f: - flag = START - for line in f: - if line.startswith(" Total SCF energy"): - energy = float(line.strip().split()[-2]) - energies = [energy] - elif line.startswith(" Atom Element Mulliken Charge"): - flag = READ_CHARGE - charges = [] - elif line.startswith(" Total Mulliken Charge"): - flag = START - elif line.startswith(" Final Structure"): - flag = READ_COORDS_START - coords = [] - elif line.startswith("QMMM: Forces on QM atoms"): - flag = READ_FORCES - forces = [] - elif flag == READ_CHARGE: - ls = line.strip().split() - atom_symbols.append(ls[-2]) - 
charges.append(float(ls[-1])) - elif READ_COORDS_START <= flag < READ_COORDS: - flag += 1 - elif flag == READ_COORDS: - coords.append([float(x) for x in line.strip().split()[-3:]]) - if len(coords) == len(charges): - flag = START - elif flag == READ_FORCES: - ll = line.strip() - if not ll.startswith("QMMM: Atm "): - flag = START - continue - forces.append([float(ll[-60:-40]), float(ll[-40:-20]), float(ll[-20:])]) - if len(forces) == len(charges): - flag = START - - data = {} - atom_names, data["atom_types"], atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["charges"] = np.array(charges) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["orig"] = np.array([0, 0, 0]) - data["cells"] = np.array( - [[[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]]] - ) - data["nopbc"] = True - data["coords"] = np.array([coords]) - - energies = np.array(energies) - forces = -np.array([forces], dtype=np.float64) * kcal2ev - if len(forces) > 0: - data["energies"] = energies - data["forces"] = forces - - return data - - -def make_sqm_in(data, fname: FileType | None = None, frame_idx=0, **kwargs): - symbols = [data["atom_names"][ii] for ii in data["atom_types"]] - atomic_numbers = [ELEMENTS.index(ss) + 1 for ss in symbols] - charge = kwargs.get("charge", 0) - - # multiplicity - mult = kwargs.get("mult", 1) - if mult != 1: - raise RuntimeError("Multiplicity is not 1, which is not supported by sqm") - - maxcyc = kwargs.get("maxcyc", 0) # 0 represents a single-point calculation - theory = kwargs.get("qm_theory", "DFTB3") - ret = "Run semi-emperical minimization\n" - ret += " &qmmm\n" - ret += f" qm_theory='{theory}'\n" - ret += f" qmcharge={charge}\n" - ret += f" maxcyc={maxcyc}\n" - ret += " verbosity=4\n" - ret += " /\n" - for ii in range(len(data["atom_types"])): - ret += "{:>4s}{:>6s}{:>16s}{:>16s}{:>16s}\n".format( - str(atomic_numbers[ii]), - str(symbols[ii]), - f"{data['coords'][frame_idx][ii, 
0]:.6f}", - f"{data['coords'][frame_idx][ii, 1]:.6f}", - f"{data['coords'][frame_idx][ii, 2]:.6f}", - ) - if fname is not None: - with open_file(fname, "w") as fp: - fp.write(ret) - return ret +from dpdata.formats.amber.sqm import * # noqa: F403 diff --git a/dpdata/bond_order_system.py b/dpdata/bond_order_system.py index 7a23acca5..db29d6b65 100644 --- a/dpdata/bond_order_system.py +++ b/dpdata/bond_order_system.py @@ -6,11 +6,11 @@ import numpy as np -import dpdata.rdkit.utils -from dpdata.rdkit.sanitize import Sanitizer +import dpdata.formats.rdkit.utils +from dpdata.formats.rdkit.sanitize import Sanitizer from dpdata.system import Axis, DataType, System -# import dpdata.rdkit.mol2 +# import dpdata.formats.rdkit.mol2 class BondOrderSystem(System): @@ -79,7 +79,7 @@ def __init__( self.sanitizer = Sanitizer(sanitize_level, raise_errors, verbose) if data: - mol = dpdata.rdkit.utils.system_data_to_mol(data) + mol = dpdata.formats.rdkit.utils.system_data_to_mol(data) self.from_rdkit_mol(mol) if file_name: self.from_fmt( @@ -161,7 +161,7 @@ def __add__(self, other): # magic method "+" operation # ''' # if isinstance(other, BondOrderSystem): - # if dpdata.rdkit.utils.check_same_molecule(self.rdkit_mol, other.rdkit_mol): + # if dpdata.formats.rdkit.utils.check_same_molecule(self.rdkit_mol, other.rdkit_mol): # self.__class__(self, data=other.data) # else: # raise RuntimeError("The two systems are not of the same topology.") @@ -171,7 +171,7 @@ def __add__(self, other): def from_rdkit_mol(self, rdkit_mol): """Initialize from a rdkit.Chem.rdchem.Mol object.""" rdkit_mol = self.sanitizer.sanitize(rdkit_mol) - self.data = dpdata.rdkit.utils.mol_to_system_data(rdkit_mol) + self.data = dpdata.formats.rdkit.utils.mol_to_system_data(rdkit_mol) self.data["bond_dict"] = dict( [(f"{int(bond[0])}-{int(bond[1])}", bond[2]) for bond in self.data["bonds"]] ) diff --git a/dpdata/cp2k/__init__.py b/dpdata/cp2k/__init__.py index e69de29bb..8e66eb10e 100644 --- a/dpdata/cp2k/__init__.py +++ 
b/dpdata/cp2k/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.cp2k import * # noqa: F403 diff --git a/dpdata/cp2k/cell.py b/dpdata/cp2k/cell.py index a3021b815..5fd326de9 100644 --- a/dpdata/cp2k/cell.py +++ b/dpdata/cp2k/cell.py @@ -1,68 +1,3 @@ -# %% from __future__ import annotations -import numpy as np - - -def cell_to_low_triangle(A, B, C, alpha, beta, gamma): - """Convert cell to low triangle matrix. - - Parameters - ---------- - A : float - cell length A - B : float - cell length B - C : float - cell length C - alpha : float - radian. The angle between vector B and vector C. - beta : float - radian. The angle between vector A and vector C. - gamma : float - radian. The angle between vector B and vector C. - - Returns - ------- - cell : list - The cell matrix used by dpdata in low triangle form. - """ - if not np.pi * 5 / 180 < alpha < np.pi * 175 / 180: - raise RuntimeError( - f"alpha=={alpha}: must be a radian, and \ - must be in np.pi*5/180 < alpha < np.pi*175/180" - ) - if not np.pi * 5 / 180 < beta < np.pi * 175 / 180: - raise RuntimeError( - f"beta=={beta}: must be a radian, and \ - must be in np.pi*5/180 < beta < np.pi*175/180" - ) - if not np.pi * 5 / 180 < gamma < np.pi * 175 / 180: - raise RuntimeError( - f"gamma=={gamma}: must be a radian, and \ - must be in np.pi*5/180 < gamma < np.pi*175/180" - ) - if not A > 0.2: - raise RuntimeError(f"A=={A}, must be greater than 0.2") - if not B > 0.2: - raise RuntimeError(f"B=={B}, must be greater than 0.2") - if not C > 0.2: - raise RuntimeError(f"C=={C}, must be greater than 0.2") - - lx = A - xy = B * np.cos(gamma) - xz = C * np.cos(beta) - ly = B * np.sin(gamma) - if not ly > 0.1: - raise RuntimeError( - "ly:=B* np.sin(gamma)=={}, must be greater than 0.1", format(ly) - ) - yz = (B * C * np.cos(alpha) - xy * xz) / ly - if not C**2 - xz**2 - yz**2 > 0.01: - raise RuntimeError( - "lz^2:=C**2-xz**2-yz**2=={}, must be greater than 0.01", - format(C**2 - xz**2 - yz**2), - 
) - lz = np.sqrt(C**2 - xz**2 - yz**2) - cell = np.asarray([[lx, 0, 0], [xy, ly, 0], [xz, yz, lz]]).astype("float32") - return cell +from dpdata.formats.cp2k.cell import * # noqa: F403 diff --git a/dpdata/cp2k/output.py b/dpdata/cp2k/output.py index bf575f728..b7f4ce4d4 100644 --- a/dpdata/cp2k/output.py +++ b/dpdata/cp2k/output.py @@ -1,510 +1,3 @@ -# %% from __future__ import annotations -import math -import re -from collections import OrderedDict - -import numpy as np - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) -from .cell import cell_to_low_triangle - -AU_TO_ANG = LengthConversion("bohr", "angstrom").value() -AU_TO_EV = EnergyConversion("hartree", "eV").value() -AU_TO_EV_EVERY_ANG = ForceConversion("hartree/bohr", "eV/angstrom").value() -delimiter_patterns = [] -delimiter_p1 = re.compile(r"^ \* GO CP2K GO! \*+") -delimiter_p2 = re.compile(r"^ \*+") -delimiter_patterns.append(delimiter_p1) -delimiter_patterns.append(delimiter_p2) -avail_patterns = [] -avail_patterns.append(re.compile(r"^ INITIAL POTENTIAL ENERGY")) -avail_patterns.append(re.compile(r"^ ENSEMBLE TYPE")) - - -class Cp2kSystems: - """deal with cp2k outputfile.""" - - def __init__(self, log_file_name, xyz_file_name, restart=False): - self.log_file_object = open(log_file_name) - self.xyz_file_object = open(xyz_file_name) - self.log_block_generator = self.get_log_block_generator() - self.xyz_block_generator = self.get_xyz_block_generator() - self.restart_flag = restart - - self.cell = None - self.print_level = None - - self.atomic_kinds = None - - if self.restart_flag: - self.handle_single_log_frame(next(self.log_block_generator)) - - def __del__(self): - self.log_file_object.close() - self.xyz_file_object.close() - - def __iter__(self): - return self - - def __next__(self): - info_dict = {} - log_info_dict = self.handle_single_log_frame(next(self.log_block_generator)) - # print(log_info_dict) - xyz_info_dict = 
self.handle_single_xyz_frame(next(self.xyz_block_generator)) - # eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])] - # eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])] - # eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])] - # assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - # assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - # assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - assert math.isclose( - log_info_dict["energies"][0], xyz_info_dict["energies"][0], abs_tol=1.0e-6 - ), ( - log_info_dict["energies"], - xyz_info_dict["energies"], - "There may be errors in the file", - ) - info_dict.update(log_info_dict) - info_dict.update(xyz_info_dict) - return info_dict - - def get_log_block_generator(self): - lines = [] - delimiter_flag = False - yield_flag = False - while True: - line = self.log_file_object.readline() - if line: - lines.append(line) - if any(p.match(line) for p in delimiter_patterns): - if delimiter_flag is True: - yield_flag = True - yield lines - lines = [] - delimiter_flag = False - else: - line = self.log_file_object.readline() - lines.append(line) - if any(p.match(line) for p in avail_patterns): - delimiter_flag = True - else: - if not yield_flag: - raise StopIteration("None of the delimiter patterns are matched") - break - if delimiter_flag is True: - raise RuntimeError("This file lacks some content, please check") - - def get_xyz_block_generator(self): - p3 = re.compile(r"^\s*(\d+)\s*") - yield_flag = False - while True: - line = self.xyz_file_object.readline() - if not line: - if not yield_flag: - raise StopIteration("None of the xyz patterns are matched") - break - if p3.match(line): - yield_flag = True 
- atom_num = int(p3.match(line).group(1)) - lines = [] - lines.append(line) - for ii in range(atom_num + 1): - lines.append(self.xyz_file_object.readline()) - if not lines[-1]: - raise RuntimeError( - f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" - ) - yield lines - - def handle_single_log_frame(self, lines): - info_dict = {} - energy_pattern_1 = re.compile( - r" INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" - ) - # CONSERVED QUANTITY [hartree] = -0.279168013085E+04 - energy_pattern_2 = re.compile( - r" POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" - ) - energy = None - cell_length_pattern = re.compile( - r" (INITIAL ){0,1}CELL LNTHS\[bohr\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_angle_pattern = re.compile( - r" (INITIAL ){0,1}CELL ANGLS\[deg\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_A, cell_B, cell_C = ( - 0, - 0, - 0, - ) - cell_alpha, cell_beta, cell_gamma = ( - 0, - 0, - 0, - ) - cell_a_pattern = re.compile( - r" CELL\| Vector a \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_b_pattern = re.compile( - r" CELL\| Vector b \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_c_pattern = re.compile( - r" CELL\| Vector c \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - force_start_pattern = re.compile(r" ATOMIC FORCES in") - force_flag = False - force_end_pattern = re.compile(r" SUM OF ATOMIC FORCES") - force_lines = [] - cell_flag = 0 - print_level_pattern = re.compile( - r" GLOBAL\| Global print level\s+(?P\S+)" - ) - print_level_flag = 0 - atomic_kinds_pattern = re.compile(r"\s+\d+\. 
Atomic kind:\s+(?P\S+)") - atomic_kinds = [] - stress_sign = "STRESS" - stress_flag = 0 - stress = [] - - for line in lines: - if stress_flag == 3: - if line == "\n": - stress_flag = 0 - else: - stress.append(line.split()[1:4]) - if stress_flag == 2: - stress_flag = 3 - if stress_flag == 1: - stress_flag = 2 - if stress_sign in line: - stress_flag = 1 - if force_start_pattern.match(line): - force_flag = True - if force_end_pattern.match(line): - assert force_flag is True, ( - force_flag, - "there may be errors in this file ", - ) - force_flag = False - if force_flag is True: - force_lines.append(line) - if energy_pattern_1.match(line): - energy = ( - float(energy_pattern_1.match(line).groupdict()["number"]) * AU_TO_EV - ) - # print('1to', energy) - if energy_pattern_2.match(line): - energy = ( - float(energy_pattern_2.match(line).groupdict()["number"]) * AU_TO_EV - ) - if cell_length_pattern.match(line): - cell_A = ( - float(cell_length_pattern.match(line).groupdict()["A"]) * AU_TO_ANG - ) - cell_B = ( - float(cell_length_pattern.match(line).groupdict()["B"]) * AU_TO_ANG - ) - cell_C = ( - float(cell_length_pattern.match(line).groupdict()["C"]) * AU_TO_ANG - ) - cell_flag += 1 - if cell_angle_pattern.match(line): - cell_alpha = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["alpha"]) - ) - cell_beta = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["beta"]) - ) - cell_gamma = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["gamma"]) - ) - cell_flag += 1 - if print_level_pattern.match(line): - print_level = print_level_pattern.match(line).groupdict()["print_level"] - print_level_flag += 1 - if cell_a_pattern.match(line): - cell_ax = float(cell_a_pattern.match(line).groupdict()["ax"]) - cell_ay = float(cell_a_pattern.match(line).groupdict()["ay"]) - cell_az = float(cell_a_pattern.match(line).groupdict()["az"]) - cell_flag += 1 - if cell_b_pattern.match(line): - cell_bx = float(cell_b_pattern.match(line).groupdict()["bx"]) 
- cell_by = float(cell_b_pattern.match(line).groupdict()["by"]) - cell_bz = float(cell_b_pattern.match(line).groupdict()["bz"]) - cell_flag += 1 - if cell_c_pattern.match(line): - cell_cx = float(cell_c_pattern.match(line).groupdict()["cx"]) - cell_cy = float(cell_c_pattern.match(line).groupdict()["cy"]) - cell_cz = float(cell_c_pattern.match(line).groupdict()["cz"]) - cell_flag += 1 - - if atomic_kinds_pattern.match(line): - akind = atomic_kinds_pattern.match(line).groupdict()["akind"] - atomic_kinds.append(akind) - if print_level_flag == 1: - self.print_level = print_level - if print_level == "LOW": - raise RuntimeError( - "please provide cp2k output with higher print level(at least MEDIUM)" - ) - - if cell_flag == 2: - self.cell = cell_to_low_triangle( - cell_A, cell_B, cell_C, cell_alpha, cell_beta, cell_gamma - ) - elif cell_flag == 5: - self.cell = np.asarray( - [ - [cell_ax, cell_ay, cell_az], - [cell_bx, cell_by, cell_bz], - [cell_cx, cell_cy, cell_cz], - ] - ).astype("float64") - if atomic_kinds: - self.atomic_kinds = atomic_kinds - # print(self.atomic_kinds) - # lx = cell_A - # xy = cell_B * np.cos(cell_gamma) - # xz = cell_C * np.cos(cell_beta) - # ly = cell_B* np.sin(cell_gamma) - # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly - # lz = np.sqrt(cell_C**2-xz**2-yz**2) - # self.cell = [[lx, 0 , 0], - # [xy, ly, 0 ], - # [xz, yz, lz]] - - element_index = -1 - element_dict = OrderedDict() - atom_types_idx_list = [] - forces_list = [] - for line in force_lines[3:]: - line_list = line.split() - # print(line_list) - if element_dict.get(line_list[1]): - element_dict[line_list[1]][1] += 1 - else: - element_index += 1 - element_dict[line_list[1]] = [element_index, 1] - atom_types_idx_list.append(element_dict[line_list[1]][0]) - forces_list.append( - [ - float(line_list[3]) * AU_TO_EV_EVERY_ANG, - float(line_list[4]) * AU_TO_EV_EVERY_ANG, - float(line_list[5]) * AU_TO_EV_EVERY_ANG, - ] - ) - # print(atom_types_idx_list) - # atom_names=list(element_dict.keys()) 
- atom_names = self.atomic_kinds - atom_numbs = [] - - GPa = PressureConversion("eV/angstrom^3", "GPa").value() - if stress: - stress = np.array(stress) - stress = stress.astype("float64") - stress = stress[np.newaxis, :, :] - # stress to virial conversion, default unit in cp2k is GPa - # note the stress is virial = stress * volume - virial = stress * np.linalg.det(self.cell) / GPa - virial = virial.squeeze() - else: - virial = None - for ii in element_dict.keys(): - atom_numbs.append(element_dict[ii][1]) - # print(atom_numbs) - info_dict["atom_names"] = atom_names - info_dict["atom_numbs"] = atom_numbs - info_dict["atom_types"] = np.asarray(atom_types_idx_list) - info_dict["print_level"] = self.print_level - info_dict["cells"] = np.asarray([self.cell]).astype("float64") - info_dict["energies"] = np.asarray([energy]).astype("float64") - info_dict["forces"] = np.asarray([forces_list]).astype("float64") - if virial is not None: - info_dict["virials"] = np.asarray([virial]).astype("float64") - return info_dict - - def handle_single_xyz_frame(self, lines): - info_dict = {} - atom_num = int(lines[0].strip("\n").strip()) - if len(lines) != atom_num + 2: - raise RuntimeError( - f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" - ) - data_format_line = lines[1].strip("\n").strip() + " " - prop_pattern = re.compile(r"(?P\w+)\s*=\s*(?P.*?)[, ]") - prop_dict = dict(prop_pattern.findall(data_format_line)) - - energy = 0 - if prop_dict.get("E"): - energy = float(prop_dict.get("E")) * AU_TO_EV - # info_dict['energies'] = np.array([prop_dict['E']]).astype('float64') - - element_index = -1 - element_dict = OrderedDict() - atom_types_list = [] - coords_list = [] - for line in lines[2:]: - line_list = line.split() - if element_dict.get(line_list[0]): - element_dict[line_list[0]][1] += 1 - else: - element_index += 1 - element_dict[line_list[0]] = [element_index, 1] - atom_types_list.append(element_dict[line_list[0]][0]) - # 
coords_list.append([float(line_list[1])*AU_TO_ANG, - # float(line_list[2])*AU_TO_ANG, - # float(line_list[3])*AU_TO_ANG]) - coords_list.append( - [float(line_list[1]), float(line_list[2]), float(line_list[3])] - ) - atom_names = list(element_dict.keys()) - atom_numbs = [] - for ii in atom_names: - atom_numbs.append(element_dict[ii][1]) - # info_dict['atom_names'] = atom_names - # info_dict['atom_numbs'] = atom_numbs - # info_dict['atom_types'] = np.asarray(atom_types_list) - info_dict["coords"] = np.asarray([coords_list]).astype("float64") - info_dict["energies"] = np.array([energy]).astype("float64") - info_dict["orig"] = np.zeros(3) - return info_dict - - -# %% - - -def get_frames(fname): - coord_flag = False - force_flag = False - stress_flag = False - eV = EnergyConversion("hartree", "eV").value() - angstrom = LengthConversion("bohr", "angstrom").value() - GPa = PressureConversion("eV/angstrom^3", "GPa").value() - atom_symbol_idx_list = [] - atom_symbol_list = [] - cell = [] - coord = [] - force = [] - stress = [] - - fp = open(fname) - # check if output is converged, if not, return sys = 0 - content = fp.read() - count = content.count("SCF run converged") - if count == 0: - fp.close() - return [], [], [], [], [], [], [], None - - # search duplicated header - fp.seek(0) - header_idx = [] - for idx, ii in enumerate(fp): - if "Multiplication driver" in ii: - header_idx.append(idx) - - # parse from last header - fp.seek(0) - for idx, ii in enumerate(fp): - if idx > header_idx[-1]: - if "CELL| Vector" in ii: - cell.append(ii.split()[4:7]) - if "Atomic kind:" in ii: - atom_symbol_list.append(ii.split()[3]) - - # beginning of coords block - if "Atom Kind Element" in ii or "Atom Kind Element" in ii: - coord_flag = True - # parse coords lines - elif coord_flag: - if ii == "\n": - coord_flag = len(coord) == 0 # skip empty line at the beginning - else: - coord.append(ii.split()[4:7]) - atom_symbol_idx_list.append(ii.split()[1]) - - if "ENERGY|" in ii: - energy = 
ii.split()[8] - if " Atom Kind " in ii: - force_flag = True - force_idx = idx - if force_flag: - if idx > force_idx: - if "SUM OF ATOMIC FORCES" in ii: - force_flag = False - else: - force.append(ii.split()[3:6]) - # add reading stress tensor - if "STRESS TENSOR [GPa" in ii: - stress_flag = True - stress_idx = idx - if stress_flag: - if idx > stress_idx + 2: - if ii == "\n": - stress_flag = False - else: - stress.append(ii.split()[1:4]) - - fp.close() - assert coord, "cannot find coords" - assert energy, "cannot find energies" - assert force, "cannot find forces" - - # conver to float array and add extra dimension for nframes - cell = np.array(cell) - cell = cell.astype("float64") - cell = cell[np.newaxis, :, :] - coord = np.array(coord) - coord = coord.astype("float64") - coord = coord[np.newaxis, :, :] - atom_symbol_idx_list = np.array(atom_symbol_idx_list) - atom_symbol_idx_list = atom_symbol_idx_list.astype(int) - atom_symbol_idx_list = atom_symbol_idx_list - 1 - atom_symbol_list = np.array(atom_symbol_list) - atom_symbol_list = atom_symbol_list[atom_symbol_idx_list] - force = np.array(force) - force = force.astype("float64") - force = force[np.newaxis, :, :] - - # virial is not necessary - if stress: - stress = np.array(stress) - stress = stress.astype("float64") - stress = stress[np.newaxis, :, :] - # stress to virial conversion, default unit in cp2k is GPa - # note the stress is virial = stress * volume - virial = stress * np.linalg.det(cell[0]) / GPa - else: - virial = None - - # force unit conversion, default unit in cp2k is hartree/bohr - force = force * eV / angstrom - # energy unit conversion, default unit in cp2k is hartree - energy = float(energy) * eV - energy = np.array(energy).astype("float64") - energy = energy[np.newaxis] - - tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True) - atom_types = [] - atom_numbs = [] - # preserve the atom_name order - atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")] - for jj in 
atom_symbol_list: - for idx, ii in enumerate(atom_names): - if jj == ii: - atom_types.append(idx) - for idx in range(len(atom_names)): - atom_numbs.append(atom_types.count(idx)) - - atom_types = np.array(atom_types) - - return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force, virial - - -# %% +from dpdata.formats.cp2k.output import * # noqa: F403 diff --git a/dpdata/deepmd/__init__.py b/dpdata/deepmd/__init__.py index e69de29bb..2ee2e6795 100644 --- a/dpdata/deepmd/__init__.py +++ b/dpdata/deepmd/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.deepmd import * # noqa: F403 diff --git a/dpdata/deepmd/comp.py b/dpdata/deepmd/comp.py index 410d789e1..f2da0108a 100644 --- a/dpdata/deepmd/comp.py +++ b/dpdata/deepmd/comp.py @@ -1,162 +1,3 @@ from __future__ import annotations -import glob -import os -import shutil -import warnings - -import numpy as np - -import dpdata -from dpdata.utils import open_file - -from .raw import load_type - - -def _cond_load_data(fname): - tmp = None - if os.path.isfile(fname): - tmp = np.load(fname) - return tmp - - -def _load_set(folder, nopbc: bool): - coords = np.load(os.path.join(folder, "coord.npy")) - if nopbc: - cells = np.zeros((coords.shape[0], 3, 3)) - else: - cells = np.load(os.path.join(folder, "box.npy")) - return cells, coords - - -def to_system_data(folder, type_map=None, labels=True): - # data is empty - data = load_type(folder, type_map=type_map) - data["orig"] = np.zeros([3]) - if os.path.isfile(os.path.join(folder, "nopbc")): - data["nopbc"] = True - sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - all_cells = [] - all_coords = [] - for ii in sets: - cells, coords = _load_set(ii, data.get("nopbc", False)) - nframes = np.reshape(cells, [-1, 3, 3]).shape[0] - all_cells.append(np.reshape(cells, [nframes, 3, 3])) - all_coords.append(np.reshape(coords, [nframes, -1, 3])) - data["cells"] = np.concatenate(all_cells, axis=0) - data["coords"] = 
np.concatenate(all_coords, axis=0) - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "coords", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/npy format." - ) - continue - natoms = data["atom_types"].shape[0] - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] - ] - all_data = [] - for ii in sets: - tmp = _cond_load_data(os.path.join(ii, dtype.deepmd_name + ".npy")) - if tmp is not None: - all_data.append(np.reshape(tmp, [tmp.shape[0], *shape])) - if len(all_data) > 0: - data[dtype.name] = np.concatenate(all_data, axis=0) - return data - - -def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): - os.makedirs(folder, exist_ok=True) - sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - if len(sets) > 0: - if remove_sets: - for ii in sets: - shutil.rmtree(ii) - else: - raise RuntimeError( - "found " - + str(sets) - + " in " - + folder - + "not a clean deepmd raw dir. 
please firstly clean set.* then try compress" - ) - # dump raw - np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") - np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s") - # BondOrder System - if "bonds" in data: - np.savetxt( - os.path.join(folder, "bonds.raw"), - data["bonds"], - header="begin_atom, end_atom, bond_order", - ) - if "formal_charges" in data: - np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) - # reshape frame properties and convert prec - nframes = data["cells"].shape[0] - # dump frame properties: cell, coord, energy, force and virial - nsets = nframes // set_size - if set_size * nsets < nframes: - nsets += 1 - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = os.path.join(folder, "set.%03d" % ii) # noqa: UP031 - os.makedirs(set_folder) - try: - os.remove(os.path.join(folder, "nopbc")) - except OSError: - pass - if data.get("nopbc", False): - with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc: - pass - # allow custom dtypes - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if dtype.name not in data: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/npy format." 
- ) - continue - ddata = np.reshape(data[dtype.name], [nframes, -1]) - if np.issubdtype(ddata.dtype, np.floating): - ddata = ddata.astype(comp_prec) - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = os.path.join(folder, "set.%03d" % ii) # noqa: UP031 - np.save(os.path.join(set_folder, dtype.deepmd_name), ddata[set_stt:set_end]) +from dpdata.formats.deepmd.comp import * # noqa: F403 diff --git a/dpdata/deepmd/hdf5.py b/dpdata/deepmd/hdf5.py index c2b3bd424..9ef1c5a75 100644 --- a/dpdata/deepmd/hdf5.py +++ b/dpdata/deepmd/hdf5.py @@ -1,228 +1,3 @@ -"""Utils for deepmd/hdf5 format.""" - from __future__ import annotations -import warnings -from typing import TYPE_CHECKING - -import numpy as np - -import dpdata - -if TYPE_CHECKING: - import h5py - -__all__ = ["to_system_data", "dump"] - - -def to_system_data( - f: h5py.File | h5py.Group, - folder: str, - type_map: list | None = None, - labels: bool = True, -): - """Load a HDF5 file. - - Parameters - ---------- - f : h5py.File or h5py.Group - HDF5 file or group object - folder : str - path in the HDF5 file - type_map : list - type map - labels : bool - labels - """ - from wcmatch.glob import globfilter - - g = f[folder] if folder else f - - data = {} - # ignore empty files or groups - if "type.raw" not in g.keys(): - return data - data["atom_types"] = g["type.raw"][:] - ntypes = np.max(data["atom_types"]) + 1 - natoms = data["atom_types"].size - data["atom_numbs"] = [] - for ii in range(ntypes): - data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) - data["atom_names"] = [] - # if find type_map.raw, use it - if "type_map.raw" in g.keys(): - my_type_map = list(np.char.decode(g["type_map.raw"][:])) - # else try to use arg type_map - elif type_map is not None: - my_type_map = type_map - # in the last case, make artificial atom names - else: - my_type_map = [] - for ii in range(ntypes): - my_type_map.append("Type_%d" % ii) # noqa: UP031 - assert 
len(my_type_map) >= len(data["atom_numbs"]) - for ii in range(len(data["atom_numbs"])): - data["atom_names"].append(my_type_map[ii]) - - data["orig"] = np.zeros([3]) - if "nopbc" in g.keys(): - data["nopbc"] = True - sets = globfilter(g.keys(), "set.*") - - data_types = {} - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/hdf5 format." - ) - continue - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] - ] - - data_types[dtype.name] = { - "fn": dtype.deepmd_name, - "shape": shape, - "required": dtype.required - and not (dtype.name == "cells" and data.get("nopbc", False)), - } - - for dt, prop in data_types.items(): - all_data = [] - - for ii in sets: - set = g[ii] - fn = "{}.npy".format(prop["fn"]) - if fn in set.keys(): - dd = set[fn][:] - nframes = dd.shape[0] - all_data.append(np.reshape(dd, (nframes, *prop["shape"]))) - elif prop["required"]: - raise RuntimeError(f"{folder}/{ii}/{fn} not found") - - if len(all_data) > 0: - data[dt] = np.concatenate(all_data, axis=0) - if "cells" not in data: - nframes = data["coords"].shape[0] - data["cells"] = np.zeros((nframes, 3, 3)) - return data - - -def dump( - f: h5py.File | h5py.Group, - folder: str, - data: dict, - set_size=5000, - comp_prec=np.float32, -) -> None: - """Dump data to a HDF5 file. 
- - Parameters - ---------- - f : h5py.File or h5py.Group - HDF5 file or group object - folder : str - path in the HDF5 file - data : dict - System or LabeledSystem data - set_size : int, default: 5000 - size of a set - comp_prec : np.dtype, default: np.float32 - precision of data - """ - # if folder is None, use the root of the file - if folder: - if folder in f: - del f[folder] - g = f.create_group(folder) - else: - g = f - # ignore empty systems - if not len(data["coords"]): - return - # dump raw (array in fact) - g.create_dataset("type.raw", data=data["atom_types"]) - g.create_dataset("type_map.raw", data=np.array(data["atom_names"], dtype="S")) - # BondOrder System - if "bonds" in data: - g.create_dataset("bonds.raw", data=data["bonds"]) - if "formal_charges" in data: - g.create_dataset("formal_charges.raw", data=data["formal_charges"]) - # reshape frame properties and convert prec - nframes = data["cells"].shape[0] - - nopbc = data.get("nopbc", False) - reshaped_data = {} - - data_types = {} - - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - # allow custom dtypes - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/hdf5 format." 
- ) - continue - - data_types[dtype.name] = { - "fn": dtype.deepmd_name, - "shape": (nframes, -1), - "dump": not (dtype.name == "cells" and nopbc), - } - - for dt, prop in data_types.items(): - if dt in data: - if prop["dump"]: - ddata = np.reshape(data[dt], prop["shape"]) - if np.issubdtype(ddata.dtype, np.floating): - ddata = ddata.astype(comp_prec) - reshaped_data[dt] = ddata - - # dump frame properties: cell, coord, energy, force and virial - nsets = nframes // set_size - if set_size * nsets < nframes: - nsets += 1 - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = g.create_group("set.%03d" % ii) # noqa: UP031 - for dt, prop in data_types.items(): - if dt in reshaped_data: - set_folder.create_dataset( - "{}.npy".format(prop["fn"]), data=reshaped_data[dt][set_stt:set_end] - ) - - if nopbc: - g.create_dataset("nopbc", data=True) +from dpdata.formats.deepmd.hdf5 import * # noqa: F403 diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 734b6a730..a730d076b 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -1,299 +1,3 @@ from __future__ import annotations -import copy -import math - -import numpy as np - -import dpdata -from dpdata.data_type import Axis - -from .comp import dump as comp_dump -from .comp import to_system_data as comp_to_system_data - - -def _pad_to(sys_data, target_natoms, dtypes): - """Pad system data dict so that NATOMS dimension becomes target_natoms. - - Virtual atoms get real_atom_types = -1, and all other per-atom data is - padded with zeros. - - Parameters - ---------- - sys_data : dict - System data dict, already in mixed-type format. - target_natoms : int - Target number of atoms after padding. - dtypes : tuple[DataType, ...] - Registered data types to iterate for generic per-atom padding. 
- """ - natoms = sys_data["atom_types"].shape[0] - npad = target_natoms - natoms - if npad <= 0: - return - nframes = sys_data["coords"].shape[0] - - # Pad atom_types (all MIXED_TOKEN = 0) - sys_data["atom_types"] = np.concatenate( - [sys_data["atom_types"], np.zeros(npad, dtype=int)] - ) - sys_data["atom_numbs"] = [target_natoms] - - # Pad real_atom_types with -1 (virtual atom sentinel) - sys_data["real_atom_types"] = np.concatenate( - [ - sys_data["real_atom_types"], - -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), - ], - axis=1, - ) - - # Pad coords and all other per-atom data generically - reserved = { - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "real_atom_names", - "real_atom_types", - "nopbc", - } - for dtype in dtypes: - if dtype.name in reserved: - continue - if dtype.name not in sys_data: - continue - if not ( - len(dtype.shape) >= 2 - and dtype.shape[0] == Axis.NFRAMES - and Axis.NATOMS in dtype.shape - ): - continue - axis_natoms = list(dtype.shape).index(Axis.NATOMS) - arr = sys_data[dtype.name] - pad_width = [(0, 0)] * len(arr.shape) - pad_width[axis_natoms] = (0, npad) - sys_data[dtype.name] = np.pad( - arr, pad_width, mode="constant", constant_values=0 - ) - - -def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): - """Strip virtual atoms (type -1) from a group of frames. - - Parameters - ---------- - atom_types_row : np.ndarray - 1-D array of atom type indices for the group (same for all frames). - coords : np.ndarray - Coordinates array, shape (nframes, natoms_padded, 3). - extra_data : dict - Dict of {name: array} for this group, arrays already frame-sliced. - dtypes : tuple[DataType, ...] - Registered data types. - - Returns - ------- - atom_types : np.ndarray - Atom types with virtual atoms removed. - coords : np.ndarray - Coords with virtual atoms removed. - extra_data : dict - Extra data with virtual atoms removed. 
- """ - real_mask = atom_types_row >= 0 - if real_mask.all(): - return atom_types_row, coords, extra_data - - atom_types = atom_types_row[real_mask] - coords = coords[:, real_mask, :] - - stripped = {} - for name, arr in extra_data.items(): - for dtype in dtypes: - if dtype.name == name and Axis.NATOMS in dtype.shape: - axis_natoms = list(dtype.shape).index(Axis.NATOMS) - idx = [slice(None)] * len(arr.shape) - idx[axis_natoms] = real_mask - arr = arr[tuple(idx)] - break - stripped[name] = arr - - return atom_types, coords, stripped - - -def to_system_data(folder, type_map=None, labels=True): - data = comp_to_system_data(folder, type_map, labels) - # data is empty - old_type_map = data["atom_names"].copy() - if type_map is not None: - assert isinstance(type_map, list) - missing_type = [i for i in old_type_map if i not in type_map] - assert not missing_type, ( - f"These types are missing in selected type_map: {missing_type} !" - ) - index_map = np.array([type_map.index(i) for i in old_type_map]) - data["atom_names"] = type_map.copy() - else: - index_map = None - all_real_atom_types_concat = data.pop("real_atom_types").astype(int) - if index_map is not None: - # Preserve -1 (virtual atom sentinel) during remapping - valid = all_real_atom_types_concat >= 0 - remapped = np.full_like(all_real_atom_types_concat, -1) - remapped[valid] = index_map[all_real_atom_types_concat[valid]] - all_real_atom_types_concat = remapped - all_cells_concat = data["cells"] - all_coords_concat = data["coords"] - - # handle custom registered data types - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - reserved = { - "atom_numbs", - "atom_names", - "atom_types", - "real_atom_names", - "real_atom_types", - "cells", - "coords", - "orig", - "nopbc", - } - extra_data = {} - for dtype in dtypes: - name = dtype.name - if name in reserved: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - continue - if 
name in data: - extra_data[name] = data.pop(name) - - data_list = [] - while True: - if all_real_atom_types_concat.size == 0: - break - # temp_formula = formula(data['atom_names'], temp_atom_numbs) - temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ - (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) - ] - rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ - (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) - ] - - # Extract data for this group - group_atom_types = all_real_atom_types_concat[0] - group_coords = all_coords_concat[temp_idx] - group_extra = {} - for name in extra_data: - group_extra[name] = extra_data[name][temp_idx] - extra_data[name] = extra_data[name][rest_idx] - - # Strip virtual atoms (type -1) introduced by padding - group_atom_types, group_coords, group_extra = _strip_virtual_atoms( - group_atom_types, group_coords, group_extra, dtypes - ) - - temp_atom_numbs = [ - np.count_nonzero(group_atom_types == i) - for i in range(len(data["atom_names"])) - ] - - temp_data = data.copy() - temp_data["atom_names"] = data["atom_names"].copy() - temp_data["atom_numbs"] = temp_atom_numbs - temp_data["atom_types"] = group_atom_types - all_real_atom_types_concat = all_real_atom_types_concat[rest_idx] - temp_data["cells"] = all_cells_concat[temp_idx] - all_cells_concat = all_cells_concat[rest_idx] - temp_data["coords"] = group_coords - all_coords_concat = all_coords_concat[rest_idx] - - for name in group_extra: - temp_data[name] = group_extra[name] - - data_list.append(temp_data) - return data_list - - -def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True): - # if not converted to mixed - if "real_atom_types" not in data: - from dpdata import LabeledSystem, System - - # not change the original content - data = copy.deepcopy(data) - - if "energies" in data: - temp_sys = LabeledSystem(data=data) - else: - temp_sys = System(data=data) - temp_sys.convert_to_mixed_type() - - data = 
data.copy() - data["atom_names"] = data.pop("real_atom_names") - comp_dump(folder, data, set_size, comp_prec, remove_sets) - - -def mix_system(*system, type_map, atom_numb_pad=None, **kwargs): - """Mix the systems into mixed_type ones according to the unified given type_map. - - Parameters - ---------- - *system : System - The systems to mix - type_map : list of str - Maps atom type to name - atom_numb_pad : int, optional - If provided, pad atom counts to the next multiple of this number - using virtual atoms (type -1 in real_atom_types). This reduces the - number of subdirectories when systems have many different atom counts. - For example, atom_numb_pad=8 groups systems into multiples of 8. - **kwargs : dict - Other parameters - - Returns - ------- - mixed_systems: dict - dict of mixed system with key 'atom_numbs' - """ - mixed_systems = {} - temp_systems = {} - atom_numbs_frame_index = {} # index of frames in cur sys - # Use LabeledSystem DTYPES as superset for generic per-atom padding - dtypes = dpdata.system.LabeledSystem.DTYPES - for sys in system: - tmp_sys = sys.copy() - natom = tmp_sys.get_natoms() - tmp_sys.convert_to_mixed_type(type_map=type_map) - if atom_numb_pad is not None and atom_numb_pad > 1: - padded_natom = math.ceil(natom / atom_numb_pad) * atom_numb_pad - _pad_to(tmp_sys.data, padded_natom, dtypes) - group_key = str(padded_natom) - else: - group_key = str(natom) - if group_key not in atom_numbs_frame_index: - atom_numbs_frame_index[group_key] = 0 - atom_numbs_frame_index[group_key] += tmp_sys.get_nframes() - if group_key not in temp_systems or not temp_systems[group_key]: - temp_systems[group_key] = tmp_sys - else: - temp_systems[group_key].append(tmp_sys) - for natom_key in temp_systems: - if atom_numbs_frame_index[natom_key] > 0: - mixed_systems[natom_key] = temp_systems[natom_key] - return mixed_systems - - -def split_system(sys, split_num=10000): - rest = sys.get_nframes() - split_num - if rest <= 0: - return sys, None, 0 - else: - 
split_sys = sys.sub_system(range(split_num)) - rest_sys = sys.sub_system(range(split_num, sys.get_nframes())) - return split_sys, rest_sys, rest +from dpdata.formats.deepmd.mixed import * # noqa: F403 diff --git a/dpdata/deepmd/raw.py b/dpdata/deepmd/raw.py index 50dc5afd3..2c7d1d4ed 100644 --- a/dpdata/deepmd/raw.py +++ b/dpdata/deepmd/raw.py @@ -1,140 +1,3 @@ from __future__ import annotations -import os -import warnings - -import numpy as np - -import dpdata -from dpdata.utils import open_file - - -def load_type(folder, type_map=None): - data = {} - data["atom_types"] = np.loadtxt(os.path.join(folder, "type.raw"), ndmin=1).astype( - int - ) - ntypes = np.max(data["atom_types"]) + 1 - data["atom_names"] = [] - # if find type_map.raw, use it - if os.path.isfile(os.path.join(folder, "type_map.raw")): - with open_file(os.path.join(folder, "type_map.raw")) as fp: - my_type_map = fp.read().split() - # else try to use arg type_map - elif type_map is not None: - my_type_map = type_map - # in the last case, make artificial atom names - else: - my_type_map = [] - for ii in range(ntypes): - my_type_map.append("Type_%d" % ii) # noqa: UP031 - data["atom_names"] = my_type_map - data["atom_numbs"] = [] - for ii, _ in enumerate(data["atom_names"]): - data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) - - return data - - -def to_system_data(folder, type_map=None, labels=True): - if os.path.isdir(folder): - data = load_type(folder, type_map=type_map) - data["orig"] = np.zeros([3]) - data["coords"] = np.loadtxt(os.path.join(folder, "coord.raw"), ndmin=2) - nframes = data["coords"].shape[0] - if os.path.isfile(os.path.join(folder, "nopbc")): - data["nopbc"] = True - data["cells"] = np.zeros((nframes, 3, 3)) - else: - data["cells"] = np.loadtxt(os.path.join(folder, "box.raw"), ndmin=2) - data["cells"] = np.reshape(data["cells"], [nframes, 3, 3]) - data["coords"] = np.reshape(data["coords"], [nframes, -1, 3]) - if os.path.isfile(os.path.join(folder, "nopbc")): - 
data["nopbc"] = True - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "coords", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/raw format." - ) - continue - natoms = data["atom_types"].shape[0] - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx - for xx in dtype.shape[1:] - ] - if os.path.exists(os.path.join(folder, f"{dtype.deepmd_name}.raw")): - data[dtype.name] = np.reshape( - np.loadtxt(os.path.join(folder, f"{dtype.deepmd_name}.raw")), - [nframes, *shape], - ) - return data - else: - raise RuntimeError("not dir " + folder) - - -def dump(folder, data): - os.makedirs(folder, exist_ok=True) - nframes = data["cells"].shape[0] - np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") - np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s") - # BondOrder System - if "bonds" in data: - np.savetxt( - os.path.join(folder, "bonds.raw"), - data["bonds"], - header="begin_atom, end_atom, bond_order", - ) - if "formal_charges" in data: - np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) - try: - os.remove(os.path.join(folder, "nopbc")) - except OSError: - pass - if data.get("nopbc", False): - with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc: - pass - # allow custom dtypes - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - 
"real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if dtype.name not in data: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/raw format." - ) - continue - ddata = np.reshape(data[dtype.name], [nframes, -1]) - np.savetxt(os.path.join(folder, f"{dtype.deepmd_name}.raw"), ddata) +from dpdata.formats.deepmd.raw import * # noqa: F403 diff --git a/dpdata/dftbplus/__init__.py b/dpdata/dftbplus/__init__.py index e69de29bb..0e9a8e392 100644 --- a/dpdata/dftbplus/__init__.py +++ b/dpdata/dftbplus/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.dftbplus import * # noqa: F403 diff --git a/dpdata/dftbplus/output.py b/dpdata/dftbplus/output.py index 49fdd2b1b..133adb120 100644 --- a/dpdata/dftbplus/output.py +++ b/dpdata/dftbplus/output.py @@ -1,83 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_dftb_plus( - fn_1: FileType, fn_2: FileType -) -> tuple[str, np.ndarray, float, np.ndarray]: - """Read from DFTB+ input and output. 
- - Parameters - ---------- - fn_1 : str - DFTB+ input file name - fn_2 : str - DFTB+ output file name - - Returns - ------- - str - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - - """ - coord = None - symbols = None - forces = None - energy = None - with open_file(fn_1) as f: - flag = 0 - for line in f: - if flag == 1: - flag += 1 - elif flag == 2: - components = line.split() - flag += 1 - elif line.startswith("Geometry"): - flag = 1 - coord = [] - symbols = [] - elif flag in (3, 4, 5, 6): - s = line.split() - components_num = int(s[1]) - symbols.append(components[components_num - 1]) - coord.append([float(s[2]), float(s[3]), float(s[4])]) - flag += 1 - if flag == 7: - flag = 0 - with open_file(fn_2) as f: - flag = 0 - for line in f: - if line.startswith("Total Forces"): - flag = 8 - forces = [] - elif flag in (8, 9, 10, 11): - s = line.split() - forces.append([float(s[1]), float(s[2]), float(s[3])]) - flag += 1 - if flag == 12: - flag = 0 - elif line.startswith("Total energy:"): - s = line.split() - energy = float(s[2]) - flag = 0 - - symbols = np.array(symbols) - forces = np.array(forces) - coord = np.array(coord) - assert coord.shape == forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.dftbplus.output import * # noqa: F403 diff --git a/dpdata/fhi_aims/__init__.py b/dpdata/fhi_aims/__init__.py old mode 100755 new mode 100644 index e69de29bb..ac6ca3623 --- a/dpdata/fhi_aims/__init__.py +++ b/dpdata/fhi_aims/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.fhi_aims import * # noqa: F403 diff --git a/dpdata/fhi_aims/output.py b/dpdata/fhi_aims/output.py old mode 100755 new mode 100644 index 762e8bf4d..206a23634 --- a/dpdata/fhi_aims/output.py +++ b/dpdata/fhi_aims/output.py @@ -1,204 +1,3 @@ from __future__ import annotations -import re -import warnings - -import numpy as np - -latt_patt = 
r"\|\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)" -pos_patt_first = r"\|\s+[0-9]{1,}[:]\s\w+\s(\w+)(\s.*[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)" -pos_patt_other = r"\s+[a][t][o][m]\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+(\w{1,2})" -force_patt = r"\|\s+[0-9]{1,}\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})" -eng_patt = r"Total energy uncorrected.*([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+eV" -# atom_numb_patt=r"Number of atoms.*([0-9]{1,})" - -debug = False - - -def get_info(lines, type_idx_zero=False): - atom_types = [] - atom_names = [] - cell = [] - atom_numbs = None - _atom_names = [] - - contents = "\n".join(lines) - # cell - # _tmp=re.findall(latt_patt,contents) - # for ii in _tmp: - # vect=[float(kk) for kk in ii] - # cell.append(vect) - # ------------------ - for ln, l in enumerate(lines): - if l.startswith(" | Unit cell"): - break - _tmp = lines[ln + 1 : ln + 4] - for ii in _tmp: - v_str = ii.split("|")[1].split() - vect = [float(kk) for kk in v_str] - cell.append(vect) - - _tmp = re.findall(pos_patt_first, contents) - for ii in _tmp: - _atom_names.append(ii[0]) - atom_names = [] - for ii in _atom_names: - if ii not in atom_names: - atom_names.append(ii) - - atom_numbs = [_atom_names.count(ii) for ii in atom_names] - if type_idx_zero: - type_map = dict(zip(atom_names, range(len(atom_names)))) - else: - type_map = dict(zip(atom_names, range(1, len(atom_names) + 1))) - atom_types = list(map(lambda k: type_map[k], _atom_names)) - assert atom_numbs is not None, "cannot find ion type info in aims output" - - return [cell, atom_numbs, atom_names, atom_types] - - -def get_fhi_aims_block(fp): - blk = [] - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if "Begin self-consistency loop: Re-initialization" in ii: - return blk - return blk - - -def 
get_frames(fname, md=True, begin=0, step=1, convergence_check=True): - fp = open(fname) - blk = get_fhi_aims_block(fp) - ret = get_info(blk, type_idx_zero=True) - - cell, atom_numbs, atom_names, atom_types = ret[0], ret[1], ret[2], ret[3] - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if debug: - with open(str(cc), "w") as f: - f.write("\n".join(blk)) - if cc >= begin and (cc - begin) % step == 0: - if cc == 0: - coord, _cell, energy, force, virial, is_converge = analyze_block( - blk, first_blk=True, md=md - ) - else: - coord, _cell, energy, force, virial, is_converge = analyze_block( - blk, first_blk=False - ) - if len(coord) == 0: - break - if is_converge or not convergence_check: - all_coords.append(coord) - - if _cell: - all_cells.append(_cell) - else: - all_cells.append(cell) - - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_fhi_aims_block(fp) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - fp.close() - return ( - atom_names, - atom_numbs, - np.array(atom_types), - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, first_blk=False, md=True): - coord = [] - cell = [] - energy = None - force = [] - virial = None - atom_names = [] - _atom_names = [] - - contents = "\n".join(lines) - try: - natom = int(re.findall("Number of atoms.*([0-9]{1,})", lines)[0]) - except Exception: - natom = 0 - - if first_blk: - if md: - _tmp = re.findall(pos_patt_other, contents)[:] - for ii in _tmp[slice(int(len(_tmp) / 2), len(_tmp))]: - coord.append([float(kk) for kk in ii[:-1]]) - else: - _tmp = re.findall(pos_patt_first, contents) - for ii in _tmp: - coord.append([float(kk) for kk in ii[1:]]) - else: - _tmp = re.findall(pos_patt_other, contents) - for ii in _tmp: - coord.append([float(kk) for kk in ii[:-1]]) - - _tmp = re.findall(force_patt, contents) - for ii in _tmp: - force.append([float(kk) for kk in ii]) - - if "Self-consistency cycle converged" in contents: - is_converge = True - else: - is_converge = False - - try: - _eng_patt = re.compile(eng_patt) - energy = float(_eng_patt.search(contents).group().split()[-2]) - except Exception: - energy = None - - if not energy: - is_converge = False - - if energy: - assert (force is not None) and len(coord) > 0 - - return coord, cell, energy, force, virial, is_converge - - -if __name__ == "__main__": - import sys - - ret = get_frames(sys.argv[1], begin=0, step=1) - print(ret) +from dpdata.formats.fhi_aims.output import * # noqa: F403 diff --git a/dpdata/formats/__init__.py b/dpdata/formats/__init__.py new file mode 100644 index 000000000..536b023e3 --- /dev/null +++ b/dpdata/formats/__init__.py @@ -0,0 +1 @@ +# Format modules for dpdata diff 
--git a/dpdata/formats/abacus/__init__.py b/dpdata/formats/abacus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/abacus/md.py b/dpdata/formats/abacus/md.py new file mode 100644 index 000000000..8df156c94 --- /dev/null +++ b/dpdata/formats/abacus/md.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +import os +import warnings + +import numpy as np + +from dpdata.utils import open_file + +from .scf import ( + bohr2ang, + get_geometry_in, + get_mag_force, + kbar2evperang3, +) +from .stru import get_frame_from_stru + +# Read in geometries from an ABACUS MD trajectory. +# The atomic coordinates are read in from generated files in OUT.XXXX. +# Energies, forces +# IMPORTANT: the program defaultly takes STRU input file as standard cell information, +# therefore the direct and cartesan coordinates read could be different from the ones in +# the output cif files!!! +# It is highly recommanded to use ORTHOGANAL coordinates in STRU file if you wish to get +# same coordinates in both dpdata and output cif files. + + +def get_path_out(fname, inlines): + # This function is different from the same-name function in scf.py. + # This function returns OUT.XXXX's base directory. + path_out = os.path.join(fname, "OUT.ABACUS/") + for line in inlines: + if len(line) > 0 and "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + path_out = os.path.join(fname, f"OUT.{suffix}/") + break + return path_out + + +def get_coord_dump_freq(inlines): + for line in inlines: + if len(line) > 0 and "md_dumpfreq" in line and "md_dumpfreq" == line.split()[0]: + return int(line.split()[1]) + return 1 + + +def get_coords_from_dump(dumplines, natoms): + nlines = len(dumplines) + total_natoms = sum(natoms) + # The output of VIRIAL, FORCE, and VELOCITY are controlled by INPUT parameters dump_virial, dump_force, and dump_vel, respectively. + # So the search of keywords can determine whether these datas are printed into MD_dump. 
+ calc_stress = False + calc_force = False + check_line = 6 + if "VIRIAL" in dumplines[6]: + calc_stress = True + check_line = 10 + assert "POSITION" in dumplines[check_line], ( + "keywords 'POSITION' cannot be found in the 6th line. Please check." + ) + if "FORCE" in dumplines[check_line]: + calc_force = True + + nframes_dump = -1 + if calc_stress: + nframes_dump = int(nlines / (total_natoms + 13)) + else: + nframes_dump = int(nlines / (total_natoms + 9)) + assert nframes_dump > 0, ( + "Number of lines in MD_dump file = %d. Number of atoms = %d. The MD_dump file is incomplete." # noqa: UP031 + % (nlines, total_natoms) + ) + cells = np.zeros([nframes_dump, 3, 3]) + stresses = np.zeros([nframes_dump, 3, 3]) + forces = np.zeros([nframes_dump, total_natoms, 3]) + coords = np.zeros([nframes_dump, total_natoms, 3]) + iframe = 0 + for iline in range(nlines): + if "MDSTEP" in dumplines[iline]: + # read in LATTICE_CONSTANT + # for abacus version >= v3.1.4, the unit is angstrom, and "ANGSTROM" is added at the end + # for abacus version < v3.1.4, the unit is bohr + celldm = float(dumplines[iline + 1].split()[1]) + newversion = True + if "Angstrom" not in dumplines[iline + 1]: + celldm *= bohr2ang # transfer unit to ANGSTROM + newversion = False + + # read in LATTICE_VECTORS + for ix in range(3): + cells[iframe, ix] = ( + np.array([float(i) for i in dumplines[iline + 3 + ix].split()[0:3]]) + * celldm + ) + if calc_stress: + stresses[iframe, ix] = np.array( + [float(i) for i in dumplines[iline + 7 + ix].split()[0:3]] + ) + + if calc_stress: + skipline = 11 + else: + skipline = 7 + + for iat in range(total_natoms): + # INDEX LABEL POSITION (Angstrom) FORCE (eV/Angstrom) VELOCITY (Angstrom/fs) + # 0 Sn 0.000000000000 0.000000000000 0.000000000000 -0.000000000000 -0.000000000001 -0.000000000001 0.001244557166 -0.000346684288 0.000768457739 + # 1 Sn 0.000000000000 3.102800034079 3.102800034079 -0.000186795145 -0.000453823768 -0.000453823768 0.000550996187 -0.000886442775 
0.001579501983 + # for abacus version >= v3.1.4, the value of POSITION is the real cartessian position, and unit is angstrom, and if cal_force the VELOCITY is added at the end. + # for abacus version < v3.1.4, the real position = POSITION * celldm + coords[iframe, iat] = np.array( + [float(i) for i in dumplines[iline + skipline + iat].split()[2:5]] + ) + + if not newversion: + coords[iframe, iat] *= celldm + + if calc_force: + forces[iframe, iat] = np.array( + [ + float(i) + for i in dumplines[iline + skipline + iat].split()[5:8] + ] + ) + iframe += 1 + assert iframe == nframes_dump, ( + "iframe=%d, nframe_dump=%d. Number of frames does not match number of lines in MD_dump." # noqa: UP031 + % (iframe, nframes_dump) + ) + stresses *= kbar2evperang3 + return coords, cells, forces, stresses + + +def get_energy(outlines, ndump, dump_freq): + energy = [] + nenergy = 0 + for line_idx, line in enumerate(outlines): + if "final etot is" in line or "#TOTAL ENERGY#" in line: + if nenergy % dump_freq == 0: + energy.append(float(line.split()[-2])) + nenergy += 1 + elif "!! convergence has not been achieved" in line: + if nenergy % dump_freq == 0: + energy.append(np.nan) + nenergy += 1 + assert ndump == len(energy), ( + "Number of total energies in running_md.log = %d. Number of frames in MD_dump = %d. Please check." 
# noqa: UP031 + % (len(energy), ndump) + ) + energy = np.array(energy) + return energy + + +def get_frame(fname): + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU + path_out = get_path_out(fname, inlines) + + data = get_frame_from_stru(geometry_path_in) + natoms = data["atom_numbs"] + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + + # This coords is not to be used. + dump_freq = get_coord_dump_freq(inlines=inlines) + # ndump = int(os.popen("ls -l %s | grep 'md_pos_' | wc -l" %path_out).readlines()[0]) + # number of dumped geometry files + # coords = get_coords_from_cif(ndump, dump_freq, atom_names, natoms, types, path_out, cell) + with open_file(os.path.join(path_out, "MD_dump")) as fp: + dumplines = fp.read().split("\n") + coords, cells, force, stress = get_coords_from_dump(dumplines, natoms) + ndump = np.shape(coords)[0] + with open_file(os.path.join(path_out, "running_md.log")) as fp: + outlines = fp.read().split("\n") + energy = get_energy(outlines, ndump, dump_freq) + + unconv_stru = "" + for i, iene in enumerate(energy): + if np.isnan(iene): + coords = np.delete(coords, i - ndump, axis=0) + cells = np.delete(cells, i - ndump, axis=0) + force = np.delete(force, i - ndump, axis=0) + stress = np.delete(stress, i - ndump, axis=0) + energy = np.delete(energy, i - ndump, axis=0) + unconv_stru += "%d " % i # noqa: UP031 + ndump = len(energy) + if unconv_stru != "": + warnings.warn(f"Structure {unconv_stru} are unconverged and not collected!") + + for iframe in range(ndump): + stress[iframe] *= np.linalg.det(cells[iframe, :, :].reshape([3, 3])) + if np.sum(np.abs(stress[0])) < 1e-10: + stress = None + + magmom, 
magforce = get_mag_force(outlines) + + data["cells"] = cells + # for idx in range(ndump): + # data['cells'][:, :, :] = cell + data["coords"] = coords + data["energies"] = energy + data["forces"] = force + data["virials"] = stress + if not isinstance(data["virials"], np.ndarray): + del data["virials"] + data["orig"] = np.zeros(3) + if len(magmom) > 0: + data["spins"] = magmom + if len(magforce) > 0: + data["force_mags"] = magforce + + # need to expand the move. + if "move" in data: + data["move"] = [data["move"][0] for i in range(ndump)] + + return data diff --git a/dpdata/formats/abacus/relax.py b/dpdata/formats/abacus/relax.py new file mode 100644 index 000000000..db60412b8 --- /dev/null +++ b/dpdata/formats/abacus/relax.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import glob +import os + +import numpy as np + +from dpdata.utils import open_file + +from .scf import ( + bohr2ang, + collect_force, + collect_stress, + get_geometry_in, + get_mag_force, + kbar2evperang3, +) +from .stru import get_frame_from_stru + +# Read in geometries from an ABACUS RELAX(CELL-RELAX) trajectory in OUT.XXXX/runnning_relax/cell-relax.log. + + +def get_log_file(fname, inlines): + suffix = "ABACUS" + calculation = "scf" + for line in inlines: + if "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + elif "calculation" in line and "calculation" == line.split()[0]: + calculation = line.split()[1] + logf = os.path.join(fname, f"OUT.{suffix}/running_{calculation}.log") + return logf + + +def get_relax_stru_files(output_dir): + """Find the STRU files in the output directory. 
+ + Args: + output_dir (str): output directory + + Returns + ------- + strus: list of STRU files + example: + ["STRU_ION1_D", "STRU_ION2_D"] + """ + return glob.glob(os.path.join(output_dir, "STRU_ION*_D")) + + +def get_coords_from_log(loglines, natoms, stru_files=None): + """NOTICE: unit of coords and cells is Angstrom + order: + coordinate + cell (no output if cell is not changed) + energy (no output, if SCF is not converged) + force (no output, if cal_force is not setted or abnormal ending) + stress (no output, if set cal_stress is not setted or abnormal ending). + """ + natoms_log = 0 + for line in loglines: + if line[13:41] == "number of atom for this type": + natoms_log += int(line.split()[-1]) + + assert natoms_log > 0 and natoms_log == natoms, ( + f"ERROR: detected atom number in log file is {natoms_log}, while the atom number in STRU file is {natoms}" + ) + + energy = [] + cells = [] + coords = [] + coord_direct = [] # if the coordinate is direct type or not + + for i in range(len(loglines)): + line = loglines[i] + if line[18:41] == "lattice constant (Bohr)": + a0 = float(line.split()[-1]) + elif len(loglines[i].split()) >= 2 and loglines[i].split()[1] == "COORDINATES": + # read coordinate information + coords.append([]) + direct_coord = False + if loglines[i].split()[0] == "DIRECT": + coord_direct.append(True) + for k in range(2, 2 + natoms): + coords[-1].append( + list(map(lambda x: float(x), loglines[i + k].split()[1:4])) + ) + elif loglines[i].split()[0] == "CARTESIAN": + coord_direct.append(False) + for k in range(2, 2 + natoms): + coords[-1].append( + list( + map( + lambda x: float(x) * a0 * bohr2ang, + loglines[i + k].split()[1:4], + ) + ) + ) + else: + assert False, "Unrecongnized coordinate type, %s, line:%d" % ( # noqa: UP031 + loglines[i].split()[0], + i, + ) + + elif ( + loglines[i][1:56] + == "Lattice vectors: (Cartesian coordinate: in unit of a_0)" + ): + # add the cell information for previous structures + while len(cells) < len(coords) - 1: 
+ cells.append(cells[-1]) + # get current cell information + cells.append([]) + for k in range(1, 4): + cells[-1].append( + list( + map( + lambda x: float(x) * a0 * bohr2ang, + loglines[i + k].split()[0:3], + ) + ) + ) + + elif line[1:14] == "final etot is" or "#TOTAL ENERGY#" in line: + # add the energy for previous structures whose SCF is not converged + while len(energy) < len(coords) - 1: + energy.append(np.nan) + # get the energy of current structure + energy.append(float(line.split()[-2])) + + # in some relax method (like: bfgs_trad), the coordinate is not outputed in running_relax.log + # but if out_stru is true, then STRU_ION*_D will be outputed in OUT.ABACUS + # we should read cell and coord from STRU_ION*_D files + if len(energy) > 1 and len(coords) == 1: + # the energies of all structrues are collected, but coords have only the first structure + if ( + stru_files is not None and len(stru_files) > 1 + ): # if stru_files are not only STRU_ION_D + stru_file_name = [os.path.basename(i) for i in stru_files] + coords = coords[:1] + [np.nan for i in range(len(energy) - 1)] + coord_direct = coord_direct[:1] + [False for i in range(len(energy) - 1)] + cells = cells[:1] + [np.nan for i in range(len(energy) - 1)] + for iframe in range(1, len(energy)): + if f"STRU_ION{iframe}_D" in stru_file_name: + # read the structure from STRU_ION*_D + stru_data = get_frame_from_stru( + stru_files[stru_file_name.index(f"STRU_ION{iframe}_D")] + ) + coords[iframe] = stru_data["coords"][0] + cells[iframe] = stru_data["cells"][0] + + force = collect_force(loglines) + stress = collect_stress(loglines) + + # delete last structures which has no energy + while len(energy) < len(coords): + del coords[-1] + del coord_direct[-1] + + # add cells for last structures whose cell is not changed + while len(cells) < len(coords): + cells.append(cells[-1]) + + # only keep structures that have all of coord, force and stress + if len(stress) == 0 and len(force) == 0: + minl = len(coords) + elif 
len(stress) == 0: + minl = min(len(coords), len(force)) + force = force[:minl] + elif len(force) == 0: + minl = min(len(coords), len(stress)) + stress = stress[:minl] + else: + minl = min(len(coords), len(force), len(stress)) + force = force[:minl] + stress = stress[:minl] + + coords = coords[:minl] + energy = energy[:minl] + cells = cells[:minl] + + # delete structures whose energy is np.nan + for i in range(minl): + if ( + np.isnan(energy[i - minl]) + or np.any(np.isnan(coords[i - minl])) + or np.any(np.isnan(cells[i - minl])) + ): + del energy[i - minl] + del coords[i - minl] + del cells[i - minl] + del coord_direct[i - minl] + if len(force) > 0: + del force[i - minl] + if len(stress) > 0: + del stress[i - minl] + + energy = np.array(energy) + cells = np.array(cells) + coords = np.array(coords) + stress = np.array(stress) + force = np.array(force) + + # transfer direct coordinate to cartessian type + for i in range(len(coords)): + if coord_direct[i]: + coords[i] = coords[i].dot(cells[i]) + + if len(stress) > 0: + virial = np.zeros([len(cells), 3, 3]) + for i in range(len(cells)): + volume = np.linalg.det(cells[i, :, :].reshape([3, 3])) + virial[i] = stress[i] * kbar2evperang3 * volume + else: + virial = None + + return energy, cells, coords, force, stress, virial + + +def get_frame(fname): + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU + + data = get_frame_from_stru(geometry_path_in) + natoms = sum(data["atom_numbs"]) + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + + logf = get_log_file(fname, inlines) + assert os.path.isfile(logf), f"Error: can not find {logf}" + with open_file(logf) as f1: + lines = 
f1.readlines() + + relax_stru_files = get_relax_stru_files(os.path.dirname(logf)) + + energy, cells, coords, force, stress, virial = get_coords_from_log( + lines, natoms, stru_files=relax_stru_files + ) + + magmom, magforce = get_mag_force(lines) + + data["cells"] = cells + data["coords"] = coords + data["energies"] = energy + data["forces"] = force + if isinstance(virial, np.ndarray): + data["virials"] = virial + data["stress"] = stress + data["orig"] = np.zeros(3) + + if len(magmom) > 0: + data["spins"] = magmom + if len(magforce) > 0: + data["force_mags"] = magforce + if "move" in data: + data["move"] = [data["move"][0] for i in range(len(data["energies"]))] + + return data diff --git a/dpdata/formats/abacus/scf.py b/dpdata/formats/abacus/scf.py new file mode 100644 index 000000000..991396b65 --- /dev/null +++ b/dpdata/formats/abacus/scf.py @@ -0,0 +1,255 @@ +from __future__ import annotations + +import os +import re +import warnings + +import numpy as np + +from dpdata.utils import open_file + +from ...unit import LengthConversion, PressureConversion +from .stru import get_frame_from_stru + +bohr2ang = LengthConversion("bohr", "angstrom").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() + + +def CheckFile(ifile): + if not os.path.isfile(ifile): + print(f"Can not find file {ifile}") + return False + return True + + +def get_geometry_in(fname, inlines): + geometry_path_in = os.path.join(fname, "STRU") + for line in inlines: + if "stru_file" in line and "stru_file" == line.split()[0]: + atom_file = line.split()[1] + geometry_path_in = os.path.join(fname, atom_file) + break + return geometry_path_in + + +def get_path_out(fname, inlines): + path_out = os.path.join(fname, "OUT.ABACUS/running_scf.log") + for line in inlines: + if "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + path_out = os.path.join(fname, f"OUT.{suffix}/running_scf.log") + break + return path_out + + +def get_energy(outlines): + Etot = None 
+ for line in reversed(outlines): + if "final etot is" in line: # for LTS + Etot = float(line.split()[-2]) # in eV + return Etot, True + elif "TOTAL ENERGY" in line: # for develop + Etot = float(line.split()[-2]) # in eV + return Etot, True + elif "convergence has NOT been achieved!" in line: + return Etot, False + elif "convergence has not been achieved" in line: + return Etot, False + + return Etot, False + + +def collect_force(outlines): + force = [] + for i, line in enumerate(outlines): + # if "TOTAL-FORCE (eV/Angstrom)" in line: + if "TOTAL-FORCE" in line: + value_pattern = re.compile( + r"^\s*[A-Z][a-z]?[1-9][0-9]*\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" + ) + j = i + # find the first line of force + noforce = False + while not value_pattern.match(outlines[j]): + j += 1 + if ( + j >= i + 10 + ): # if can not find the first line of force in 10 lines, then stop + warnings.warn("Warning: can not find the first line of force") + noforce = True + break + if noforce: + break + + force.append([]) + while value_pattern.match(outlines[j]): + force[-1].append([float(ii) for ii in outlines[j].split()[1:4]]) + j += 1 + return force # only return the last force + + +def get_force(outlines, natoms): + force = collect_force(outlines) + if len(force) == 0: + return None + else: + return np.array(force[-1]) # only return the last force + + +def collect_stress(outlines): + stress = [] + for i, line in enumerate(outlines): + # if "TOTAL-STRESS (KBAR)" in line: + if "TOTAL-STRESS" in line: + value_pattern = re.compile( + r"^\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" + ) + j = i + nostress = False + while not value_pattern.match(outlines[j]): + j += 1 + if ( + j >= i + 10 + ): # if can not find the first line of stress in 10 lines, then stop + warnings.warn("Warning: can not find the first line of stress") + 
nostress = True + break + if nostress: + break + + stress.append([]) + while value_pattern.match(outlines[j]): + stress[-1].append( + list(map(lambda x: float(x), outlines[j].split()[0:3])) + ) + j += 1 + return stress + + +def get_stress(outlines): + stress = collect_stress(outlines) + if len(stress) == 0: + return None + else: + return np.array(stress[-1]) * kbar2evperang3 # only return the last stress + + +def get_mag_force(outlines): + """Read atomic magmom and magnetic force from OUT.ABACUS/running_scf.log. + + Returns + ------- + magmom: list of list of atomic magnetic moments (three dimensions: ION_STEP * NATOMS * 1/3) + magforce: list of list of atomic magnetic forces (three dimensions: ION_STEP * NATOMS * 1/3) + e.g.: + ------------------------------------------------------------------------------------------- + Total Magnetism (uB) + ------------------------------------------------------------------------------------------- + Fe 0.0000000001 0.0000000000 3.0000000307 + Fe -0.0000000000 -0.0000000000 3.0000001151 + ------------------------------------------------------------------------------------------- + ------------------------------------------------------------------------------------------- + Magnetic force (eV/uB) + ------------------------------------------------------------------------------------------- + Fe 0.0000000000 0.0000000000 -1.2117698671 + Fe 0.0000000000 0.0000000000 -1.2117928796 + ------------------------------------------------------------------------------------------- + + """ + mags = [] + magforces = [] + for i, line in enumerate(outlines): + if "Total Magnetism (uB)" in line: + j = i + 2 + mag = [] + while "-------------------------" not in outlines[j]: + imag = [float(ii) for ii in outlines[j].split()[1:]] + if len(imag) == 1: + imag = [0, 0, imag[0]] + mag.append(imag) + j += 1 + mags.append(mag) + if "Magnetic force (eV/uB)" in line: + j = i + 2 + magforce = [] + while "-------------------------" not in outlines[j]: + 
imagforce = [float(ii) for ii in outlines[j].split()[1:]] + if len(imagforce) == 1: + imagforce = [0, 0, imagforce[0]] + magforce.append(imagforce) + j += 1 + magforces.append(magforce) + return np.array(mags), np.array(magforces) + + +def get_frame(fname): + data = { + "atom_names": [], + "atom_numbs": [], + "atom_types": [], + "cells": np.array([]), + "coords": np.array([]), + "energies": np.array([]), + "forces": np.array([]), + } + + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + + if not CheckFile(path_in): + return data + + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + + geometry_path_in = get_geometry_in(fname, inlines) + + # get OUT.ABACUS/running_scf.log + path_out = get_path_out(fname, inlines) + if not (CheckFile(geometry_path_in) and CheckFile(path_out)): + return data + with open_file(path_out) as fp: + outlines = fp.read().split("\n") + + # get energy + energy, converge = get_energy(outlines) + if not converge: + return data + + # read STRU file + data = get_frame_from_stru(geometry_path_in) + natoms = sum(data["atom_numbs"]) + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + move = data.pop("move", None) + + # get magmom and magforce, force and stress + magmom, magforce = get_mag_force(outlines) + if len(magmom) > 0: + magmom = magmom[-1:] + if len(magforce) > 0: + magforce = magforce[-1:] + + force = get_force(outlines, natoms) + stress = get_stress(outlines) + + data["energies"] = np.array(energy)[np.newaxis] + data["forces"] = np.empty((0,)) if force is None else force[np.newaxis, :, :] + data["orig"] = np.zeros(3) + if stress is not None: + cell = data["cells"][0] + stress *= np.abs(np.linalg.det(cell)) + data["virials"] = stress[np.newaxis, :, :] + + if len(magmom) > 0: + data["spins"] = magmom + if 
len(magforce) > 0: + data["force_mags"] = magforce + if move is not None: + data["move"] = move + return data diff --git a/dpdata/formats/abacus/stru.py b/dpdata/formats/abacus/stru.py new file mode 100644 index 000000000..0d899695b --- /dev/null +++ b/dpdata/formats/abacus/stru.py @@ -0,0 +1,820 @@ +from __future__ import annotations + +import os +import re +import warnings + +import numpy as np + +from ...unit import LengthConversion + +bohr2ang = LengthConversion("bohr", "angstrom").value() + + +def split_stru_block(lines): + """Split the ABACUS STRU file into blocks by keyword. + + Args: + lines (list): list of lines in the ABACUS STRU file. + + Returns + ------- + dict: dictionary of blocks. + """ + + def clean_comment(line): + return re.split("[#]", line)[0] + + ABACUS_STRU_KEYS = [ + "ATOMIC_SPECIES", + "NUMERICAL_ORBITAL", + "LATTICE_CONSTANT", + "LATTICE_VECTORS", + "ATOMIC_POSITIONS", + "NUMERICAL_DESCRIPTOR", + "PAW_FILES", + ] + blocks = {i: [] for i in ABACUS_STRU_KEYS} + i = 0 + while i < len(lines): + line = clean_comment(lines[i]).strip() + if line in ABACUS_STRU_KEYS: + key = line + for j in range(i + 1, len(lines)): + if clean_comment(lines[j]).strip() == "": + continue + elif clean_comment(lines[j]).strip() in ABACUS_STRU_KEYS: + break + else: + blocks[key].append(clean_comment(lines[j])) + i = j + else: + i += 1 + + return blocks + + +def parse_atomic_species_block(lines): + """Parse the ATOMIC_SPECIES block. + + Args: + lines (list): list of lines in the ATOMIC_SPECIES block. + + Returns + ------- + tuple: tuple of atom_names, masses, and pp_files. + + """ + atom_names, masses, pp_files = [], [], [] + for line in lines: + line = line.split() + atom_names.append(line[0]) + masses.append(float(line[1])) + + # for standard STRU, the pseudo potential file is required, + # but it is not required for dpdata. 
+ if len(line) > 2: + pp_files.append(line[2]) + else: + pp_files.append(None) + + return atom_names, masses, pp_files + + +def parse_numerical_orbital_block(lines): + """Parse the NUMERICAL_ORBITAL block. + + Args: + lines (list): list of lines in the NUMERICAL_ORBITAL block. + + Returns + ------- + list: list of orbital files. + """ + return [line.strip() for line in lines] + + +def parse_lattice_constant_block(lines): + """Parse the LATTICE_CONSTANT block. + + Args: + lines (list): list of lines in the LATTICE_CONSTANT block. + + Returns + ------- + float: the lattice constant. + """ + return float(lines[0]) + + +def parse_lattice_vectors_block(lines): + """Parse the LATTICE_VECTORS block. + + Args: + lines (list): list of lines in the LATTICE_VECTORS block. + + Returns + ------- + np.ndarray: the cell vectors. + """ + cell = np.zeros((3, 3)) + for i, line in enumerate(lines): + cell[i] = [float(x) for x in line.split()] + return cell + + +def parse_pos_oneline(pos_line): + """Parses a line from the atom position block in a structure file. + + The content in atom position block can include: + - `m` or NO key word: Three numbers (0 or 1) controlling atom movement in geometry relaxation calculations. + - `v`, `vel`, or `velocity`: Three components of initial velocity of atoms in geometry relaxation calculations. + - `mag` or `magmom`: Start magnetization for each atom. Can be one number (colinear) or three numbers (non-colinear). + - `angle1`: In non-colinear case, angle between c-axis and real spin (in degrees). + - `angle2`: In non-colinear case, angle between a-axis and real spin projection in ab-plane (in degrees). + - `cs` or `constrain`: Three numbers (0 or 1) controlling the spin constraint of the atom. + - `lambda`: Three numbers controlling the lambda of the atom. + + Parameters + ---------- + pos_line : A line from the atom position block. + + Returns + ------- + tuple: A tuple containing: + - pos (list of float): The position coordinates. 
+ - move (list of int or None): Movement control values. + - velocity (list of float or None): Initial velocity components. + - magmom (float, list of float, or None): Magnetization values. + - angle1 (float or None): Angle1 value. + - angle2 (float or None): Angle2 value. + - constrain (list of bool or None): Spin constraint values. + - lambda1 (float, list of float, or None): Lambda values. + + e.g.: + ``` + Fe + 1.0 + 2 + 0.0 0.0 0.0 m 0 0 0 mag 1.0 angle1 90 angle2 0 cs 0 0 0 + 0.5 0.5 0.5 m 1 1 1 mag 1.0 angle1 90 angle2 180 + ``` + """ + pos_line = pos_line.split("#")[0] # remove comments + sline = pos_line.split() + pos = [float(i) for i in sline[:3]] + move = None + velocity = None + magmom = None + angle1 = None + angle2 = None + constrain = None + lambda1 = None + if len(sline) > 3: + mag_list = None + velocity_list = None + move_list = [] + angle1_list = None + angle2_list = None + constrain_list = None + lambda_list = None + label = "move" + for i in range(3, len(sline)): + # firstly read the label + if sline[i] == "m": + label = "move" + elif sline[i] in ["v", "vel", "velocity"]: + label = "velocity" + velocity_list = [] + elif sline[i] in ["mag", "magmom"]: + label = "magmom" + mag_list = [] + elif sline[i] == "angle1": + label = "angle1" + angle1_list = [] + elif sline[i] == "angle2": + label = "angle2" + angle2_list = [] + elif sline[i] in ["constrain", "sc"]: + label = "constrain" + constrain_list = [] + elif sline[i] in ["lambda"]: + label = "lambda" + lambda_list = [] + + # the read the value to the list + elif label == "move": + move_list.append(int(sline[i])) + elif label == "velocity": + velocity_list.append(float(sline[i])) + elif label == "magmom": + mag_list.append(float(sline[i])) + elif label == "angle1": + angle1_list.append(float(sline[i])) + elif label == "angle2": + angle2_list.append(float(sline[i])) + elif label == "constrain": + constrain_list.append(bool(int(sline[i]))) + elif label == "lambda": + 
def get_atom_mag_cartesian(atommag, angle1, angle2):
    """Transform atommag, angle1, angle2 to magmom in cartesian coordinates.

    Parameters
    ----------
    atommag : float/list of float/None
        Atom magnetic moment.
    angle1 : float/None
        Angle between the z-axis and the real spin, in degrees.
    angle2 : float/None
        Angle between the x-axis and the spin projection in the xy-plane,
        in degrees.

    ABACUS supports defining mag, angle1 and angle2 at the same time.
    Without angles, mag is returned directly (a scalar is taken along z).
    With angles, mag only fixes the norm and the direction comes from
    angle1/angle2.
    """
    if atommag is None:
        return None
    if not isinstance(atommag, (list, float)):
        raise RuntimeError(f"Invalid atommag: {atommag}")

    if angle1 is None and angle2 is None:
        # no direction given: a scalar moment is taken along z
        return atommag if isinstance(atommag, list) else [0, 0, atommag]

    theta = np.radians(angle1 if angle1 is not None else 0)
    phi = np.radians(angle2 if angle2 is not None else 0)
    norm = np.linalg.norm(atommag) if isinstance(atommag, list) else atommag
    return [
        norm * np.sin(theta) * np.cos(phi),
        norm * np.sin(theta) * np.sin(phi),
        norm * np.cos(theta),
    ]


def get_cartesian_coords(coords, coord_type, celldm, cell):
    """Transform the atomic coordinates to cartesian coordinates.

    Args:
        coords (np.ndarray): atomic coordinates read from the STRU file.
        coord_type (str): the coordination type, either "cartesian" or "direct".
        celldm (float): the lattice constant.
        cell (np.ndarray): the cell vectors in angstrom.

    Returns
    -------
    np.ndarray: the cartesian coordinates in angstrom.
    """
    if coord_type == "cartesian":
        # STRU cartesian coordinates are in units of celldm (Bohr)
        return coords * celldm * bohr2ang
    if coord_type == "direct":
        return np.matmul(coords, cell)
    raise RuntimeError(f"Invalid coordination type: {coord_type}")
+ + Returns + ------- + tuple: tuple of atom_numbs, coords, move, mags, velocity, sc, lambda_ + Note: for atomic magnetic moment, we finnaly transform it to non-collinear magnetic moment in cartesian coordinates, + and do not return the angle1 and angle2, and the magnetic moment of each atom type. + + """ + coord_type = coords_lines[0].split()[0].lower() # cartisan or direct + atom_numbs = [] # the number of each atom type + coords = [] # coordinations of atoms + move = [] # move flag of each atom + velocity = [] # velocity of each atom + mags = [] # magnetic moment of each atom + sc = [] # spin constraint flag of each atom + lambda_ = [] # lambda of each atom + + ntype = len(atom_names) + line_idx = 1 # starting line of first element + define_atom_mag = False + for it in range(ntype): + atom_name = coords_lines[line_idx].split()[0] + if atom_name != atom_names[it]: + raise RuntimeError( + f"Read atom name '{atom_name}' is not equal to the expected atom name '{atom_names[it]}'" + ) + atom_type_mag = float(coords_lines[line_idx + 1].split()[0]) + line_idx += 2 + atom_numbs.append(int(coords_lines[line_idx].split()[0])) + line_idx += 1 + for iline in range(atom_numbs[it]): + pos, imove, ivelocity, imagmom, iangle1, iangle2, iconstrain, ilambda1 = ( + parse_pos_oneline(coords_lines[line_idx]) + ) + + coords.append(get_cartesian_coords(np.array(pos), coord_type, celldm, cell)) + + move.append(imove) + velocity.append(ivelocity) + sc.append(iconstrain) + lambda_.append(ilambda1) + + # calculate the magnetic moment in cartesian coordinates + mag = get_atom_mag_cartesian(imagmom, iangle1, iangle2) + if mag is None: + mag = [0, 0, atom_type_mag] + mags.append(mag) + + if imagmom is not None: + define_atom_mag = True + + line_idx += 1 + coords = np.array(coords) # need transformation!!! 
+ + if all([i is None for i in move]): + move = [] + else: + move = np.array(move, dtype=bool) + + if all([i is None for i in velocity]): + velocity = [] + else: + velocity = np.array(velocity) + + if all([i is None for i in sc]): + sc = [] + + if all([i is None for i in lambda_]): + lambda_ = [] + + # here return the magnetic moment only when the atom magnetic moment is specified. + if not define_atom_mag: + mags = [] + else: + mags = np.array(mags) + + return atom_numbs, coords, move, mags, velocity, sc, lambda_ + + +def right_hand_rule( + cell: np.ndarray, coord: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """Rotate the cell and coord to make the cell fit the right-hand rule. + + Args: + cell (np.ndarray): the cell vectors. + coord (np.ndarray): the atomic coordinates in cartesian. + + Returns + ------- + tuple: the rotated cell and coord. + """ + if np.linalg.det(cell) < 0: + cell = -cell + coord = -coord + return cell, coord + + +def get_frame_from_stru(stru): + """Read the ABACUS STRU file and return the dpdata frame. + + The description of ABACUS STRU can be found in https://abacus.deepmodeling.com/en/latest/advanced/input_files/stru.html + + Args: + stru (str): path to the ABACUS STRU file. + + Returns + ------- + data: the parsed stru information in dictionary. + { + "atom_names": list of atom names, + "atom_numbs": list of atom numbers, + "atom_types": list of atom types, + "masses": list of atomic masses, + "pp_files", list of pseudo potential files, + "orb_files", list of orbital files, + "dpks_descriptor": the deepks descriptor file, + + # below are the information in each frame + + "cells": list of cell vectors, + "coords": list of atomic coordinates, + "spins": list of magnetic moments, # return only when set "mag xxx" for each atom in STRU file + "moves": list of move flags, + } + For some keys, if the information is not provided in the STRU file, then it will not be included in the dictionary. 
+ "spins" is designed for delta spin calculation, and when dpdata.System is write to lmp format, the spin will be written as magmom. + But we should note that this file format is valid only for a spin lammps job, not for a normal job. + If you want to use dpgen to run the non-spin job, then you should not define "mag x x x" in the STRU file. + """ + if not os.path.isfile(stru): + raise FileNotFoundError(f"ABACUS STRU file {stru} not found!!!") + + # 1. read the file and split the lines to blocks + with open(stru) as f: + lines = f.readlines() + blocks = split_stru_block(lines) + + # 2. parse the blocks + atom_names, masses, pp_files = parse_atomic_species_block(blocks["ATOMIC_SPECIES"]) + orb_files = parse_numerical_orbital_block(blocks.get("NUMERICAL_ORBITAL", [])) + dpks_descriptor = blocks.get("NUMERICAL_DESCRIPTOR", []) + celldm = parse_lattice_constant_block(blocks["LATTICE_CONSTANT"]) + cell = parse_lattice_vectors_block(blocks["LATTICE_VECTORS"]) + cell = np.array(cell) * celldm * bohr2ang + atom_numbs, coords, move, mags, velocity, sc, lambda_ = parse_pos( + blocks["ATOMIC_POSITIONS"], atom_names, celldm, cell + ) + + cell, coords = right_hand_rule(cell, coords) + data = { + "atom_names": atom_names, + "atom_numbs": atom_numbs, + "atom_types": np.array( + [i for i in range(len(atom_numbs)) for j in range(atom_numbs[i])] + ), + "masses": np.array(masses), + "pp_files": pp_files, + "cells": np.array([cell]), + "coords": np.array([coords]), + } + if len(mags) > 0: + data["spins"] = np.array([mags]) + if len(orb_files) > 0: + data["orb_files"] = orb_files + if len(dpks_descriptor) > 0: + data["dpks_descriptor"] = dpks_descriptor[0].strip() + if len(move) > 0: + data["move"] = np.array([move]) + + return data + + +def make_unlabeled_stru( + data, + frame_idx, + pp_file=None, + numerical_orbital=None, + numerical_descriptor=None, + mass=None, + move=None, + velocity=None, + mag=None, + angle1=None, + angle2=None, + sc=None, + lambda_=None, + link_file=False, + 
dest_dir=None, + **kwargs, +): + """Make an unlabeled STRU file from a dictionary. + + Parameters + ---------- + data : dict + System data + frame_idx : int + The index of the frame to dump + pp_file : list of string or dict + List of pseudo potential files, or a dictionary of pseudo potential files for each atomnames + numerical_orbital : list of string or dict, optional + List of orbital files, or a dictionary of orbital files for each atomnames + numerical_descriptor : str, optional + numerical descriptor file + mass : list of float, optional + List of atomic masses + move : list of (list of list of bool), optional + List of the move flag of each xyz direction of each atom for each frame + velocity : list of list of float, optional + List of the velocity of each xyz direction of each atom + mag : list of (list of float or float), optional + List of the magnetic moment of each atom, can be a list of three floats or one float + For noncollinear, three floats are the xyz component of the magnetic moment. + For collinear, one float is the norm of the magnetic moment. + angle1 : list of float, optional + List of the angle1 of each atom. For noncollinear calculation, it is the angle between the magnetic moment and the z-axis. + angle2 : list of float, optional + List of the angle2 of each atom. For noncollinear calculation, it is the angle between the projection of magnetic moment on xy plane and the x-axis. + sc : list of (bool or list of 3 bool), optional + List of the spin constraint flag of each atom. Each element can be a bool or a list of three bools or None. + lambda_ : list of (float or list of 3 float), optional + List of the lambda of each atom. Each element can be a float or a list of three floats. + link_file : bool, optional + Whether to link the pseudo potential files and orbital files in the STRU file. + If True, then only filename will be written in the STRU file, and make a soft link to the real file. 
+ dest_dir : str, optional + The destination directory to make the soft link of the pseudo potential files and orbital files. + For velocity, mag, angle1, angle2, sc, and lambda_, if the value is None, then the corresponding information will not be written. + ABACUS support defining "mag" and "angle1"/"angle2" at the same time, and in this case, the "mag" only define the norm of the magnetic moment, and "angle1" and "angle2" define the direction of the magnetic moment. + If data has spins, then it will be written as mag to STRU file; while if mag is passed at the same time, then mag will be used. + """ + + def _link_file(dest_dir, src_file): + if not os.path.isfile(src_file): + print(f"ERROR: link_file: {src_file} is not a file.") + return False + src_file = os.path.abspath(src_file) + if not os.path.isdir(dest_dir): + os.makedirs(dest_dir) + dest_file = os.path.join(dest_dir, os.path.basename(src_file)) + if os.path.isfile(dest_file): + if os.path.samefile(src_file, dest_file): + return True + else: + os.remove(dest_file) + os.symlink(src_file, dest_file) + return True + + def ndarray2list(i): + if isinstance(i, np.ndarray): + return i.tolist() + else: + return i + + def process_file_input(file_input, atom_names, input_name): + # For pp_file and numerical_orbital, process the file input, and return a list of file names + # file_input can be a list of file names, or a dictionary of file names for each atom names + if isinstance(file_input, (list, tuple)): + if len(file_input) != len(atom_names): + raise ValueError( + f"{input_name} length is not equal to the number of atom types" + ) + return file_input + elif isinstance(file_input, dict): + for element in atom_names: + if element not in file_input: + raise KeyError(f"{input_name} does not contain {element}") + return [file_input[element] for element in atom_names] + else: + raise ValueError(f"Invalid {input_name}: {file_input}") + + if link_file and dest_dir is None: + print( + "WARNING: make_unlabeled_stru: 
link_file is True, but dest_dir is None. Will write the filename to STRU but not making soft link." + ) + if dest_dir is not None and dest_dir.strip() == "": + dest_dir = "." + + # check the input data + if mass is None and data.get("masses") is not None and len(data["masses"]) > 0: + mass = data["masses"] + + if ( + pp_file is None + and data.get("pp_files") is not None + and len(data["pp_files"]) > 0 + ): + pp_file = data["pp_files"] + + if ( + numerical_orbital is None + and data.get("orb_files") is not None + and len(data["orb_files"]) > 0 + ): + numerical_orbital = data["orb_files"] + + if numerical_descriptor is None and data.get("dpks_descriptor") is not None: + numerical_descriptor = data["dpks_descriptor"] + + if mag is None and data.get("spins") is not None and len(data["spins"]) > 0: + mag = data["spins"][frame_idx] + + if move is None and data.get("move", None) is not None and len(data["move"]) > 0: + move = data["move"][frame_idx] + + # check the length of the input data + atom_numbs = sum(data["atom_numbs"]) + for key in [move, velocity, mag, angle1, angle2, sc, lambda_]: + if key is not None: + if ( + not isinstance(ndarray2list(key), (list, tuple)) + and len(key) != atom_numbs + ): + key_name = [name for name, value in locals().items() if value is key][0] + print( + f"ERROR: make_unlabeled_stru: the length of '{key_name}' ({len(key)}) should be equal to the number of atom number ({atom_numbs})." + ) + return "" + + # ATOMIC_SPECIES block + out = "ATOMIC_SPECIES\n" + if pp_file is not None: + ppfiles = process_file_input( + ndarray2list(pp_file), data["atom_names"], "pp_file" + ) + else: + warnings.warn( + "pp_file is not provided, will use empty string for pseudo potential file." 
+ ) + ppfiles = [""] * len(data["atom_names"]) + + for iele in range(len(data["atom_names"])): + if data["atom_numbs"][iele] == 0: + continue + out += data["atom_names"][iele] + " " + if mass is not None: + out += f"{mass[iele]:.3f} " + else: + out += "1 " + + ipp_file = ppfiles[iele] + if ipp_file != "": + if not link_file: + out += ipp_file + else: + out += os.path.basename(ipp_file.rstrip("/")) + if dest_dir is not None: + _link_file(dest_dir, ipp_file) + out += "\n" + out += "\n" + + # NUMERICAL_ORBITAL block + if numerical_orbital is not None: + numerical_orbital = ndarray2list(numerical_orbital) + orbfiles = process_file_input( + numerical_orbital, data["atom_names"], "numerical_orbital" + ) + orbfiles = [ + orbfiles[i] + for i in range(len(data["atom_names"])) + if data["atom_numbs"][i] != 0 + ] + out += "NUMERICAL_ORBITAL\n" + for iorb in orbfiles: + if not link_file: + out += iorb + else: + out += os.path.basename(iorb.rstrip("/")) + if dest_dir is not None: + _link_file(dest_dir, iorb) + out += "\n" + out += "\n" + + # deepks block + if numerical_descriptor is not None: + assert isinstance(numerical_descriptor, str) + if not link_file: + out += f"NUMERICAL_DESCRIPTOR\n{numerical_descriptor}\n" + else: + out += f"NUMERICAL_DESCRIPTOR\n{os.path.basename(numerical_descriptor)}\n" + if dest_dir is not None: + _link_file(dest_dir, numerical_descriptor) + out += "\n" + + # LATTICE_CONSTANT and LATTICE_VECTORS block + out += "LATTICE_CONSTANT\n" + out += str(1 / bohr2ang) + "\n\n" + + out += "LATTICE_VECTORS\n" + for ix in range(3): + for iy in range(3): + out += str(data["cells"][frame_idx][ix][iy]) + " " + out += "\n" + out += "\n" + + # ATOMIC_POSITIONS block + out += "ATOMIC_POSITIONS\n" + out += "Cartesian # Cartesian(Unit is LATTICE_CONSTANT)\n" + # ret += "\n" + natom_tot = 0 # in for loop, it is also the atom index + for iele in range(len(data["atom_names"])): + if data["atom_numbs"][iele] == 0: + continue + out += data["atom_names"][iele] + "\n" + out 
+= "0.0\n" + out += str(data["atom_numbs"][iele]) + "\n" + for iatom in range(data["atom_numbs"][iele]): + iatomtype = np.nonzero(data["atom_types"] == iele)[0][ + iatom + ] # it is the atom index + iout = f"{data['coords'][frame_idx][iatomtype, 0]:.12f} {data['coords'][frame_idx][iatomtype, 1]:.12f} {data['coords'][frame_idx][iatomtype, 2]:.12f}" + # add flags for move, velocity, mag, angle1, angle2, and sc + if move is not None: + if ( + isinstance(ndarray2list(move[iatomtype]), (list, tuple)) + and len(move[iatomtype]) == 3 + ): + iout += " " + " ".join( + ["1" if ii else "0" for ii in move[iatomtype]] + ) + elif isinstance(ndarray2list(move[iatomtype]), (int, float, bool)): + iout += " 1 1 1" if move[iatomtype] else " 0 0 0" + else: + iout += " 1 1 1" + + if ( + velocity is not None + and isinstance(ndarray2list(velocity[iatomtype]), (list, tuple)) + and len(velocity[iatomtype]) == 3 + ): + iout += " v " + " ".join([f"{ii:.12f}" for ii in velocity[iatomtype]]) + + if mag is not None: + if isinstance(ndarray2list(mag[iatomtype]), (list, tuple)) and len( + mag[iatomtype] + ) in [1, 3]: + iout += " mag " + " ".join([f"{ii:.12f}" for ii in mag[iatomtype]]) + elif isinstance(ndarray2list(mag[iatomtype]), (int, float)): + iout += " mag " + f"{mag[iatomtype]:.12f}" + + if angle1 is not None and isinstance( + ndarray2list(angle1[iatomtype]), (int, float) + ): + iout += " angle1 " + f"{angle1[iatomtype]:.12f}" + + if angle2 is not None and isinstance( + ndarray2list(angle2[iatomtype]), (int, float) + ): + iout += " angle2 " + f"{angle2[iatomtype]:.12f}" + + if sc is not None: + if isinstance(ndarray2list(sc[iatomtype]), (list, tuple)) and len( + sc[iatomtype] + ) in [1, 3]: + iout += " sc " + " ".join( + ["1" if ii else "0" for ii in sc[iatomtype]] + ) + elif isinstance(ndarray2list(sc[iatomtype]), (int, float, bool)): + iout += " sc " + "1" if sc[iatomtype] else "0" + + if lambda_ is not None: + if isinstance(ndarray2list(lambda_[iatomtype]), (list, tuple)) and len( + 
lambda_[iatomtype] + ) in [1, 3]: + iout += " lambda " + " ".join( + [f"{ii:.12f}" for ii in lambda_[iatomtype]] + ) + elif isinstance(ndarray2list(lambda_[iatomtype]), (int, float)): + iout += " lambda " + f"{lambda_[iatomtype]:.12f}" + + out += iout + "\n" + natom_tot += 1 + assert natom_tot == sum(data["atom_numbs"]) + return out diff --git a/dpdata/formats/amber/__init__.py b/dpdata/formats/amber/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/amber/mask.py b/dpdata/formats/amber/mask.py new file mode 100644 index 000000000..155e2a7be --- /dev/null +++ b/dpdata/formats/amber/mask.py @@ -0,0 +1,42 @@ +"""Amber mask.""" + +from __future__ import annotations + +try: + import parmed +except ImportError: + pass + + +def pick_by_amber_mask(param, maskstr, coords=None): + """Pick atoms by amber masks. + + Parameters + ---------- + param : str or parmed.Structure + filename of Amber param file or parmed.Structure + maskstr : str + Amber masks + coords : np.ndarray (optional) + frame coordinates, shape: N*3 + """ + parm = load_param_file(param) + if coords is not None: + parm.initialize_topology(xyz=coords) + sele = [] + if len(maskstr) > 0: + newmaskstr = maskstr.replace("@0", "!@*") + sele = [ + parm.atoms[i].idx + for i in parmed.amber.mask.AmberMask(parm, newmaskstr).Selected() + ] + return sele + + +def load_param_file(param_file): + if isinstance(param_file, str): + return parmed.load_file(param_file) + elif isinstance(param_file, parmed.Structure): + return param_file + else: + raise RuntimeError("Unsupported structure") diff --git a/dpdata/formats/amber/md.py b/dpdata/formats/amber/md.py new file mode 100644 index 000000000..54c8bb8d2 --- /dev/null +++ b/dpdata/formats/amber/md.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import os +import re + +import numpy as np + +from dpdata.formats.amber.mask import pick_by_amber_mask +from dpdata.unit import EnergyConversion +from dpdata.utils import open_file + +from 
...periodic_table import ELEMENTS + +kcalmol2eV = EnergyConversion("kcal_mol", "eV").value() +symbols = ["X"] + ELEMENTS + +energy_convert = kcalmol2eV +force_convert = energy_convert + + +def cell_lengths_angles_to_cell( + cell_lengths: np.ndarray, cell_angles: np.ndarray +) -> np.ndarray: + """Convert cell lengths and angles to cell vectors. + + Parameters + ---------- + cell_lengths + Cell lengths with shape ``(..., 3)`` where the last dimension is + ``a, b, c``. + cell_angles + Cell angles in degrees with shape ``(..., 3)`` where the last dimension + is ``alpha, beta, gamma``. + + Returns + ------- + np.ndarray + Cell vectors with shape ``(..., 3, 3)``. + """ + alpha = np.deg2rad(cell_angles[..., 0]) + beta = np.deg2rad(cell_angles[..., 1]) + gamma = np.deg2rad(cell_angles[..., 2]) + + a = cell_lengths[..., 0] + b = cell_lengths[..., 1] + c = cell_lengths[..., 2] + + if np.any(cell_lengths <= 0.0): + raise RuntimeError("Invalid AMBER cell lengths") + if np.any((cell_angles <= 0.0) | (cell_angles >= 180.0)): + raise RuntimeError("Invalid AMBER cell angles") + + cos_alpha = np.cos(alpha) + cos_beta = np.cos(beta) + cos_gamma = np.cos(gamma) + sin_gamma = np.sin(gamma) + ly = b * sin_gamma + if np.any(ly <= 1e-8): + raise RuntimeError("Invalid AMBER cell angles") + + z_factor = ( + 1 + - cos_alpha**2 + - cos_beta**2 + - cos_gamma**2 + + 2 * cos_alpha * cos_beta * cos_gamma + ) + lz2 = c**2 * z_factor / sin_gamma**2 + if np.any(lz2 <= 1e-8): + raise RuntimeError("Invalid AMBER cell angles") + + z = np.sqrt(z_factor) / sin_gamma + + shape = (*cell_lengths.shape[:-1], 3, 3) + cells = np.zeros(shape) + cells[..., 0, 0] = a + cells[..., 1, 0] = b * cos_gamma + cells[..., 1, 1] = b * sin_gamma + cells[..., 2, 0] = c * cos_beta + cells[..., 2, 1] = c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma + cells[..., 2, 2] = c * z + return cells + + +def read_amber_traj( + parm7_file, + nc_file, + mdfrc_file=None, + mden_file=None, + mdout_file=None, + 
use_element_symbols=None, + labeled=True, +): + """The amber trajectory includes: + * nc, NetCDF format, stores coordinates + * mdfrc, NetCDF format, stores forces + * mden (optional), text format, stores energies + * mdout (optional), text format, may store energies if there is no mden_file + * parm7, text format, stores types. + + Parameters + ---------- + parm7_file, nc_file, mdfrc_file, mden_file, mdout_file: + filenames + use_element_symbols : None or list or str + If use_element_symbols is a list of atom indexes, these atoms will use element symbols + instead of amber types. For example, a ligand will use C, H, O, N, and so on + instead of h1, hc, o, os, and so on. + IF use_element_symbols is str, it will be considered as Amber mask. + labeled : bool + Whether to return labeled data + """ + from scipy.io import netcdf_file + + flag_atom_type = False + flag_atom_numb = False + amber_types = [] + atomic_number = [] + with open_file(parm7_file) as f: + for line in f: + if line.startswith("%FLAG"): + flag_atom_type = line.startswith("%FLAG AMBER_ATOM_TYPE") + flag_atom_numb = (use_element_symbols is not None) and line.startswith( + "%FLAG ATOMIC_NUMBER" + ) + elif flag_atom_type or flag_atom_numb: + if line.startswith("%FORMAT"): + fmt = re.findall(r"\d+", line) + fmt0 = int(fmt[0]) + fmt1 = int(fmt[1]) + else: + for ii in range(fmt0): + start_index = ii * fmt1 + end_index = (ii + 1) * fmt1 + if end_index >= len(line): + continue + content = line[start_index:end_index].strip() + if flag_atom_type: + amber_types.append(content) + elif flag_atom_numb: + atomic_number.append(int(content)) + if use_element_symbols is not None: + if isinstance(use_element_symbols, str): + use_element_symbols = pick_by_amber_mask(parm7_file, use_element_symbols) + for ii in use_element_symbols: + amber_types[ii] = symbols[atomic_number[ii]] + + with netcdf_file(nc_file, "r") as f: + coords = np.array(f.variables["coordinates"][:]) + cell_lengths = 
kcal2ev = EnergyConversion("kcal_mol", "eV").value()

# Parser states for the line-scanning state machine in parse_sqm_out.
START = 0
READ_CHARGE = 2
READ_COORDS_START = 3
READ_COORDS = 6
READ_FORCES = 7


def parse_sqm_out(fname: FileType):
    """Read atom symbols, charges and coordinates from ambertools sqm.out file."""
    # Accumulators filled during a single pass over the file.
    atom_symbols = []
    coords = []
    charges = []
    forces = []
    energies = []

    with open_file(fname) as f:
        # `state` selects how each subsequent line is interpreted.
        state = START
        for line in f:
            if line.startswith(" Total SCF energy"):
                # Keep only the most recent SCF energy.
                energies = [float(line.strip().split()[-2])]
            elif line.startswith(" Atom Element Mulliken Charge"):
                state = READ_CHARGE
                charges = []
            elif line.startswith(" Total Mulliken Charge"):
                state = START
            elif line.startswith(" Final Structure"):
                state = READ_COORDS_START
                coords = []
            elif line.startswith("QMMM: Forces on QM atoms"):
                state = READ_FORCES
                forces = []
            elif state == READ_CHARGE:
                parts = line.strip().split()
                atom_symbols.append(parts[-2])
                charges.append(float(parts[-1]))
            elif READ_COORDS_START <= state < READ_COORDS:
                # Skip the header lines between "Final Structure" and the coords.
                state += 1
            elif state == READ_COORDS:
                coords.append([float(word) for word in line.strip().split()[-3:]])
                if len(coords) == len(charges):
                    state = START
            elif state == READ_FORCES:
                stripped = line.strip()
                if not stripped.startswith("QMMM: Atm "):
                    state = START
                    continue
                # Force components live in fixed-width columns at the line end.
                forces.append(
                    [
                        float(stripped[-60:-40]),
                        float(stripped[-40:-20]),
                        float(stripped[-20:]),
                    ]
                )
                if len(forces) == len(charges):
                    state = START

    data = {}
    atom_names, data["atom_types"], atom_numbs = np.unique(
        atom_symbols, return_inverse=True, return_counts=True
    )
    data["charges"] = np.array(charges)
    data["atom_names"] = list(atom_names)
    data["atom_numbs"] = list(atom_numbs)
    data["orig"] = np.array([0, 0, 0])
    # Large dummy cell; the system is marked non-periodic below.
    data["cells"] = np.array(
        [[[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]]]
    )
    data["nopbc"] = True
    data["coords"] = np.array([coords])

    energies = np.array(energies)
    # sqm prints gradients in kcal/mol/A; negate to get forces in eV/A.
    forces = -np.array([forces], dtype=np.float64) * kcal2ev
    if len(forces) > 0:
        data["energies"] = energies
        data["forces"] = forces

    return data
def cell_to_low_triangle(A, B, C, alpha, beta, gamma):
    """Convert cell parameters to a lower-triangular cell matrix.

    Parameters
    ----------
    A : float
        cell length A
    B : float
        cell length B
    C : float
        cell length C
    alpha : float
        radian. The angle between vector B and vector C.
    beta : float
        radian. The angle between vector A and vector C.
    gamma : float
        radian. The angle between vector A and vector B.

    Returns
    -------
    cell : np.ndarray
        The 3x3 cell matrix used by dpdata in lower-triangular form (float32).

    Raises
    ------
    RuntimeError
        If an angle is outside (5, 175) degrees, a length is not greater
        than 0.2, or the resulting cell would be numerically degenerate.
    """
    # Angles must be radians well away from 0 and pi, otherwise the
    # triangularization below is numerically unstable.
    if not np.pi * 5 / 180 < alpha < np.pi * 175 / 180:
        raise RuntimeError(
            f"alpha=={alpha}: must be a radian, and \
            must be in np.pi*5/180 < alpha < np.pi*175/180"
        )
    if not np.pi * 5 / 180 < beta < np.pi * 175 / 180:
        raise RuntimeError(
            f"beta=={beta}: must be a radian, and \
            must be in np.pi*5/180 < beta < np.pi*175/180"
        )
    if not np.pi * 5 / 180 < gamma < np.pi * 175 / 180:
        raise RuntimeError(
            f"gamma=={gamma}: must be a radian, and \
            must be in np.pi*5/180 < gamma < np.pi*175/180"
        )
    if not A > 0.2:
        raise RuntimeError(f"A=={A}, must be greater than 0.2")
    if not B > 0.2:
        raise RuntimeError(f"B=={B}, must be greater than 0.2")
    if not C > 0.2:
        raise RuntimeError(f"C=={C}, must be greater than 0.2")

    # Standard triclinic -> lower-triangular conversion.
    lx = A
    xy = B * np.cos(gamma)
    xz = C * np.cos(beta)
    ly = B * np.sin(gamma)
    if not ly > 0.1:
        # fix: the original passed the message and `format(ly)` (the builtin)
        # as two separate RuntimeError arguments, so "{}" was never filled in.
        raise RuntimeError(f"ly:=B* np.sin(gamma)=={ly}, must be greater than 0.1")
    yz = (B * C * np.cos(alpha) - xy * xz) / ly
    if not C**2 - xz**2 - yz**2 > 0.01:
        # fix: same broken message/format(...) argument pair as above.
        raise RuntimeError(
            f"lz^2:=C**2-xz**2-yz**2=={C**2 - xz**2 - yz**2}, must be greater than 0.01"
        )
    lz = np.sqrt(C**2 - xz**2 - yz**2)
    cell = np.asarray([[lx, 0, 0], [xy, ly, 0], [xz, yz, lz]]).astype("float32")
    return cell
CP2K GO! \*+") +delimiter_p2 = re.compile(r"^ \*+") +delimiter_patterns.append(delimiter_p1) +delimiter_patterns.append(delimiter_p2) +avail_patterns = [] +avail_patterns.append(re.compile(r"^ INITIAL POTENTIAL ENERGY")) +avail_patterns.append(re.compile(r"^ ENSEMBLE TYPE")) + + +class Cp2kSystems: + """deal with cp2k outputfile.""" + + def __init__(self, log_file_name, xyz_file_name, restart=False): + self.log_file_object = open(log_file_name) + self.xyz_file_object = open(xyz_file_name) + self.log_block_generator = self.get_log_block_generator() + self.xyz_block_generator = self.get_xyz_block_generator() + self.restart_flag = restart + + self.cell = None + self.print_level = None + + self.atomic_kinds = None + + if self.restart_flag: + self.handle_single_log_frame(next(self.log_block_generator)) + + def __del__(self): + self.log_file_object.close() + self.xyz_file_object.close() + + def __iter__(self): + return self + + def __next__(self): + info_dict = {} + log_info_dict = self.handle_single_log_frame(next(self.log_block_generator)) + # print(log_info_dict) + xyz_info_dict = self.handle_single_xyz_frame(next(self.xyz_block_generator)) + # eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])] + # eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])] + # eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])] + # assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') + # assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') + # assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file. 
If it is a restart task; use restart=True') + assert math.isclose( + log_info_dict["energies"][0], xyz_info_dict["energies"][0], abs_tol=1.0e-6 + ), ( + log_info_dict["energies"], + xyz_info_dict["energies"], + "There may be errors in the file", + ) + info_dict.update(log_info_dict) + info_dict.update(xyz_info_dict) + return info_dict + + def get_log_block_generator(self): + lines = [] + delimiter_flag = False + yield_flag = False + while True: + line = self.log_file_object.readline() + if line: + lines.append(line) + if any(p.match(line) for p in delimiter_patterns): + if delimiter_flag is True: + yield_flag = True + yield lines + lines = [] + delimiter_flag = False + else: + line = self.log_file_object.readline() + lines.append(line) + if any(p.match(line) for p in avail_patterns): + delimiter_flag = True + else: + if not yield_flag: + raise StopIteration("None of the delimiter patterns are matched") + break + if delimiter_flag is True: + raise RuntimeError("This file lacks some content, please check") + + def get_xyz_block_generator(self): + p3 = re.compile(r"^\s*(\d+)\s*") + yield_flag = False + while True: + line = self.xyz_file_object.readline() + if not line: + if not yield_flag: + raise StopIteration("None of the xyz patterns are matched") + break + if p3.match(line): + yield_flag = True + atom_num = int(p3.match(line).group(1)) + lines = [] + lines.append(line) + for ii in range(atom_num + 1): + lines.append(self.xyz_file_object.readline()) + if not lines[-1]: + raise RuntimeError( + f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" + ) + yield lines + + def handle_single_log_frame(self, lines): + info_dict = {} + energy_pattern_1 = re.compile( + r" INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" + ) + # CONSERVED QUANTITY [hartree] = -0.279168013085E+04 + energy_pattern_2 = re.compile( + r" POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" + ) + energy = None + cell_length_pattern = re.compile( + r" (INITIAL ){0,1}CELL 
LNTHS\[bohr\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_angle_pattern = re.compile( + r" (INITIAL ){0,1}CELL ANGLS\[deg\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_A, cell_B, cell_C = ( + 0, + 0, + 0, + ) + cell_alpha, cell_beta, cell_gamma = ( + 0, + 0, + 0, + ) + cell_a_pattern = re.compile( + r" CELL\| Vector a \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_b_pattern = re.compile( + r" CELL\| Vector b \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_c_pattern = re.compile( + r" CELL\| Vector c \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + force_start_pattern = re.compile(r" ATOMIC FORCES in") + force_flag = False + force_end_pattern = re.compile(r" SUM OF ATOMIC FORCES") + force_lines = [] + cell_flag = 0 + print_level_pattern = re.compile( + r" GLOBAL\| Global print level\s+(?P\S+)" + ) + print_level_flag = 0 + atomic_kinds_pattern = re.compile(r"\s+\d+\. Atomic kind:\s+(?P\S+)") + atomic_kinds = [] + stress_sign = "STRESS" + stress_flag = 0 + stress = [] + + for line in lines: + if stress_flag == 3: + if line == "\n": + stress_flag = 0 + else: + stress.append(line.split()[1:4]) + if stress_flag == 2: + stress_flag = 3 + if stress_flag == 1: + stress_flag = 2 + if stress_sign in line: + stress_flag = 1 + if force_start_pattern.match(line): + force_flag = True + if force_end_pattern.match(line): + assert force_flag is True, ( + force_flag, + "there may be errors in this file ", + ) + force_flag = False + if force_flag is True: + force_lines.append(line) + if energy_pattern_1.match(line): + energy = ( + float(energy_pattern_1.match(line).groupdict()["number"]) * AU_TO_EV + ) + # print('1to', energy) + if energy_pattern_2.match(line): + energy = ( + float(energy_pattern_2.match(line).groupdict()["number"]) * AU_TO_EV + ) + if cell_length_pattern.match(line): + cell_A = ( + float(cell_length_pattern.match(line).groupdict()["A"]) * AU_TO_ANG + ) + cell_B = ( + float(cell_length_pattern.match(line).groupdict()["B"]) * AU_TO_ANG + ) + 
cell_C = ( + float(cell_length_pattern.match(line).groupdict()["C"]) * AU_TO_ANG + ) + cell_flag += 1 + if cell_angle_pattern.match(line): + cell_alpha = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["alpha"]) + ) + cell_beta = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["beta"]) + ) + cell_gamma = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["gamma"]) + ) + cell_flag += 1 + if print_level_pattern.match(line): + print_level = print_level_pattern.match(line).groupdict()["print_level"] + print_level_flag += 1 + if cell_a_pattern.match(line): + cell_ax = float(cell_a_pattern.match(line).groupdict()["ax"]) + cell_ay = float(cell_a_pattern.match(line).groupdict()["ay"]) + cell_az = float(cell_a_pattern.match(line).groupdict()["az"]) + cell_flag += 1 + if cell_b_pattern.match(line): + cell_bx = float(cell_b_pattern.match(line).groupdict()["bx"]) + cell_by = float(cell_b_pattern.match(line).groupdict()["by"]) + cell_bz = float(cell_b_pattern.match(line).groupdict()["bz"]) + cell_flag += 1 + if cell_c_pattern.match(line): + cell_cx = float(cell_c_pattern.match(line).groupdict()["cx"]) + cell_cy = float(cell_c_pattern.match(line).groupdict()["cy"]) + cell_cz = float(cell_c_pattern.match(line).groupdict()["cz"]) + cell_flag += 1 + + if atomic_kinds_pattern.match(line): + akind = atomic_kinds_pattern.match(line).groupdict()["akind"] + atomic_kinds.append(akind) + if print_level_flag == 1: + self.print_level = print_level + if print_level == "LOW": + raise RuntimeError( + "please provide cp2k output with higher print level(at least MEDIUM)" + ) + + if cell_flag == 2: + self.cell = cell_to_low_triangle( + cell_A, cell_B, cell_C, cell_alpha, cell_beta, cell_gamma + ) + elif cell_flag == 5: + self.cell = np.asarray( + [ + [cell_ax, cell_ay, cell_az], + [cell_bx, cell_by, cell_bz], + [cell_cx, cell_cy, cell_cz], + ] + ).astype("float64") + if atomic_kinds: + self.atomic_kinds = atomic_kinds + # print(self.atomic_kinds) + # lx = 
cell_A + # xy = cell_B * np.cos(cell_gamma) + # xz = cell_C * np.cos(cell_beta) + # ly = cell_B* np.sin(cell_gamma) + # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly + # lz = np.sqrt(cell_C**2-xz**2-yz**2) + # self.cell = [[lx, 0 , 0], + # [xy, ly, 0 ], + # [xz, yz, lz]] + + element_index = -1 + element_dict = OrderedDict() + atom_types_idx_list = [] + forces_list = [] + for line in force_lines[3:]: + line_list = line.split() + # print(line_list) + if element_dict.get(line_list[1]): + element_dict[line_list[1]][1] += 1 + else: + element_index += 1 + element_dict[line_list[1]] = [element_index, 1] + atom_types_idx_list.append(element_dict[line_list[1]][0]) + forces_list.append( + [ + float(line_list[3]) * AU_TO_EV_EVERY_ANG, + float(line_list[4]) * AU_TO_EV_EVERY_ANG, + float(line_list[5]) * AU_TO_EV_EVERY_ANG, + ] + ) + # print(atom_types_idx_list) + # atom_names=list(element_dict.keys()) + atom_names = self.atomic_kinds + atom_numbs = [] + + GPa = PressureConversion("eV/angstrom^3", "GPa").value() + if stress: + stress = np.array(stress) + stress = stress.astype("float64") + stress = stress[np.newaxis, :, :] + # stress to virial conversion, default unit in cp2k is GPa + # note the stress is virial = stress * volume + virial = stress * np.linalg.det(self.cell) / GPa + virial = virial.squeeze() + else: + virial = None + for ii in element_dict.keys(): + atom_numbs.append(element_dict[ii][1]) + # print(atom_numbs) + info_dict["atom_names"] = atom_names + info_dict["atom_numbs"] = atom_numbs + info_dict["atom_types"] = np.asarray(atom_types_idx_list) + info_dict["print_level"] = self.print_level + info_dict["cells"] = np.asarray([self.cell]).astype("float64") + info_dict["energies"] = np.asarray([energy]).astype("float64") + info_dict["forces"] = np.asarray([forces_list]).astype("float64") + if virial is not None: + info_dict["virials"] = np.asarray([virial]).astype("float64") + return info_dict + + def handle_single_xyz_frame(self, lines): + info_dict = {} + 
def get_frames(fname):
    """Parse a CP2K energy/force output file into raw frame data.

    Parameters
    ----------
    fname : str
        path to the CP2K output file

    Returns
    -------
    tuple
        (atom_names, atom_numbs, atom_types, cell, coord, energy, force,
        virial); all-empty lists and a None virial when the SCF did not
        converge.
    """
    coord_flag = False
    force_flag = False
    stress_flag = False
    eV = EnergyConversion("hartree", "eV").value()
    angstrom = LengthConversion("bohr", "angstrom").value()
    GPa = PressureConversion("eV/angstrom^3", "GPa").value()
    atom_symbol_idx_list = []
    atom_symbol_list = []
    cell = []
    coord = []
    force = []
    stress = []
    # fix: initialize so an output lacking an ENERGY| line fails the
    # "cannot find energies" assert below instead of raising NameError
    energy = None

    # `with` guarantees the file is closed even if parsing raises
    # (the original leaked the handle on any exception).
    with open(fname) as fp:
        # check if output is converged; if not, return an empty system
        content = fp.read()
        if content.count("SCF run converged") == 0:
            return [], [], [], [], [], [], [], None

        # locate duplicated headers (restarted/concatenated runs)
        fp.seek(0)
        header_idx = [
            idx for idx, ii in enumerate(fp) if "Multiplication driver" in ii
        ]

        # parse only the part after the last header
        fp.seek(0)
        for idx, ii in enumerate(fp):
            if idx <= header_idx[-1]:
                continue
            if "CELL| Vector" in ii:
                cell.append(ii.split()[4:7])
            if "Atomic kind:" in ii:
                atom_symbol_list.append(ii.split()[3])

            # beginning of coords block
            # NOTE(review): both sides of this "or" read identically here --
            # they may have been distinct whitespace variants collapsed
            # upstream; confirm against real CP2K output before changing.
            if "Atom Kind Element" in ii or "Atom Kind Element" in ii:
                coord_flag = True
            # parse coords lines
            elif coord_flag:
                if ii == "\n":
                    coord_flag = len(coord) == 0  # skip empty line at the beginning
                else:
                    coord.append(ii.split()[4:7])
                    atom_symbol_idx_list.append(ii.split()[1])

            if "ENERGY|" in ii:
                energy = ii.split()[8]
            if " Atom Kind " in ii:
                force_flag = True
                force_idx = idx
            if force_flag:
                if idx > force_idx:
                    if "SUM OF ATOMIC FORCES" in ii:
                        force_flag = False
                    else:
                        force.append(ii.split()[3:6])
            # stress tensor block (optional)
            if "STRESS TENSOR [GPa" in ii:
                stress_flag = True
                stress_idx = idx
            if stress_flag:
                if idx > stress_idx + 2:
                    if ii == "\n":
                        stress_flag = False
                    else:
                        stress.append(ii.split()[1:4])

    assert coord, "cannot find coords"
    assert energy, "cannot find energies"
    assert force, "cannot find forces"

    # convert to float arrays and add a leading nframes dimension
    cell = np.array(cell, dtype="float64")[np.newaxis, :, :]
    coord = np.array(coord, dtype="float64")[np.newaxis, :, :]
    # CP2K prints 1-based kind indices; map to 0-based
    atom_symbol_idx_list = np.array(atom_symbol_idx_list).astype(int) - 1
    atom_symbol_list = np.array(atom_symbol_list)[atom_symbol_idx_list]
    force = np.array(force, dtype="float64")[np.newaxis, :, :]

    # virial is not necessary
    if stress:
        stress = np.array(stress, dtype="float64")[np.newaxis, :, :]
        # stress-to-virial conversion; CP2K prints stress in GPa and
        # virial = stress * volume
        virial = stress * np.linalg.det(cell[0]) / GPa
    else:
        virial = None

    # unit conversions: CP2K uses hartree and hartree/bohr internally
    force = force * eV / angstrom
    energy = np.array(float(energy) * eV).astype("float64")[np.newaxis]

    # preserve the first-appearance order of atom names
    tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True)
    atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")]
    # fix: replaced the original O(natoms * ntypes) nested scan with a
    # single dict lookup per atom; output is identical
    name_to_type = {name: tt for tt, name in enumerate(atom_names)}
    atom_types = np.array([name_to_type[ss] for ss in atom_symbol_list])
    atom_numbs = [int((atom_types == tt).sum()) for tt in range(len(atom_names))]

    return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force, virial
def to_system_data(folder, type_map=None, labels=True):
    """Load a deepmd/npy folder into a dpdata data dict.

    Parameters
    ----------
    folder : str
        path containing type.raw and set.* subfolders
    type_map : list, optional
        maps type indices to atom names when type_map.raw is absent
    labels : bool, default True
        if True, also load LabeledSystem data types (energies, forces, ...)

    Returns
    -------
    dict
        system data dict
    """
    # data is empty
    data = load_type(folder, type_map=type_map)
    data["orig"] = np.zeros([3])
    if os.path.isfile(os.path.join(folder, "nopbc")):
        data["nopbc"] = True
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    all_cells = []
    all_coords = []
    for ii in sets:
        cells, coords = _load_set(ii, data.get("nopbc", False))
        nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
        all_cells.append(np.reshape(cells, [nframes, 3, 3]))
        all_coords.append(np.reshape(coords, [nframes, -1, 3]))
    data["cells"] = np.concatenate(all_cells, axis=0)
    data["coords"] = np.concatenate(all_coords, axis=0)
    # allow custom dtypes registered on the System classes
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES

    # loop-invariant: atom count does not change per dtype
    natoms = data["atom_types"].shape[0]
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "cells",
            "coords",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/npy format."
            )
            continue
        shape = [
            natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:]
        ]
        all_data = []
        for ii in sets:
            tmp = _cond_load_data(os.path.join(ii, dtype.deepmd_name + ".npy"))
            if tmp is not None:
                all_data.append(np.reshape(tmp, [tmp.shape[0], *shape]))
        if len(all_data) > 0:
            data[dtype.name] = np.concatenate(all_data, axis=0)
    return data


def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True):
    """Dump a data dict to a deepmd/npy folder.

    Parameters
    ----------
    folder : str
        target folder
    data : dict
        System or LabeledSystem data
    set_size : int, default 5000
        number of frames per set.* subfolder
    comp_prec : np.dtype, default np.float32
        precision used for floating-point frame data
    remove_sets : bool, default True
        remove pre-existing set.* subfolders; if False, raise instead
    """
    os.makedirs(folder, exist_ok=True)
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    if len(sets) > 0:
        if remove_sets:
            for ii in sets:
                shutil.rmtree(ii)
        else:
            # fix: the original concatenated the folder name and the text with
            # no separator, producing "...<folder>not a clean..."
            raise RuntimeError(
                f"found {sets} in {folder}, not a clean deepmd raw dir. "
                "please firstly clean set.* then try compress"
            )
    # dump raw
    np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d")
    np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s")
    # BondOrder System
    if "bonds" in data:
        np.savetxt(
            os.path.join(folder, "bonds.raw"),
            data["bonds"],
            header="begin_atom, end_atom, bond_order",
        )
    if "formal_charges" in data:
        np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"])
    # reshape frame properties and convert prec
    nframes = data["cells"].shape[0]
    # number of set.* folders needed to hold nframes frames
    nsets = nframes // set_size
    if set_size * nsets < nframes:
        nsets += 1
    for ii in range(nsets):
        os.makedirs(os.path.join(folder, "set.%03d" % ii))  # noqa: UP031
    # refresh the nopbc marker once (the original repeated this per set)
    try:
        os.remove(os.path.join(folder, "nopbc"))
    except OSError:
        pass
    if data.get("nopbc", False):
        with open_file(os.path.join(folder, "nopbc"), "w"):
            pass
    # allow custom dtypes
    labels = "energies" in data
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if dtype.name not in data:
            continue
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/npy format."
            )
            continue
        ddata = np.reshape(data[dtype.name], [nframes, -1])
        if np.issubdtype(ddata.dtype, np.floating):
            ddata = ddata.astype(comp_prec)
        for ii in range(nsets):
            set_stt = ii * set_size
            set_end = (ii + 1) * set_size
            set_folder = os.path.join(folder, "set.%03d" % ii)  # noqa: UP031
            np.save(os.path.join(set_folder, dtype.deepmd_name), ddata[set_stt:set_end])
+ + Parameters + ---------- + f : h5py.File or h5py.Group + HDF5 file or group object + folder : str + path in the HDF5 file + type_map : list + type map + labels : bool + labels + """ + from wcmatch.glob import globfilter + + g = f[folder] if folder else f + + data = {} + # ignore empty files or groups + if "type.raw" not in g.keys(): + return data + data["atom_types"] = g["type.raw"][:] + ntypes = np.max(data["atom_types"]) + 1 + natoms = data["atom_types"].size + data["atom_numbs"] = [] + for ii in range(ntypes): + data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) + data["atom_names"] = [] + # if find type_map.raw, use it + if "type_map.raw" in g.keys(): + my_type_map = list(np.char.decode(g["type_map.raw"][:])) + # else try to use arg type_map + elif type_map is not None: + my_type_map = type_map + # in the last case, make artificial atom names + else: + my_type_map = [] + for ii in range(ntypes): + my_type_map.append("Type_%d" % ii) # noqa: UP031 + assert len(my_type_map) >= len(data["atom_numbs"]) + for ii in range(len(data["atom_numbs"])): + data["atom_names"].append(my_type_map[ii]) + + data["orig"] = np.zeros([3]) + if "nopbc" in g.keys(): + data["nopbc"] = True + sets = globfilter(g.keys(), "set.*") + + data_types = {} + # allow custom dtypes + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + for dtype in dtypes: + if dtype.name in ( + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "real_atom_types", + "real_atom_names", + "nopbc", + ): + # skip as these data contains specific rules + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + warnings.warn( + f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/hdf5 format." 
+ ) + continue + shape = [ + natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] + ] + + data_types[dtype.name] = { + "fn": dtype.deepmd_name, + "shape": shape, + "required": dtype.required + and not (dtype.name == "cells" and data.get("nopbc", False)), + } + + for dt, prop in data_types.items(): + all_data = [] + + for ii in sets: + set = g[ii] + fn = "{}.npy".format(prop["fn"]) + if fn in set.keys(): + dd = set[fn][:] + nframes = dd.shape[0] + all_data.append(np.reshape(dd, (nframes, *prop["shape"]))) + elif prop["required"]: + raise RuntimeError(f"{folder}/{ii}/{fn} not found") + + if len(all_data) > 0: + data[dt] = np.concatenate(all_data, axis=0) + if "cells" not in data: + nframes = data["coords"].shape[0] + data["cells"] = np.zeros((nframes, 3, 3)) + return data + + +def dump( + f: h5py.File | h5py.Group, + folder: str, + data: dict, + set_size=5000, + comp_prec=np.float32, +) -> None: + """Dump data to a HDF5 file. + + Parameters + ---------- + f : h5py.File or h5py.Group + HDF5 file or group object + folder : str + path in the HDF5 file + data : dict + System or LabeledSystem data + set_size : int, default: 5000 + size of a set + comp_prec : np.dtype, default: np.float32 + precision of data + """ + # if folder is None, use the root of the file + if folder: + if folder in f: + del f[folder] + g = f.create_group(folder) + else: + g = f + # ignore empty systems + if not len(data["coords"]): + return + # dump raw (array in fact) + g.create_dataset("type.raw", data=data["atom_types"]) + g.create_dataset("type_map.raw", data=np.array(data["atom_names"], dtype="S")) + # BondOrder System + if "bonds" in data: + g.create_dataset("bonds.raw", data=data["bonds"]) + if "formal_charges" in data: + g.create_dataset("formal_charges.raw", data=data["formal_charges"]) + # reshape frame properties and convert prec + nframes = data["cells"].shape[0] + + nopbc = data.get("nopbc", False) + reshaped_data = {} + + data_types = {} + + labels = "energies" 
in data + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + # allow custom dtypes + for dtype in dtypes: + if dtype.name in ( + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "real_atom_types", + "real_atom_names", + "nopbc", + ): + # skip as these data contains specific rules + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + warnings.warn( + f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/hdf5 format." + ) + continue + + data_types[dtype.name] = { + "fn": dtype.deepmd_name, + "shape": (nframes, -1), + "dump": not (dtype.name == "cells" and nopbc), + } + + for dt, prop in data_types.items(): + if dt in data: + if prop["dump"]: + ddata = np.reshape(data[dt], prop["shape"]) + if np.issubdtype(ddata.dtype, np.floating): + ddata = ddata.astype(comp_prec) + reshaped_data[dt] = ddata + + # dump frame properties: cell, coord, energy, force and virial + nsets = nframes // set_size + if set_size * nsets < nframes: + nsets += 1 + for ii in range(nsets): + set_stt = ii * set_size + set_end = (ii + 1) * set_size + set_folder = g.create_group("set.%03d" % ii) # noqa: UP031 + for dt, prop in data_types.items(): + if dt in reshaped_data: + set_folder.create_dataset( + "{}.npy".format(prop["fn"]), data=reshaped_data[dt][set_stt:set_end] + ) + + if nopbc: + g.create_dataset("nopbc", data=True) diff --git a/dpdata/formats/deepmd/mixed.py b/dpdata/formats/deepmd/mixed.py new file mode 100644 index 000000000..734b6a730 --- /dev/null +++ b/dpdata/formats/deepmd/mixed.py @@ -0,0 +1,299 @@ +from __future__ import annotations + +import copy +import math + +import numpy as np + +import dpdata +from dpdata.data_type import Axis + +from .comp import dump as comp_dump +from .comp import to_system_data as comp_to_system_data + + +def _pad_to(sys_data, target_natoms, dtypes): + """Pad system data dict so that NATOMS 
dimension becomes target_natoms. + + Virtual atoms get real_atom_types = -1, and all other per-atom data is + padded with zeros. + + Parameters + ---------- + sys_data : dict + System data dict, already in mixed-type format. + target_natoms : int + Target number of atoms after padding. + dtypes : tuple[DataType, ...] + Registered data types to iterate for generic per-atom padding. + """ + natoms = sys_data["atom_types"].shape[0] + npad = target_natoms - natoms + if npad <= 0: + return + nframes = sys_data["coords"].shape[0] + + # Pad atom_types (all MIXED_TOKEN = 0) + sys_data["atom_types"] = np.concatenate( + [sys_data["atom_types"], np.zeros(npad, dtype=int)] + ) + sys_data["atom_numbs"] = [target_natoms] + + # Pad real_atom_types with -1 (virtual atom sentinel) + sys_data["real_atom_types"] = np.concatenate( + [ + sys_data["real_atom_types"], + -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), + ], + axis=1, + ) + + # Pad coords and all other per-atom data generically + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "cells", + "real_atom_names", + "real_atom_types", + "nopbc", + } + for dtype in dtypes: + if dtype.name in reserved: + continue + if dtype.name not in sys_data: + continue + if not ( + len(dtype.shape) >= 2 + and dtype.shape[0] == Axis.NFRAMES + and Axis.NATOMS in dtype.shape + ): + continue + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + arr = sys_data[dtype.name] + pad_width = [(0, 0)] * len(arr.shape) + pad_width[axis_natoms] = (0, npad) + sys_data[dtype.name] = np.pad( + arr, pad_width, mode="constant", constant_values=0 + ) + + +def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): + """Strip virtual atoms (type -1) from a group of frames. + + Parameters + ---------- + atom_types_row : np.ndarray + 1-D array of atom type indices for the group (same for all frames). + coords : np.ndarray + Coordinates array, shape (nframes, natoms_padded, 3). 
+ extra_data : dict + Dict of {name: array} for this group, arrays already frame-sliced. + dtypes : tuple[DataType, ...] + Registered data types. + + Returns + ------- + atom_types : np.ndarray + Atom types with virtual atoms removed. + coords : np.ndarray + Coords with virtual atoms removed. + extra_data : dict + Extra data with virtual atoms removed. + """ + real_mask = atom_types_row >= 0 + if real_mask.all(): + return atom_types_row, coords, extra_data + + atom_types = atom_types_row[real_mask] + coords = coords[:, real_mask, :] + + stripped = {} + for name, arr in extra_data.items(): + for dtype in dtypes: + if dtype.name == name and Axis.NATOMS in dtype.shape: + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + idx = [slice(None)] * len(arr.shape) + idx[axis_natoms] = real_mask + arr = arr[tuple(idx)] + break + stripped[name] = arr + + return atom_types, coords, stripped + + +def to_system_data(folder, type_map=None, labels=True): + data = comp_to_system_data(folder, type_map, labels) + # data is empty + old_type_map = data["atom_names"].copy() + if type_map is not None: + assert isinstance(type_map, list) + missing_type = [i for i in old_type_map if i not in type_map] + assert not missing_type, ( + f"These types are missing in selected type_map: {missing_type} !" 
+ ) + index_map = np.array([type_map.index(i) for i in old_type_map]) + data["atom_names"] = type_map.copy() + else: + index_map = None + all_real_atom_types_concat = data.pop("real_atom_types").astype(int) + if index_map is not None: + # Preserve -1 (virtual atom sentinel) during remapping + valid = all_real_atom_types_concat >= 0 + remapped = np.full_like(all_real_atom_types_concat, -1) + remapped[valid] = index_map[all_real_atom_types_concat[valid]] + all_real_atom_types_concat = remapped + all_cells_concat = data["cells"] + all_coords_concat = data["coords"] + + # handle custom registered data types + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "real_atom_names", + "real_atom_types", + "cells", + "coords", + "orig", + "nopbc", + } + extra_data = {} + for dtype in dtypes: + name = dtype.name + if name in reserved: + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + continue + if name in data: + extra_data[name] = data.pop(name) + + data_list = [] + while True: + if all_real_atom_types_concat.size == 0: + break + # temp_formula = formula(data['atom_names'], temp_atom_numbs) + temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) + ] + rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) + ] + + # Extract data for this group + group_atom_types = all_real_atom_types_concat[0] + group_coords = all_coords_concat[temp_idx] + group_extra = {} + for name in extra_data: + group_extra[name] = extra_data[name][temp_idx] + extra_data[name] = extra_data[name][rest_idx] + + # Strip virtual atoms (type -1) introduced by padding + group_atom_types, group_coords, group_extra = _strip_virtual_atoms( + group_atom_types, group_coords, group_extra, dtypes + ) + + 
def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True):
    """Dump a system in deepmd/npy/mixed format.

    If ``data`` has not been converted to the mixed type yet (no
    ``real_atom_types`` key), it is converted first on a deep copy so the
    caller's dict stays untouched.  Writing is delegated to the deepmd/comp
    dumper with ``real_atom_names`` substituted for the atom names.
    """
    if "real_atom_types" not in data:
        from dpdata import LabeledSystem, System

        # work on a deep copy so the original content is not changed
        data = copy.deepcopy(data)

        wrapper_cls = LabeledSystem if "energies" in data else System
        wrapper = wrapper_cls(data=data)
        # convert_to_mixed_type() is expected to update the wrapped dict
        # in place, adding the real_atom_* entries used below
        wrapper.convert_to_mixed_type()

    out = data.copy()
    out["atom_names"] = out.pop("real_atom_names")
    comp_dump(folder, out, set_size, comp_prec, remove_sets)
def load_type(folder, type_map=None):
    """Load atom-type information from a deepmd/raw folder.

    Reads ``type.raw`` and resolves atom names from ``type_map.raw`` when it
    exists, otherwise from the ``type_map`` argument, otherwise by generating
    artificial ``Type_i`` names.

    Parameters
    ----------
    folder : str
        folder containing ``type.raw`` (and optionally ``type_map.raw``)
    type_map : list of str, optional
        fallback mapping from type index to atom name

    Returns
    -------
    dict
        data dict with ``atom_types``, ``atom_names`` and ``atom_numbs``
    """
    atom_types = np.loadtxt(os.path.join(folder, "type.raw"), ndmin=1).astype(int)
    ntypes = np.max(atom_types) + 1

    type_map_file = os.path.join(folder, "type_map.raw")
    if os.path.isfile(type_map_file):
        # a type_map.raw shipped with the data wins over the argument
        with open_file(type_map_file) as fp:
            names = fp.read().split()
    elif type_map is not None:
        names = type_map
    else:
        # last resort: make artificial atom names
        names = ["Type_%d" % ii for ii in range(ntypes)]  # noqa: UP031

    return {
        "atom_types": atom_types,
        "atom_names": names,
        "atom_numbs": [np.count_nonzero(atom_types == ii) for ii in range(len(names))],
    }
def dump(folder, data):
    """Dump System or LabeledSystem data to a deepmd/raw folder.

    Parameters
    ----------
    folder : str
        output directory; created if it does not exist
    data : dict
        System or LabeledSystem data
    """
    os.makedirs(folder, exist_ok=True)
    nframes = data["cells"].shape[0]
    np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d")
    np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s")
    # BondOrder System
    if "bonds" in data:
        np.savetxt(
            os.path.join(folder, "bonds.raw"),
            data["bonds"],
            header="begin_atom, end_atom, bond_order",
        )
    if "formal_charges" in data:
        np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"])
    # remove any stale nopbc marker before possibly re-creating it below
    try:
        os.remove(os.path.join(folder, "nopbc"))
    except OSError:
        pass
    if data.get("nopbc", False):
        # an empty "nopbc" file marks a non-periodic system
        with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc:
            pass
    # allow custom dtypes
    # presence of "energies" marks a LabeledSystem; pick the matching DTYPES
    labels = "energies" in data
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "real_atom_types",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if dtype.name not in data:
            continue
        # only per-frame data (leading NFRAMES axis) can be written as .raw
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/raw format."
            )
            continue
        # flatten per-frame data to 2-D so savetxt writes one frame per row
        ddata = np.reshape(data[dtype.name], [nframes, -1])
        np.savetxt(os.path.join(folder, f"{dtype.deepmd_name}.raw"), ddata)
def read_dftb_plus(
    fn_1: FileType, fn_2: FileType
) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]:
    """Read from DFTB+ input and output.

    Parameters
    ----------
    fn_1 : FileType
        DFTB+ input file name (GenFormat geometry)
    fn_2 : FileType
        DFTB+ output file name

    Returns
    -------
    np.ndarray
        atomic symbols
    np.ndarray
        atomic coordinates
    float
        total potential energy
    np.ndarray
        atomic forces
    """
    coord = None
    symbols = None
    forces = None
    energy = None
    # number of atoms, parsed from the GenFormat header line; previously the
    # parser hard-coded exactly 4 atoms (flag states 3-6 / 8-11)
    natoms = 0
    with open_file(fn_1) as f:
        flag = 0
        for line in f:
            if flag == 1:
                # GenFormat header: "<natoms> <format letter>"
                natoms = int(line.split()[0])
                flag += 1
            elif flag == 2:
                # element symbols, referenced 1-based by the atom lines
                components = line.split()
                flag += 1
            elif line.startswith("Geometry"):
                flag = 1
                coord = []
                symbols = []
            elif 3 <= flag < 3 + natoms:
                # atom line: "<index> <type> <x> <y> <z>"
                s = line.split()
                components_num = int(s[1])
                symbols.append(components[components_num - 1])
                coord.append([float(s[2]), float(s[3]), float(s[4])])
                flag += 1
                if flag == 3 + natoms:
                    flag = 0
    with open_file(fn_2) as f:
        flag = 0
        for line in f:
            if line.startswith("Total Forces"):
                flag = 8
                forces = []
            elif 8 <= flag < 8 + natoms:
                s = line.split()
                forces.append([float(s[1]), float(s[2]), float(s[3])])
                flag += 1
                if flag == 8 + natoms:
                    flag = 0
            elif line.startswith("Total energy:"):
                s = line.split()
                energy = float(s[2])
                flag = 0

    symbols = np.array(symbols)
    forces = np.array(forces)
    coord = np.array(coord)
    # geometry and forces must describe the same number of atoms
    assert coord.shape == forces.shape

    return symbols, coord, energy, forces
def get_fhi_aims_block(fp):
    """Collect the lines of one FHI-aims output block.

    Reads from ``fp`` until a falsy line, a line announcing the next SCF
    re-initialization, or exhaustion of the stream.  The marker line itself is
    kept as the last entry; trailing newlines are stripped.
    """
    lines = []
    for raw in fp:
        if not raw:
            # falsy line: end of readable input
            break
        lines.append(raw.rstrip("\n"))
        if "Begin self-consistency loop: Re-initialization" in raw:
            break
    return lines
def analyze_block(lines, first_blk=False, md=True):
    """Extract geometry, energy, forces and convergence from one output block.

    Parameters
    ----------
    lines : list of str
        lines of a single FHI-aims output block
    first_blk : bool, default False
        whether this is the first block of the file, whose geometry is
        printed in a different format
    md : bool, default True
        whether the output comes from an MD run; only consulted for the
        first block

    Returns
    -------
    tuple
        ``(coord, cell, energy, force, virial, is_converge)``.  ``cell`` is
        always an empty list and ``virial`` always ``None`` here (the caller
        falls back to the header cell); ``is_converge`` is False when the SCF
        did not converge or no energy was found.
    """
    coord = []
    cell = []
    energy = None
    force = []
    virial = None

    contents = "\n".join(lines)
    # BUGFIX: the pattern was previously applied to the *list* `lines`
    # instead of the joined string, so the lookup always raised and natom was
    # always 0.  A non-greedy prefix captures the full number rather than its
    # last digit.  `natom` is informational only; it is not used below.
    try:
        natom = int(re.findall(r"Number of atoms.*?([0-9]+)", contents)[0])
    except Exception:
        natom = 0

    if first_blk:
        if md:
            _tmp = re.findall(pos_patt_other, contents)[:]
            # the geometry is printed twice in the first MD block; keep the
            # second half
            for ii in _tmp[slice(int(len(_tmp) / 2), len(_tmp))]:
                coord.append([float(kk) for kk in ii[:-1]])
        else:
            _tmp = re.findall(pos_patt_first, contents)
            for ii in _tmp:
                coord.append([float(kk) for kk in ii[1:]])
    else:
        _tmp = re.findall(pos_patt_other, contents)
        for ii in _tmp:
            coord.append([float(kk) for kk in ii[:-1]])

    _tmp = re.findall(force_patt, contents)
    for ii in _tmp:
        force.append([float(kk) for kk in ii])

    is_converge = "Self-consistency cycle converged" in contents

    try:
        _eng_patt = re.compile(eng_patt)
        energy = float(_eng_patt.search(contents).group().split()[-2])
    except Exception:
        energy = None

    # a block without an energy is treated as unconverged
    if not energy:
        is_converge = False

    if energy:
        assert (force is not None) and len(coord) > 0

    return coord, cell, energy, force, virial, is_converge
def create_full_hessian(hessian_raw: list | np.ndarray, natoms: int) -> np.ndarray:
    """Rebuild the full symmetric Hessian from its packed lower triangle.

    Parameters
    ----------
    hessian_raw : list | np.ndarray
        lower-triangular elements (diagonal included) in row-major order
    natoms : int
        number of atoms; the full matrix has shape (3*natoms, 3*natoms)

    Returns
    -------
    np.ndarray
        full, symmetric Hessian matrix

    Raises
    ------
    ValueError
        if the number of packed elements does not match the lower triangle
        of a (3*natoms, 3*natoms) matrix
    """
    packed = np.array(hessian_raw)
    dim = 3 * natoms

    # a lower triangle of an n x n matrix holds n*(n+1)/2 elements
    expected_length = dim * (dim + 1) // 2
    if packed.size != expected_length:
        raise ValueError(
            f"Input length {packed.size} != expected {expected_length}"
        )

    # writing each packed element to both (i, j) and (j, i) symmetrizes the
    # matrix in a single pass
    rows, cols = np.tril_indices(dim)
    full = np.zeros((dim, dim), dtype=packed.dtype)
    full[rows, cols] = packed
    full[cols, rows] = packed
    return full
def to_system_data(file_name: FileType, has_forces=True, has_hessian=True):
    """Read Gaussian fchk file.

    Parameters
    ----------
    file_name : str
        file name
    has_forces : bool, default True
        whether to read force
        Note: Cartesian Gradient in fchk file is converted to forces by taking negative sign
    has_hessian : bool, default True
        whether to read hessian

    Returns
    -------
    data : dict
        system data, including hessian if has_hessian is True
    """
    data = {}
    natoms = 0
    atom_numbers = []
    coords_t = []
    energy_t = []
    forces_t = []
    hessian_t = []
    # Read fchk file
    # fchk array headers carry the element count as the last token; the data
    # follows on subsequent lines which are consumed with next(fp)
    with open_file(file_name) as fp:
        for line in fp:
            # open_file may yield bytes (e.g. compressed input); normalize
            if isinstance(line, bytes):
                line = line.decode(errors="ignore")
            if "Number of atoms" in line:
                natoms = int(line.split()[-1])
            elif "Atomic numbers" in line and "I" in line:
                n = int(line.split()[-1])
                atom_numbers = []
                while len(atom_numbers) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    atom_numbers += [int(x) for x in next_line.split()]
            elif "Current cartesian coordinates" in line and "R" in line:
                n = int(line.split()[-1])
                coords_raw = []
                while len(coords_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    coords_raw += [float(x) for x in next_line.split()]
                # bohr -> angstrom
                coords = np.array(coords_raw).reshape(-1, 3) * length_convert
                coords_t.append(coords)
            elif "Total Energy" in line:
                # hartree -> eV
                energy = float(line.split()[-1]) * energy_convert
                energy_t.append(energy)
            elif "Cartesian Gradient" in line:
                n = int(line.split()[-1])
                forces_raw = []
                while len(forces_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    forces_raw += [float(x) for x in next_line.split()]
                # Cartesian Gradient is the negative of forces: F = -∇E
                forces = -np.array(forces_raw).reshape(-1, 3) * force_convert
                forces_t.append(forces)
            elif "Cartesian Force Constants" in line and "R" in line:
                n = int(line.split()[-1])
                hessian_raw = []
                while len(hessian_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    hessian_raw += [float(x) for x in next_line.split()]
                hessian_full = (
                    create_full_hessian(hessian_raw, natoms) * hessian_convert
                )
                # store as (natoms, 3, natoms, 3) to align with registered shape
                hessian_t.append(hessian_full.reshape(natoms, 3, natoms, 3))
    # Assert key data
    assert coords_t, "cannot find coords"
    assert energy_t, "cannot find energy"
    if has_forces:
        assert forces_t, "cannot find forces"
    if has_hessian:
        assert hessian_t, "cannot find hessian"
    # Assemble data
    atom_symbols = [ELEMENTS[z - 1] for z in atom_numbers]
    atom_names, atom_types, atom_numbs = np.unique(
        atom_symbols, return_inverse=True, return_counts=True
    )
    data["atom_names"] = list(atom_names)
    data["atom_numbs"] = list(atom_numbs)
    data["atom_types"] = atom_types
    data["coords"] = np.array(coords_t).reshape(-1, natoms, 3)
    data["orig"] = np.zeros(3)
    # NOTE(review): only a single dummy cell frame is written even when
    # several coordinate frames were read — confirm downstream broadcasting
    data["cells"] = np.array([np.eye(3) * 100])
    data["nopbc"] = True
    if energy_t:
        data["energies"] = np.array(energy_t)
    if has_forces and forces_t:
        data["forces"] = np.array(forces_t)
    if has_hessian and hessian_t:
        data["hessian"] = np.array(hessian_t)
    return data
def detect_multiplicity(symbols: np.ndarray) -> int:
    """Find the minimal spin multiplicity of the given molecule.

    Parameters
    ----------
    symbols : np.ndarray
        element symbols; virtual elements are not supported

    Returns
    -------
    int
        spin multiplicity
    """
    # currently only support charge=0
    # an O2 molecule is a triplet
    is_oxygen_molecule = symbols.size == 2 and np.count_nonzero(symbols == ["O"]) == 2
    if is_oxygen_molecule:
        return 3
    # total electron count, assuming electrons pair up as much as possible
    total_electrons = sum(Element(s).Z for s in symbols)
    return 1 if total_electrons % 2 == 0 else 2
If not set, fallback to normal keywords + nproc : int, default=1 + Number of CPUs to use + + Returns + ------- + str + gjf output string + """ + coordinates = sys_data["coords"][0] + atom_names = sys_data["atom_names"] + atom_numbs = sys_data["atom_numbs"] + atom_types = sys_data["atom_types"] + # get atom symbols list + symbols = [atom_names[atom_type] for atom_type in atom_types] + + # assume default charge is zero and default spin multiplicity is 1 + if "charge" in sys_data.keys(): + charge = sys_data["charge"] + + use_fragment_guesses = False + if isinstance(multiplicity, int): + mult_auto = False + elif multiplicity == "auto": + mult_auto = True + else: + raise RuntimeError('The keyword "multiplicity" is illegal.') + + if fragment_guesses: + # Initial guess generated from fragment guesses + # New feature of Gaussian 16 + use_fragment_guesses = True + if not mult_auto: + warnings.warn("Automatically set multiplicity to auto!") + mult_auto = True + + if mult_auto: + frag_numb, frag_index = _crd2frag(symbols, coordinates) + if frag_numb == 1: + use_fragment_guesses = False + mult_frags = [] + for i in range(frag_numb): + idx = frag_index == i + mult_frags.append(detect_multiplicity(np.array(symbols)[idx])) + if use_fragment_guesses: + multiplicity = sum(mult_frags) - frag_numb + 1 - charge % 2 + chargekeywords_frag = "%d %d" % (charge, multiplicity) + "".join( # noqa: UP031 + [" %d %d" % (charge, mult_frag) for mult_frag in mult_frags] # noqa: UP031 + ) + else: + multi_frags = np.array(mult_frags) + multiplicity = ( + 1 + + np.count_nonzero(multi_frags == 2) % 2 + + np.count_nonzero(multi_frags == 3) * 2 + - charge % 2 + ) + + if ( + keywords_high_multiplicity is not None + and np.count_nonzero(multi_frags == 2) >= 2 + ): + # at least 2 radicals + keywords = keywords_high_multiplicity + + if isinstance(keywords, str): + keywords = [keywords] + else: + keywords = keywords.copy() + + buff = [] + # keywords, e.g., force b3lyp/6-31g** + if use_fragment_guesses: + 
keywords[0] = f"{keywords[0]} guess=fragment={frag_numb}" + + chkkeywords = [] + if len(keywords) > 1: + chkkeywords.append(f"%chk={str(uuid.uuid1())}.chk") + + nprockeywords = f"%nproc={nproc:d}" + # use formula as title + titlekeywords = "".join( + [f"{symbol}{numb}" for symbol, numb in zip(atom_names, atom_numbs)] + ) + chargekeywords = f"{charge} {multiplicity}" + + buff = [ + *chkkeywords, + nprockeywords, + f"#{keywords[0]}", + "", + titlekeywords, + "", + (chargekeywords_frag if use_fragment_guesses else chargekeywords), + ] + + for ii, (symbol, coordinate) in enumerate(zip(symbols, coordinates)): + if use_fragment_guesses: + buff.append( + "%s(Fragment=%d) %f %f %f" % (symbol, frag_index[ii] + 1, *coordinate) # noqa: UP031 + ) + else: + buff.append("{} {:f} {:f} {:f}".format(symbol, *coordinate)) # noqa: UP031 + if not sys_data.get("nopbc", False): + # PBC condition + cell = sys_data["cells"][0] + for ii in range(3): + # use TV as atomic symbol, see https://gaussian.com/pbc/ + buff.append("TV {:f} {:f} {:f}".format(*cell[ii])) + if basis_set is not None: + # custom basis set + buff.extend(["", basis_set, ""]) + for kw in itertools.islice(keywords, 1, None): + buff.extend( + [ + "\n--link1--", + *chkkeywords, + nprockeywords, + f"#{kw}", + "", + titlekeywords, + "", + chargekeywords, + "", + ] + ) + buff.append("\n") + return "\n".join(buff) + + +def read_gaussian_input(inp: str): + """Read Gaussian input. 
+ + Parameters + ---------- + inp : str + Gaussian input str + + Returns + ------- + dict + system data + """ + flag = 0 + coords = [] + elements = [] + cells = [] + for line in inp.split("\n"): + if not line.strip(): + # empty line + flag += 1 + elif flag == 0: + # keywords + if line.startswith("#"): + # setting + keywords = line.split() + elif line.startswith("%"): + pass + elif flag == 1: + # title + pass + elif flag == 2: + # multi and coords + s = line.split() + if len(s) == 2: + pass + elif len(s) == 4: + if s[0] == "TV": + cells.append(list(map(float, s[1:4]))) + else: + # element + elements.append(re.sub("\\(.*?\\)|\\{.*?}|\\[.*?]", "", s[0])) + coords.append(list(map(float, s[1:4]))) + elif flag == 3: + # end + break + atom_names, atom_types, atom_numbs = np.unique( + elements, return_inverse=True, return_counts=True + ) + if len(cells): + nopbc = False + else: + nopbc = True + cells = np.array([np.eye(3)]) * 100 + return { + "atom_names": list(atom_names), + "atom_numbs": list(atom_numbs), + "atom_types": atom_types, + "cells": np.array(cells).reshape(1, 3, 3), + "nopbc": nopbc, + "coords": np.array(coords).reshape(1, -1, 3), + "orig": np.zeros(3), + } diff --git a/dpdata/formats/gaussian/log.py b/dpdata/formats/gaussian/log.py new file mode 100644 index 000000000..a94141901 --- /dev/null +++ b/dpdata/formats/gaussian/log.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from ...periodic_table import ELEMENTS +from ...unit import EnergyConversion, ForceConversion, LengthConversion + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + +symbols = ["X"] + ELEMENTS + + +def to_system_data(file_name: FileType, md=False): + """Read Gaussian log file. 
+ + Parameters + ---------- + file_name : str + file name + md : bool, default False + whether to read multiple frames + + Returns + ------- + data : dict + system data + + Raises + ------ + RuntimeError + if the input orientation is not found + """ + data = {} + # read from log lines + flag = 0 + energy_t = [] + coords_t = [] + atom_symbols = [] + forces_t = [] + cells_t = [] + nopbc = True + coords = None + + with open_file(file_name) as fp: + for line in fp: + if line.startswith(" SCF Done"): + # energies + energy = float(line.split()[4]) + elif line.startswith( + " Center Atomic Forces (Hartrees/Bohr)" + ): + flag = 1 + forces = [] + elif line.startswith( + " Input orientation:" + ) or line.startswith(" Z-Matrix orientation:"): + flag = 5 + coords = [] + atom_symbols = [] + cells = [] + + if 1 <= flag <= 3 or 5 <= flag <= 9: + flag += 1 + elif flag == 4: + # forces + if line.startswith(" -------"): + if coords is None: + raise RuntimeError( + "Input orientation is not found. Using Gaussian keyword " + "`Geom=PrintInputOrient` to always print the input orientation. " + "See https://gaussian.com/geom/ for more details." 
+ ) + forces_t.append(forces) + energy_t.append(energy) + coords_t.append(coords) + if cells: + nopbc = False + cells_t.append(cells) + else: + cells_t.append( + [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] + ) + flag = 0 + coords = None + else: + s = line.split() + if line[14:16] == "-2": + # PBC + pass + else: + forces.append( + [float(line[23:38]), float(line[38:53]), float(line[53:68])] + ) + elif flag == 10: + # atom_symbols and coords + if line.startswith(" -------"): + flag = 0 + else: + s = line.split() + if int(s[1]) == -2: + # PBC cells, see https://gaussian.com/pbc/ + cells.append([float(x) for x in s[3:6]]) + else: + coords.append([float(x) for x in s[3:6]]) + atom_symbols.append(symbols[int(s[1])]) + + assert coords_t, "cannot find coords" + assert energy_t, "cannot find energies" + assert forces_t, "cannot find forces" + + atom_names, data["atom_types"], atom_numbs = np.unique( + atom_symbols, return_inverse=True, return_counts=True + ) + data["atom_names"] = list(atom_names) + data["atom_numbs"] = list(atom_numbs) + if not md: + forces_t = forces_t[-1:] + energy_t = energy_t[-1:] + coords_t = coords_t[-1:] + cells_t = cells_t[-1:] + data["forces"] = np.array(forces_t) * force_convert + data["energies"] = np.array(energy_t) * energy_convert + data["coords"] = np.array(coords_t) + data["orig"] = np.array([0, 0, 0]) + data["cells"] = np.array(cells_t) + data["nopbc"] = nopbc + return data diff --git a/dpdata/formats/gromacs/__init__.py b/dpdata/formats/gromacs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/gromacs/gro.py b/dpdata/formats/gromacs/gro.py new file mode 100644 index 000000000..0c61544fd --- /dev/null +++ b/dpdata/formats/gromacs/gro.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from 
...unit import LengthConversion + +nm2ang = LengthConversion("nm", "angstrom").value() +ang2nm = LengthConversion("angstrom", "nm").value() +cell_idx_gmx2dp = [0, 4, 8, 1, 2, 3, 5, 6, 7] + + +def _format_atom_name(atom_name): + patt = re.compile("[a-zA-Z]*") + match = re.search(patt, atom_name) + fmt_name = match.group().capitalize() + return fmt_name + + +def _get_line(line, fmt_atom_name=True): + atom_name = line[10:15].split()[0] + if fmt_atom_name: + atom_name = _format_atom_name(atom_name) + atom_idx = int(line[15:20].split()[0]) + posis = [float(line[ii : ii + 8]) for ii in range(20, 44, 8)] + posis = np.array(posis) * nm2ang + return atom_name, atom_idx, posis + + +def _get_cell(line): + cell = np.zeros([3, 3]) + lengths = [float(ii) for ii in line.split()] + if len(lengths) >= 3: + for dd in range(3): + cell[dd][dd] = lengths[dd] + else: + raise RuntimeError("wrong box format: ", line) + if len(lengths) == 9: + cell[0][1] = lengths[3] + cell[0][2] = lengths[4] + cell[1][0] = lengths[5] + cell[1][2] = lengths[6] + cell[2][0] = lengths[7] + cell[2][1] = lengths[8] + cell = cell * nm2ang + return cell + + +def file_to_system_data(fname: FileType, format_atom_name=True, **kwargs): + system = {"coords": [], "cells": []} + with open_file(fname) as fp: + frame = 0 + while True: + flag = fp.readline() + if not flag: + break + else: + frame += 1 + names = [] + idxs = [] + posis = [] + natoms = int(fp.readline()) + for ii in range(natoms): + n, i, p = _get_line(fp.readline(), fmt_atom_name=format_atom_name) + names.append(n) + idxs.append(i) + posis.append(p) + cell = _get_cell(fp.readline()) + posis = np.array(posis) + if frame == 1: + system["orig"] = np.zeros(3) + system["atom_names"] = list(set(names)) + system["atom_numbs"] = [ + names.count(ii) for ii in system["atom_names"] + ] + system["atom_types"] = [ + system["atom_names"].index(ii) for ii in names + ] + system["atom_types"] = np.array(system["atom_types"], dtype=int) + system["coords"].append(posis) + 
system["cells"].append(cell) + system["coords"] = np.array(system["coords"]) + system["cells"] = np.array(system["cells"]) + return system + + +def from_system_data(system, f_idx=0, **kwargs): + resname = kwargs.get("resname", "MOL") + shift = kwargs.get("shift", 0) + ret = "" + ret += " molecule" + "\n" + n_atoms = sum(system["atom_numbs"]) + ret += " " + str(n_atoms) + "\n" + for i in range(n_atoms): + atom_type = system["atom_types"][i] + atom_name = system["atom_names"][atom_type] + coords = system["coords"][f_idx] * ang2nm + ret += "{:>5d}{:<5s}{:>5s}{:5d}{:8.3f}{:8.3f}{:8.3f}\n".format( + 1, resname, atom_name, i + shift + 1, *tuple(coords[i]) + ) + cell = (system["cells"][f_idx].flatten() * ang2nm)[cell_idx_gmx2dp] + ret += " " + " ".join([f"{x:.3f}" for x in cell]) + + return ret diff --git a/dpdata/formats/lammps/__init__.py b/dpdata/formats/lammps/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/lammps/dump.py b/dpdata/formats/lammps/dump.py new file mode 100644 index 000000000..89e75e4de --- /dev/null +++ b/dpdata/formats/lammps/dump.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +lib_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(lib_path) +import warnings + +import lmp + + +class UnwrapWarning(UserWarning): + pass + + +warnings.simplefilter("once", UnwrapWarning) + + +def _get_block(lines, key): + for idx in range(len(lines)): + if ("ITEM: " + key) in lines[idx]: + break + idx_s = idx + 1 + for idx in range(idx_s, len(lines)): + if ("ITEM: ") in lines[idx]: + break + idx_e = idx + if idx_e == len(lines) - 1: + idx_e += 1 + return lines[idx_s:idx_e], lines[idx_s - 1] + + +def get_atype(lines, type_idx_zero=False): + blk, head = _get_block(lines, "ATOMS") + keys = head.split() + id_idx = 
keys.index("id") - 2 + tidx = keys.index("type") - 2 + atype = [] + for ii in blk: + atype.append([int(ii.split()[id_idx]), int(ii.split()[tidx])]) + atype.sort() + atype = np.array(atype, dtype=int) + if type_idx_zero: + return atype[:, 1] - 1 + else: + return atype[:, 1] + + +def get_natoms(lines): + blk, head = _get_block(lines, "NUMBER OF ATOMS") + return int(blk[0]) + + +def get_natomtypes(lines): + atype = get_atype(lines) + return max(atype) + + +def get_natoms_vec(lines): + atype = get_atype(lines) + natoms_vec = [] + natomtypes = get_natomtypes(lines) + for ii in range(natomtypes): + natoms_vec.append(sum(atype == ii + 1)) + assert sum(natoms_vec) == get_natoms(lines) + return natoms_vec + + +def get_coordtype_and_scalefactor(keys): + # 4 types in total,with different scaling factor + key_pc = ["x", "y", "z"] # plain cartesian, sf = 1 + key_uc = ["xu", "yu", "zu"] # unwraped cartesian, sf = 1 + key_s = ["xs", "ys", "zs"] # scaled by lattice parameter, sf = lattice parameter + key_su = ["xsu", "ysu", "zsu"] # scaled and unfolded,sf = lattice parameter + lmp_coor_type = [key_pc, key_uc, key_s, key_su] + sf = [0, 0, 1, 1] + uw = [0, 1, 0, 1] # unwraped or not + for k in range(4): + if all(i in keys for i in lmp_coor_type[k]): + return lmp_coor_type[k], sf[k], uw[k] + + +def safe_get_posi(lines, cell, orig=np.zeros(3), unwrap=False): + blk, head = _get_block(lines, "ATOMS") + keys = head.split() + coord_tp_and_sf = get_coordtype_and_scalefactor(keys) + assert coord_tp_and_sf is not None, "Dump file does not contain atomic coordinates!" 
+ coordtype, sf, uw = coord_tp_and_sf + id_idx = keys.index("id") - 2 + xidx = keys.index(coordtype[0]) - 2 + yidx = keys.index(coordtype[1]) - 2 + zidx = keys.index(coordtype[2]) - 2 + posis = [] + for ii in blk: + words = ii.split() + posis.append( + [ + float(words[id_idx]), + float(words[xidx]), + float(words[yidx]), + float(words[zidx]), + ] + ) + posis.sort() + posis = np.array(posis)[:, 1:4] + if not sf: + posis = (posis - orig) @ np.linalg.inv( + cell + ) # Convert to scaled coordinates for unscaled coordinates + if uw and unwrap: + return ( + posis @ cell + ) # convert scaled coordinates back to Cartesien coordinates unwrap at the periodic boundaries + else: + if uw and not unwrap: + warnings.warn( + message="Your dump file contains unwrapped coordinates, but you did not specify unwrapping (unwrap = True). The default is wrapping at periodic boundaries (unwrap = False).\n", + category=UnwrapWarning, + ) + return ( + (posis % 1) @ cell + ) # Convert scaled coordinates back to Cartesien coordinates with wraping at periodic boundary conditions + + +def get_dumpbox(lines): + blk, h = _get_block(lines, "BOX BOUNDS") + bounds = np.zeros([3, 2]) + tilt = np.zeros([3]) + load_tilt = "xy xz yz" in h + for dd in range(3): + info = [float(jj) for jj in blk[dd].split()] + bounds[dd][0] = info[0] + bounds[dd][1] = info[1] + if load_tilt: + tilt[dd] = info[2] + return bounds, tilt + + +def dumpbox2box(bounds, tilt): + xy = tilt[0] + xz = tilt[1] + yz = tilt[2] + xlo = bounds[0][0] - min(0.0, xy, xz, xy + xz) + xhi = bounds[0][1] - max(0.0, xy, xz, xy + xz) + ylo = bounds[1][0] - min(0.0, yz) + yhi = bounds[1][1] - max(0.0, yz) + zlo = bounds[2][0] + zhi = bounds[2][1] + info = [[xlo, xhi], [ylo, yhi], [zlo, zhi]] + return lmp.lmpbox2box(info, tilt) + + +def box2dumpbox(orig, box): + lohi, tilt = lmp.box2lmpbox(orig, box) + xy = tilt[0] + xz = tilt[1] + yz = tilt[2] + bounds = np.zeros([3, 2]) + bounds[0][0] = lohi[0][0] + min(0.0, xy, xz, xy + xz) + bounds[0][1] = 
lohi[0][1] + max(0.0, xy, xz, xy + xz) + bounds[1][0] = lohi[1][0] + min(0.0, yz) + bounds[1][1] = lohi[1][1] + max(0.0, yz) + bounds[2][0] = lohi[2][0] + bounds[2][1] = lohi[2][1] + return bounds, tilt + + +def load_file(fname: FileType, begin=0, step=1): + lines = [] + buff = [] + cc = -1 + with open_file(fname) as fp: + while True: + line = fp.readline().rstrip("\n") + if not line: + if cc >= begin and (cc - begin) % step == 0: + lines += buff + buff = [] + cc += 1 + return lines + if "ITEM: TIMESTEP" in line: + if cc >= begin and (cc - begin) % step == 0: + lines += buff + buff = [] + cc += 1 + if cc >= begin and (cc - begin) % step == 0: + buff.append(line) + + +def get_spin_keys(inputfile): + """ + Read input file and get the keys for spin info in dump. + + Parameters + ---------- + inputfile : str + Path to the input file. + + Returns + ------- + list or None + List of spin info keys if found, None otherwise. + """ + if inputfile is None: + return None + + if not os.path.isfile(inputfile): + warnings.warn(f"Input file {inputfile} not found.") + return None + + with open(inputfile) as f: + for line in f.readlines(): + ls = line.split() + if ( + len(ls) > 7 + and ls[0] == "compute" + and all(key in ls for key in ["sp", "spx", "spy", "spz"]) + ): + compute_name = ls[1] + return [ + f"c_{compute_name}[{ls.index(key) - 3}]" + for key in ["sp", "spx", "spy", "spz"] + ] + + return None + + +def get_spin(lines, spin_keys): + """ + Get the spin info from the dump file. + + Parameters + ---------- + lines : list + The content of the dump file. + spin_keys : list + The keys for spin info in dump file. 
+ the spin info is stored in sp, spx, spy, spz or spin_keys, which is the spin norm and the spin vector + 1 1 0.00141160 5.64868599 0.01005602 1.54706291 0.00000000 0.00000000 1.00000000 -1.40772100 -2.03739417 -1522.64797384 -0.00397809 -0.00190426 -0.00743976 + """ + blk, head = _get_block(lines, "ATOMS") + heads = head.split() + + if spin_keys is not None and all(i in heads for i in spin_keys): + key = spin_keys + else: + return None + + try: + idx_id = heads.index("id") - 2 + idx_sp, idx_spx, idx_spy, idx_spz = (heads.index(k) - 2 for k in key) + + norm = [] + vec = [] + atom_ids = [] + for line in blk: + words = line.split() + norm.append([float(words[idx_sp])]) + vec.append( + [float(words[idx_spx]), float(words[idx_spy]), float(words[idx_spz])] + ) + atom_ids.append(int(words[idx_id])) + + spin = np.array(norm) * np.array(vec) + atom_ids, spin = zip(*sorted(zip(atom_ids, spin))) + return np.array(spin) + except (ValueError, IndexError) as e: + warnings.warn(f"Error processing spin data: {str(e)}") + return None + + +def system_data( + lines, type_map=None, type_idx_zero=True, unwrap=False, input_file=None +): + array_lines = split_traj(lines) + lines = array_lines[0] + system = {} + system["atom_numbs"] = get_natoms_vec(lines) + system["atom_names"] = [] + if type_map is None: + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append("TYPE_%d" % ii) # noqa: UP031 + else: + assert len(type_map) >= len(system["atom_numbs"]) + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append(type_map[ii]) + bounds, tilt = get_dumpbox(lines) + orig, cell = dumpbox2box(bounds, tilt) + system["orig"] = np.array(orig) - np.array(orig) + system["cells"] = [np.array(cell)] + system["atom_types"] = get_atype(lines, type_idx_zero=type_idx_zero) + system["coords"] = [safe_get_posi(lines, cell, np.array(orig), unwrap)] + spin_keys = get_spin_keys(input_file) + spin = get_spin(lines, spin_keys) + has_spin = False + if spin is not None: + 
system["spins"] = [spin] + has_spin = True + for ii in range(1, len(array_lines)): + bounds, tilt = get_dumpbox(array_lines[ii]) + orig, cell = dumpbox2box(bounds, tilt) + system["cells"].append(cell) + atype = get_atype(array_lines[ii], type_idx_zero=type_idx_zero) + # map atom type; a[as[a][as[as[b]]]] = b[as[b][as^{-1}[b]]] = b[id] + idx = np.argsort(atype, kind="stable")[ + np.argsort(np.argsort(system["atom_types"], kind="stable"), kind="stable") + ] + system["coords"].append( + safe_get_posi(array_lines[ii], cell, np.array(orig), unwrap)[idx] + ) + if has_spin: + spin = get_spin(array_lines[ii], spin_keys) + if spin is not None: + system["spins"].append(spin[idx]) + else: + warnings.warn( + f"Warning: spin info is not found in frame {ii}, remove spin info." + ) + system.pop("spins") + has_spin = False + if has_spin: + system["spins"] = np.array(system["spins"]) + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + return system + + +def split_traj(dump_lines): + marks = [] + for idx, ii in enumerate(dump_lines): + if "ITEM: TIMESTEP" in ii: + marks.append(idx) + if len(marks) == 0: + return None + elif len(marks) == 1: + return [dump_lines] + else: + block_size = marks[1] - marks[0] + ret = [] + for ii in marks: + ret.append(dump_lines[ii : ii + block_size]) + # for ii in range(len(marks)-1): + # assert(marks[ii+1] - marks[ii] == block_size) + return ret + return None + + +def from_system_data(system, f_idx=0, timestep=0): + """Convert system data to LAMMPS dump format string. + + Parameters + ---------- + system : dict + System data dictionary containing atoms, coordinates, cell, etc. 
+ f_idx : int, optional + Frame index to dump (default: 0) + timestep : int, optional + Timestep number for the dump (default: 0) + + Returns + ------- + str + LAMMPS dump format string + """ + ret = "" + + # Get basic system info + natoms = sum(system["atom_numbs"]) + coords = system["coords"][f_idx] + cell = system["cells"][f_idx] + atom_types = system["atom_types"] + orig = system.get("orig", np.zeros(3)) + + # Convert cell to dump format (bounds and tilt) + bounds, tilt = box2dumpbox(orig, cell) + + # Write timestep + ret += "ITEM: TIMESTEP\n" + ret += f"{timestep}\n" + + # Write number of atoms + ret += "ITEM: NUMBER OF ATOMS\n" + ret += f"{natoms}\n" + + # Write box bounds + ret += "ITEM: BOX BOUNDS xy xz yz pp pp pp\n" + ret += f"{bounds[0][0]:.10f} {bounds[0][1]:.10f} {tilt[0]:.10f}\n" + ret += f"{bounds[1][0]:.10f} {bounds[1][1]:.10f} {tilt[1]:.10f}\n" + ret += f"{bounds[2][0]:.10f} {bounds[2][1]:.10f} {tilt[2]:.10f}\n" + + # Write atoms header + ret += "ITEM: ATOMS id type x y z\n" + + # Write atom data + for ii in range(natoms): + atom_id = ii + 1 # LAMMPS uses 1-based indexing + atom_type = atom_types[ii] + 1 # LAMMPS uses 1-based type indexing + x, y, z = coords[ii] + ret += f"{atom_id} {atom_type} {x:.10f} {y:.10f} {z:.10f}\n" + + return ret + + +if __name__ == "__main__": + # fname = 'dump.hti' + # lines = open(fname).read().split('\n') + # # print(get_natoms(lines)) + # # print(get_natomtypes(lines)) + # # print(get_natoms_vec(lines)) + # posi = get_posi(lines) + # dbox1, tilt1 = box2dumpbox(orig, box) + # print(dbox - dbox1) + # print(tilt - tilt1) + # print(orig) + # print(box) + # np.savetxt('tmp.out', posi - orig, fmt='%.6f') + # print(system_data(lines)) + lines = load_file("conf_unfold.dump", begin=0, step=1) + al = split_traj(lines) + s = system_data(lines, ["O", "H"]) + # l = np.linalg.norm(s['cells'][1],axis=1) + # p = s['coords'][0] + l + # np.savetxt('p',p,fmt='%1.10f') diff --git a/dpdata/formats/lammps/lmp.py 
b/dpdata/formats/lammps/lmp.py new file mode 100644 index 000000000..c9d60ec53 --- /dev/null +++ b/dpdata/formats/lammps/lmp.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import numpy as np + +from dpdata.periodic_table import ELEMENTS, Element + +ptr_float_fmt = "%15.10f" +ptr_int_fmt = "%6d" +ptr_key_fmt = "%15s" + +# Mapping of LAMMPS atom styles to their column layouts +# Format: (atom_id_col, atom_type_col, x_col, y_col, z_col, has_molecule_id, has_charge, charge_col) +ATOM_STYLE_COLUMNS = { + "atomic": (0, 1, 2, 3, 4, False, False, None), + "angle": (0, 2, 3, 4, 5, True, False, None), + "bond": (0, 2, 3, 4, 5, True, False, None), + "charge": (0, 1, 3, 4, 5, False, True, 2), + "full": (0, 2, 4, 5, 6, True, True, 3), + "molecular": (0, 2, 3, 4, 5, True, False, None), + "dipole": (0, 1, 3, 4, 5, False, True, 2), + "sphere": (0, 1, 4, 5, 6, False, False, None), +} + + +def detect_atom_style(lines: list[str]) -> str | None: + """Detect LAMMPS atom style from data file content. 
+ + Parameters + ---------- + lines : list + Lines from LAMMPS data file + + Returns + ------- + str or None + Detected atom style, or None if not detected + """ + # Look for atom style in comments after "Atoms" section header + atom_lines = get_atoms(lines) + if not atom_lines: + return None + + # Find the "Atoms" line + for idx, line in enumerate(lines): + if "Atoms" in line: + # Check if there's a comment with atom style after "Atoms" + if "#" in line: + comment_part = line.split("#")[1].strip().lower() + for style in ATOM_STYLE_COLUMNS: + if style in comment_part: + return style + break + + # If no explicit style found, try to infer from first data line + if atom_lines: + first_line = atom_lines[0].split() + num_cols = len(first_line) + + # Try to match based on number of columns and content patterns + # This is a heuristic approach + if num_cols == 5: + # Could be atomic style: atom-ID atom-type x y z + return "atomic" + elif num_cols == 6: + # Could be charge or bond/molecular style + # Try to determine if column 2 (index 2) looks like a charge (float) or type (int) + try: + val = float(first_line[2]) + # If it's a small float, likely a charge + if abs(val) < 10 and val != int(val): + return "charge" + else: + # Likely molecule ID (integer), so bond/molecular style + return "bond" + except ValueError: + return "atomic" # fallback + elif num_cols == 7: + # Could be full style: atom-ID molecule-ID atom-type charge x y z + return "full" + elif num_cols >= 8: + # Could be dipole or sphere style + # For now, default to dipole if we have enough columns + return "dipole" + + return None # Unable to detect + + +def _get_block(lines, keys): + for idx in range(len(lines)): + if keys in lines[idx]: + break + if idx == len(lines) - 1: + return None + idx_s = idx + 2 + idx = idx_s + ret = [] + while True: + if idx == len(lines) or len(lines[idx].split()) == 0: + break + else: + ret.append(lines[idx]) + idx += 1 + return ret + + +def lmpbox2box(lohi, tilt): + xy = tilt[0] 
+ xz = tilt[1] + yz = tilt[2] + orig = np.array([lohi[0][0], lohi[1][0], lohi[2][0]]) + lens = [] + for dd in range(3): + lens.append(lohi[dd][1] - lohi[dd][0]) + xx = [lens[0], 0, 0] + yy = [xy, lens[1], 0] + zz = [xz, yz, lens[2]] + return orig, np.array([xx, yy, zz]) + + +def box2lmpbox(orig, box): + lohi = np.zeros([3, 2]) + for dd in range(3): + lohi[dd][0] = orig[dd] + tilt = np.zeros(3) + tilt[0] = box[1][0] + tilt[1] = box[2][0] + tilt[2] = box[2][1] + lens = np.zeros(3) + lens[0] = box[0][0] + lens[1] = box[1][1] + lens[2] = box[2][2] + for dd in range(3): + lohi[dd][1] = lohi[dd][0] + lens[dd] + return lohi, tilt + + +def get_atoms(lines): + return _get_block(lines, "Atoms") + + +def get_natoms(lines): + for ii in lines: + if "atoms" in ii: + return int(ii.split()[0]) + return None + + +def get_natomtypes(lines): + for ii in lines: + if "atom types" in ii: + return int(ii.split()[0]) + return None + + +def _atom_info_mol(line): + vec = line.split() + # idx, mole_type, atom_type, charge, x, y, z + return ( + int(vec[0]), + int(vec[1]), + int(vec[2]), + float(vec[3]), + float(vec[4]), + float(vec[5]), + float(vec[6]), + ) + + +def _atom_info_atom(line): + vec = line.split() + # idx, atom_type, x, y, z + return int(vec[0]), int(vec[1]), float(vec[2]), float(vec[3]), float(vec[4]) + + +def _atom_info_style(line: str, atom_style: str = "atomic") -> dict[str, int | float]: + """Parse atom information based on the specified atom style. + + Parameters + ---------- + line : str + The atom line from LAMMPS data file + atom_style : str + The LAMMPS atom style (atomic, full, charge, etc.) + + Returns + ------- + dict + Dictionary containing parsed atom information with keys: + 'atom_id', 'atom_type', 'x', 'y', 'z', 'molecule_id' (if present), 'charge' (if present) + """ + if atom_style not in ATOM_STYLE_COLUMNS: + raise ValueError( + f"Unsupported atom style: {atom_style}. 
Supported styles: {list(ATOM_STYLE_COLUMNS.keys())}" + ) + + vec = line.split() + columns = ATOM_STYLE_COLUMNS[atom_style] + + result = { + "atom_id": int(vec[columns[0]]), + "atom_type": int(vec[columns[1]]), + "x": float(vec[columns[2]]), + "y": float(vec[columns[3]]), + "z": float(vec[columns[4]]), + } + + # Add molecule ID if present + if columns[5]: # has_molecule_id + result["molecule_id"] = int( + vec[1] + ) # molecule ID is always in column 1 when present + + # Add charge if present + if columns[6]: # has_charge + result["charge"] = float(vec[columns[7]]) # charge_col + + return result + + +def get_natoms_vec(lines: list[str], atom_style: str = "atomic") -> list[int]: + """Get number of atoms for each atom type. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + list + Number of atoms for each atom type + """ + atype = get_atype(lines, atom_style=atom_style) + natoms_vec = [] + natomtypes = get_natomtypes(lines) + for ii in range(natomtypes): + natoms_vec.append(sum(atype == ii + 1)) + assert sum(natoms_vec) == get_natoms(lines) + return natoms_vec + + +def get_atype( + lines: list[str], type_idx_zero: bool = False, atom_style: str = "atomic" +) -> np.ndarray: + """Get atom types from LAMMPS data file. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray + Array of atom types + """ + alines = get_atoms(lines) + atype = [] + for ii in alines: + atom_info = _atom_info_style(ii, atom_style) + at = atom_info["atom_type"] + if type_idx_zero: + atype.append(at - 1) + else: + atype.append(at) + return np.array(atype, dtype=int) + + +def get_posi(lines: list[str], atom_style: str = "atomic") -> np.ndarray: + """Get atomic positions from LAMMPS data file. 
+ + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray + Array of atomic positions + """ + atom_lines = get_atoms(lines) + posis = [] + for ii in atom_lines: + atom_info = _atom_info_style(ii, atom_style) + posis.append([atom_info["x"], atom_info["y"], atom_info["z"]]) + return np.array(posis) + + +def get_charges(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: + """Get atomic charges from LAMMPS data file if the atom style supports charges. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray or None + Array of atomic charges if atom style has charges, None otherwise + """ + if atom_style not in ATOM_STYLE_COLUMNS: + raise ValueError(f"Unsupported atom style: {atom_style}") + + # Check if this atom style has charges + if not ATOM_STYLE_COLUMNS[atom_style][6]: # has_charge + return None + + atom_lines = get_atoms(lines) + charges = [] + for ii in atom_lines: + atom_info = _atom_info_style(ii, atom_style) + charges.append(atom_info["charge"]) + return np.array(charges) + + +def get_spins(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: + atom_lines = get_atoms(lines) + if len(atom_lines[0].split()) < 8: + return None + spins_ori = [] + spins_norm = [] + for ii in atom_lines: + iis = ii.split() + spins_ori.append([float(jj) for jj in iis[5:8]]) + spins_norm.append([float(iis[-1])]) + return np.array(spins_ori) * np.array(spins_norm) + + +def get_lmpbox(lines): + box_info = [] + tilt = np.zeros(3) + for ii in lines: + if "xlo" in ii and "xhi" in ii: + box_info.append([float(ii.split()[0]), float(ii.split()[1])]) + break + for ii in lines: + if "ylo" in ii and "yhi" in ii: + box_info.append([float(ii.split()[0]), float(ii.split()[1])]) + break + for ii in lines: + if "zlo" in ii and "zhi" in ii: + box_info.append([float(ii.split()[0]), 
float(ii.split()[1])]) + break + for ii in lines: + if "xy" in ii and "xz" in ii and "yz" in ii: + tilt = np.array([float(jj) for jj in ii.split()[0:3]]) + return box_info, tilt + + +def system_data( + lines: list[str], + type_map: list[str] | None = None, + type_idx_zero: bool = True, + atom_style: str = "atomic", +) -> dict: + """Parse LAMMPS data file to system data format. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_map : list, optional + Mapping from atom types to element names + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style (atomic, full, charge, etc.) + + Returns + ------- + dict + System data dictionary + """ + system = {} + system["atom_numbs"] = get_natoms_vec(lines, atom_style=atom_style) + system["atom_names"] = [] + if type_map is None: + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append("Type_%d" % ii) # noqa: UP031 + else: + assert len(type_map) >= len(system["atom_numbs"]) + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append(type_map[ii]) + lohi, tilt = get_lmpbox(lines) + orig, cell = lmpbox2box(lohi, tilt) + system["orig"] = np.array(orig) + system["cells"] = [np.array(cell)] + natoms = sum(system["atom_numbs"]) + system["atom_types"] = get_atype( + lines, type_idx_zero=type_idx_zero, atom_style=atom_style + ) + system["coords"] = [get_posi(lines, atom_style=atom_style)] + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + + # Add charges if the atom style supports them + charges = get_charges(lines, atom_style=atom_style) + if charges is not None: + system["charges"] = np.array([charges]) + + spins = get_spins(lines, atom_style=atom_style) + if spins is not None: + system["spins"] = np.array([spins]) + + return system + + +def to_system_data( + lines: list[str], + type_map: list[str] | None = None, + type_idx_zero: bool = True, + atom_style: str = 
"atomic", +) -> dict: + """Parse LAMMPS data file to system data format. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_map : list, optional + Mapping from atom types to element names + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style. If "auto", attempts to detect automatically + from file. Default is "atomic". + + Returns + ------- + dict + System data dictionary + """ + # Attempt automatic detection if requested + if atom_style == "auto": + detected_style = detect_atom_style(lines) + if detected_style: + atom_style = detected_style + else: + atom_style = "atomic" # fallback to default + + return system_data( + lines, type_map=type_map, type_idx_zero=type_idx_zero, atom_style=atom_style + ) + + +def rotate_to_lower_triangle( + cell: np.ndarray, coord: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """Rotate the cell to lower triangular and ensure the diagonal elements are non-negative. + + Args: + cell (np.ndarray): The original cell matrix. + coord (np.ndarray): The coordinates of the atoms. + + Returns + ------- + tuple[np.ndarray, np.ndarray]: The rotated cell and adjusted coordinates. + """ + q, _ = np.linalg.qr(cell.T) + cell = np.matmul(cell, q) + coord = np.matmul(coord, q) + + # Ensure the diagonal elements of the cell are non-negative + rot = np.eye(3) + if cell[0][0] < 0: + rot[0][0] = -1 + if cell[1][1] < 0: + rot[1][1] = -1 + if cell[2][2] < 0: + rot[2][2] = -1 + cell = np.matmul(cell, rot) + coord = np.matmul(coord, rot) + return cell, coord + + +def _get_lammps_masses(system) -> np.ndarray | None: + """Get masses for the LAMMPS ``Masses`` section. + + Prefer explicitly stored masses when available. Otherwise, infer masses from + ``atom_names`` when all names are valid chemical element symbols. 
+ + Parameters + ---------- + system : dict + System data dictionary + + Returns + ------- + np.ndarray or None + Per-type masses aligned with ``atom_names``. Returns ``None`` when the + masses cannot be determined safely. + + Raises + ------ + ValueError + If explicit ``system["masses"]`` is present but does not match the + length of ``atom_names``. + """ + atom_names = system["atom_names"] + masses = system.get("masses") + if masses is not None: + masses = np.asarray(masses, dtype=float) + if masses.ndim != 1 or len(masses) != len(atom_names): + raise ValueError( + 'Explicit system["masses"] must be a 1D array with the same ' + 'length as system["atom_names"] to write the LAMMPS Masses ' + "section." + ) + return masses + + if not all(name in ELEMENTS for name in atom_names): + return None + + return np.array([Element(name).mass for name in atom_names], dtype=float) + + +def from_system_data(system, f_idx=0): + ret = "" + ret += "\n" + natoms = sum(system["atom_numbs"]) + ntypes = len(system["atom_numbs"]) + cell, coord = rotate_to_lower_triangle( + system["cells"][f_idx], system["coords"][f_idx] + ) + ret += "%d atoms\n" % natoms # noqa: UP031 + ret += "%d atom types\n" % ntypes # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " xlo xhi\n") % ( + 0, + cell[0][0], + ) # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " ylo yhi\n") % ( + 0, + cell[1][1], + ) # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " zlo zhi\n") % ( + 0, + cell[2][2], + ) # noqa: UP031 + ret += ( + ptr_float_fmt + " " + ptr_float_fmt + " " + ptr_float_fmt + " xy xz yz\n" + ) % ( + cell[1][0], + cell[2][0], + cell[2][1], + ) # noqa: UP031 + ret += "\n" + + masses = _get_lammps_masses(system) + if masses is not None: + ret += "Masses\n" + ret += "\n" + mass_fmt = ptr_int_fmt + " " + ptr_float_fmt + " # %s\n" # noqa: UP031 + for ii, (mass, atom_name) in enumerate(zip(masses, system["atom_names"])): + ret += mass_fmt % (ii + 1, mass, atom_name) + ret += 
"\n" + + ret += "Atoms # atomic\n" + ret += "\n" + coord_fmt = ( + ptr_int_fmt + + " " + + ptr_int_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + "\n" + ) # noqa: UP031 + + if "spins" in system: + coord_fmt = ( + coord_fmt.strip("\n") + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + "\n" + ) # noqa: UP031 + spins_norm = np.linalg.norm(system["spins"][f_idx], axis=1) + for ii in range(natoms): + if "spins" in system: + if spins_norm[ii] != 0: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + system["spins"][f_idx][ii][0] / spins_norm[ii], + system["spins"][f_idx][ii][1] / spins_norm[ii], + system["spins"][f_idx][ii][2] / spins_norm[ii], + spins_norm[ii], + ) # noqa: UP031 + else: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + system["spins"][f_idx][ii][0], + system["spins"][f_idx][ii][1], + system["spins"][f_idx][ii][2] + 1, + spins_norm[ii], + ) # noqa: UP031 + else: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + ) # noqa: UP031 + return ret + + +if __name__ == "__main__": + fname = "water-SPCE.data" + lines = open(fname).read().split("\n") + bonds, tilt = get_lmpbox(lines) + # print(bonds, tilt) + orig, box = lmpbox2box(bonds, tilt) + # print(orig, box) + bonds1, tilt1 = box2lmpbox(orig, box) + # print(bonds1, tilt1) + print(bonds1 - bonds) + print(tilt1 - tilt) + print(box) + print(get_atype(lines)) + print(get_posi(lines)) diff --git a/dpdata/formats/lmdb/__init__.py b/dpdata/formats/lmdb/__init__.py new file mode 100644 index 000000000..53a3e8f0e --- /dev/null +++ 
b/dpdata/formats/lmdb/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .format import LMDBFormat + +__all__ = ["LMDBFormat"] diff --git a/dpdata/formats/lmdb/format.py b/dpdata/formats/lmdb/format.py new file mode 100644 index 000000000..9b518be6b --- /dev/null +++ b/dpdata/formats/lmdb/format.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import os + +import lmdb +import msgpack +import msgpack_numpy as m +import numpy as np + +from dpdata.format import Format + +m.patch() + + +class LMDBError(Exception): + """Base class for LMDB errors.""" + + +class LMDBMetadataError(LMDBError): + """Metadata not found in LMDB.""" + + +class LMDBFrameError(LMDBError): + """Frame data not found in LMDB.""" + + +class LMDBFormat(Format): + """ + Class for handling the LMDB format, which stores atomic configurations in a + Lightning Memory-Mapped Database (LMDB). + + This format is optimized for machine learning workflows where fast, random + access to a large number of frames is required. All frames from multiple + systems (with potentially different numbers of atoms) are stored in a + single LMDB database file. + + Both single systems and multiple systems are supported via the standard + ``dpdata`` APIs. 
+ + Examples + -------- + **Saving a single LabeledSystem** + + >>> import dpdata + >>> system = dpdata.LabeledSystem("path/to/input.vasp", fmt="vasp/outcar") + >>> system.to("lmdb", "my_single_system.lmdb") + + **Loading a single LabeledSystem** + + >>> loaded_system = dpdata.LabeledSystem("my_single_system.lmdb", fmt="lmdb") + + **Saving multiple systems to a single LMDB database** + + >>> import dpdata + >>> system_1 = dpdata.LabeledSystem("path/to/system1/OUTCAR", fmt="vasp/outcar") + >>> system_2 = dpdata.LabeledSystem("path/to/system2/OUTCAR", fmt="vasp/outcar") + >>> multi_systems_obj = dpdata.MultiSystems(system_1, system_2) + >>> multi_systems_obj.to("lmdb", "my_multi_system_db.lmdb") + + **Loading multiple systems from a single LMDB database** + + >>> import dpdata + >>> loaded_multi_systems = dpdata.MultiSystems.from_file("my_multi_system_db.lmdb", fmt="lmdb") + """ + + def to_multi_systems( + self, formulas, directory, map_size=1000000000, frame_idx_fmt="012d", **kwargs + ): + """Implement MultiSystems.to for LMDB format. + + Parameters + ---------- + formulas : list[str] + list of formulas + directory : str + directory of system + map_size : int, optional + Maximum size of the LMDB database in bytes. Default is 1GB. + frame_idx_fmt : str, optional + The format string used to encode the frame index as a key. Default is "012d". 
+ **kwargs : dict + other parameters + + Yields + ------ + tuple + (self, formula) to be used by to_system + """ + self._frame_idx_fmt = frame_idx_fmt + self._global_frame_idx = 0 + self._system_info = [] + os.makedirs(directory, exist_ok=True) + with lmdb.open(directory, map_size=map_size) as env: + with env.begin(write=True) as txn: + self._txn = txn + for ff in formulas: + yield (self, ff) + # Finalize metadata + metadata = { + "nframes": self._global_frame_idx, + "system_info": self._system_info, + "frame_idx_fmt": self._frame_idx_fmt, + } + txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) + self._txn = None + + def _dump_to_txn(self, data, txn, formula, dtypes): + from dpdata.data_type import Axis + + nframes = data["coords"].shape[0] + + # Identify symbolic shapes and frame-dependent keys + data_shapes = {} + frame_dependent_keys = [] + for dt in dtypes: + if dt.name in data: + if dt.shape is not None: + data_shapes[dt.name] = [ + s.value if isinstance(s, Axis) else s for s in dt.shape + ] + if Axis.NFRAMES in dt.shape: + frame_dependent_keys.append(dt.name) + else: + data_shapes[dt.name] = None + + # Record system info + # natoms needs to be extracted from data + if "atom_numbs" in data: + natoms_list = data["atom_numbs"] + else: + # Fallback for systems without atom_numbs (should not happen in valid dpdata systems) + natoms_list = [] + + self._system_info.append( + { + "formula": formula, + "natoms": natoms_list, + "nframes": nframes, + "start_idx": self._global_frame_idx, + "data_shapes": data_shapes, + "frame_dependent_keys": frame_dependent_keys, + } + ) + + for i in range(nframes): + frame_data = {} + for key, val in data.items(): + if key in frame_dependent_keys: + frame_data[key] = val[i] + else: + frame_data[key] = val + + key = f"{self._global_frame_idx:{self._frame_idx_fmt}}".encode("ascii") + value = msgpack.packb(frame_data, use_bin_type=True) + txn.put(key, value) + self._global_frame_idx += 1 + + def to_labeled_system(self, 
data, file_name, **kwargs): + """Save a single LabeledSystem to an LMDB database.""" + from dpdata.system import LabeledSystem + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, LabeledSystem.DTYPES) + else: + # Single system call: use to_multi_systems logic + # Infer formula from data if possible, or use default + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_labeled_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def to_system(self, data, file_name, **kwargs): + """Save a single System to an LMDB database.""" + from dpdata.system import System + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, System.DTYPES) + else: + # Single system call + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def from_multi_systems(self, file_name, map_size=1000000000, **kwargs): + """Load multiple systems from a single LMDB database. + + Parameters + ---------- + file_name : str + The path to the LMDB database directory. + map_size : int, optional + Maximum size of the LMDB database in bytes. 
+ **kwargs : dict + other parameters + + Yields + ------ + dict + data dictionary for each system + """ + from dpdata.data_type import Axis, DataType + from dpdata.system import LabeledSystem, System + + with lmdb.open(file_name, readonly=True) as env: + with env.begin() as txn: + metadata_packed = txn.get(b"__metadata__") + if metadata_packed is None: + raise LMDBMetadataError("LMDB database does not contain metadata.") + metadata = msgpack.unpackb(metadata_packed, raw=False) + frame_idx_fmt = metadata.get("frame_idx_fmt", "012d") + + for sys_info in metadata["system_info"]: + system_frames = [] + start_idx = sys_info["start_idx"] + nframes = sys_info["nframes"] + data_shapes = sys_info.get("data_shapes", {}) + frame_dependent_keys = sys_info.get("frame_dependent_keys", []) + + for i in range(start_idx, start_idx + nframes): + key = f"{i:{frame_idx_fmt}}".encode("ascii") + value = txn.get(key) + if value is None: + raise LMDBFrameError(f"Frame data not found for key: {key}") + frame_data = msgpack.unpackb(value, raw=False) + system_frames.append(frame_data) + + # Aggregate data for one system + first_frame = system_frames[0] + is_labeled = "energies" in first_frame + cls = LabeledSystem if is_labeled else System + + # Auto-register unknown data types + existing_dt_names = [dt.name for dt in cls.DTYPES] + new_dts = [] + axis_map = {a.value: a for a in Axis} + for key, val in first_frame.items(): + if key not in existing_dt_names and key in data_shapes: + shape_raw = data_shapes[key] + if shape_raw is not None: + shape = tuple([axis_map.get(s, s) for s in shape_raw]) + else: + shape = None + + v_arr = np.array(val) + new_dts.append( + DataType(key, type(v_arr), shape=shape, required=False) + ) + + if new_dts: + cls.register_data_type(*new_dts) + + agg_data = {} + for key, val in first_frame.items(): + if key in frame_dependent_keys: + agg_data[key] = np.array([d[key] for d in system_frames]) + else: + agg_data[key] = val + + yield agg_data + + def 
from_labeled_system(self, file_name, **kwargs): + """Load data for a single LabeledSystem from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) + + def from_system(self, file_name, **kwargs): + """Load data for a single System from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) diff --git a/dpdata/formats/openmx/__init__.py b/dpdata/formats/openmx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/openmx/omx.py b/dpdata/formats/openmx/omx.py new file mode 100644 index 000000000..16368eb2f --- /dev/null +++ b/dpdata/formats/openmx/omx.py @@ -0,0 +1,200 @@ +#!/usr/bin/python3 +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from ...unit import ( + EnergyConversion, + ForceConversion, + LengthConversion, + PressureConversion, +) + +ry2ev = EnergyConversion("rydberg", "eV").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + +import warnings +from collections import OrderedDict + + +def load_atom(lines): + atom_names = [] + atom_names_mode = False + for line in lines: + if "" in line: + atom_names_mode = False + elif atom_names_mode: + parts = line.split() + atom_names.append(parts[1]) + atom_names_original = atom_names + atom_names = list(OrderedDict.fromkeys(set(atom_names))) + atom_names = sorted( + atom_names, key=atom_names_original.index + ) # Unique ordering of atomic 
species + ntypes = len(atom_names) + atom_numbs = [0] * ntypes + atom_types = [] + atom_types_mode = False + for line in lines: + if "" in line: + atom_types_mode = False + elif atom_types_mode: + parts = line.split() + for i, atom_name in enumerate(atom_names): + if parts[1] == atom_name: + atom_numbs[i] += 1 + atom_types.append(i) + atom_types = np.array(atom_types) + return atom_names, atom_types, atom_numbs + + +def load_cells(lines): + cells = [] + for line in lines: + if "Cell_Vectors=" in line: + part = line.split("Cell_Vectors=")[1] + parts = part.split() + values = list(map(float, parts[:9])) + cell = [values[0:3], values[3:6], values[6:9]] + cells.append(cell) + # Checking SCF converged or not + for token in line.split(): + if token.startswith("scf_conv="): + scf_conv = int(token.split("=")[1]) + if scf_conv == 0: + warnings.warn("SCF not converged!", stacklevel=2) + cells = np.array(cells) + return cells + + +# load atom_names, atom_numbs, atom_types, cells +def load_param_file(fname: FileType, mdname: FileType): + with open_file(fname) as dat_file: + lines = dat_file.readlines() + atom_names, atom_types, atom_numbs = load_atom(lines) + + with open_file(mdname) as md_file: + lines = md_file.readlines() + cells = load_cells(lines) + return atom_names, atom_numbs, atom_types, cells + + +def load_coords(lines, atom_names, natoms): + cnt = 0 + coord, coords = [], [] + for line in lines: + if "time=" in line: + continue + for atom_name in atom_names: + atom_name += " " + if atom_name in line: + cnt += 1 + parts = line.split() + for_line = [float(parts[1]), float(parts[2]), float(parts[3])] + coord.append(for_line) + if cnt == natoms: + coords.append(coord) + cnt = 0 + coord = [] + coords = np.array(coords) + return coords + + +def load_data(mdname: FileType, atom_names, natoms): + with open_file(mdname) as md_file: + lines = md_file.readlines() + coords = load_coords(lines, atom_names, natoms) + steps = [str(i) for i in range(1, coords.shape[0] + 1)] + return 
coords, steps + + +def to_system_data(fname: FileType, mdname: FileType): + data = {} + ( + data["atom_names"], + data["atom_numbs"], + data["atom_types"], + data["cells"], + ) = load_param_file(fname, mdname) + data["coords"], steps = load_data( + mdname, + data["atom_names"], + np.sum(data["atom_numbs"]), + ) + data["orig"] = np.zeros(3) + return data, steps + + +def load_energy(lines): + energy = [] + for line in lines: + if "time=" in line: + parts = line.split() + ene_line = float(parts[4]) # Hartree + energy.append(ene_line) + continue + energy = energy_convert * np.array(energy) # Hartree -> eV + return energy + + +def load_force(lines, atom_names, atom_numbs): + cnt = 0 + field, fields = [], [] + for line in lines: + if "time=" in line: + continue + for atom_name in atom_names: + atom_name += " " + if atom_name in line: + cnt += 1 + parts = line.split() + for_line = [float(parts[4]), float(parts[5]), float(parts[6])] + field.append(for_line) + if cnt == np.sum(atom_numbs): + fields.append(field) + cnt = 0 + field = [] + force = force_convert * np.array(fields) + return force + + +# load energy, force +def to_system_label(fname, mdname): + atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) + with open_file(mdname) as md_file: + lines = md_file.readlines() + energy = load_energy(lines) + force = load_force(lines, atom_names, atom_numbs) + return energy, force + + +if __name__ == "__main__": + file_name = "Au111Surface" + fname = f"{file_name}.dat" + mdname = f"{file_name}.md" + atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) + coords, steps = load_data(mdname, atom_names, np.sum(atom_numbs)) + data, steps = to_system_data(fname, mdname) + energy, force = to_system_label(fname, mdname) + print(atom_names) + print(atom_numbs) + print(atom_types) +# print(cells.shape) +# print(coords.shape) +# print(len(energy)) +# print(force.shape) diff --git a/dpdata/formats/orca/__init__.py b/dpdata/formats/orca/__init__.py 
new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/orca/output.py b/dpdata/formats/orca/output.py new file mode 100644 index 000000000..a0915162b --- /dev/null +++ b/dpdata/formats/orca/output.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + + +def read_orca_sp_output( + fn: FileType, +) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]: + """Read from ORCA output. + + Note that both the energy and the gradient should be printed. + + Parameters + ---------- + fn : str + file name + + Returns + ------- + np.ndarray + atomic symbols + np.ndarray + atomic coordinates + float + total potential energy + np.ndarray + atomic forces + """ + coord = None + symbols = None + forces = None + energy = None + with open_file(fn) as f: + flag = 0 + for line in f: + if flag in (1, 3, 4): + flag += 1 + elif flag == 2: + s = line.split() + if not len(s): + flag = 0 + else: + symbols.append(s[0].capitalize()) + coord.append([float(s[1]), float(s[2]), float(s[3])]) + elif flag == 5: + s = line.split() + if not len(s): + flag = 0 + else: + forces.append([float(s[3]), float(s[4]), float(s[5])]) + elif line.startswith("CARTESIAN COORDINATES (ANGSTROEM)"): + # coord + flag = 1 + coord = [] + symbols = [] + elif line.startswith("CARTESIAN GRADIENT"): + flag = 3 + forces = [] + elif line.startswith("FINAL SINGLE POINT ENERGY"): + energy = float(line.split()[-1]) + symbols = np.array(symbols) + forces = -np.array(forces) + coord = np.array(coord) + assert coord.shape == forces.shape + + return symbols, coord, energy, forces diff --git a/dpdata/formats/psi4/__init__.py b/dpdata/formats/psi4/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/psi4/input.py b/dpdata/formats/psi4/input.py new file mode 100644 index 000000000..3959cb753 --- /dev/null +++ 
b/dpdata/formats/psi4/input.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import numpy as np + +# Angston is used in Psi4 by default +template = """molecule {{ +{atoms:s} +{charge:d} {multiplicity:d} +}} +set basis {basis:s} +set gradient_write on +G, wfn = gradient("WB97M-D3BJ", return_wfn=True) +wfn.energy() +wfn.gradient().print_out() +""" + + +def write_psi4_input( + types: np.ndarray, + coords: np.ndarray, + method: str, + basis: str, + charge: int = 0, + multiplicity: int = 1, +) -> str: + """Write Psi4 input file. + + Parameters + ---------- + types : np.ndarray + atomic symbols + coords : np.ndarray + atomic coordinates + method : str + computational method + basis : str + basis set; see https://psicode.org/psi4manual/master/basissets_tables.html + charge : int, default=0 + charge of system + multiplicity : int, default=1 + multiplicity of system + + Returns + ------- + str + content of Psi4 input file + """ + return template.format( + atoms="\n".join( + [ + "{:s} {:16.9f} {:16.9f} {:16.9f}".format(*ii) + for ii in zip(types, *coords.T) + ] + ), + charge=charge, + multiplicity=multiplicity, + method=method, + basis=basis, + ) diff --git a/dpdata/formats/psi4/output.py b/dpdata/formats/psi4/output.py new file mode 100644 index 000000000..c3594ffb4 --- /dev/null +++ b/dpdata/formats/psi4/output.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.unit import LengthConversion +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + + +def read_psi4_output(fn: FileType) -> tuple[str, np.ndarray, float, np.ndarray]: + """Read from Psi4 output. + + Note that both the energy and the gradient should be printed. 
+ + Parameters + ---------- + fn : str + file name + + Returns + ------- + str + atomic symbols + np.ndarray + atomic coordinates + float + total potential energy + np.ndarray + atomic forces + """ + coord = None + symbols = None + forces = None + energy = None + length_unit = None + with open_file(fn) as f: + flag = 0 + for line in f: + if flag in (1, 3, 4, 5, 6): + flag += 1 + elif flag == 2: + s = line.split() + if not len(s): + flag = 0 + else: + symbols.append(s[0].capitalize()) + coord.append([float(s[1]), float(s[2]), float(s[3])]) + elif flag == 7: + s = line.split() + if not len(s): + flag = 0 + else: + forces.append([float(s[1]), float(s[2]), float(s[3])]) + elif line.startswith( + " Center X Y Z Mass" + ): + # coord + flag = 1 + coord = [] + symbols = [] + elif line.startswith(" Geometry (in "): + # remove ), + length_unit = line.split()[2][:-2].lower() + elif line.startswith(" ## Total Gradient"): + flag = 3 + forces = [] + elif line.startswith(" Total Energy ="): + energy = float(line.split()[-1]) + assert length_unit is not None + length_convert = LengthConversion(length_unit, "angstrom").value() + symbols = np.array(symbols) + forces = -np.array(forces) + coord = np.array(coord) * length_convert + assert coord.shape == forces.shape + + return symbols, coord, energy, forces diff --git a/dpdata/formats/pwmat/__init__.py b/dpdata/formats/pwmat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/pwmat/atomconfig.py b/dpdata/formats/pwmat/atomconfig.py new file mode 100644 index 000000000..11677b0ef --- /dev/null +++ b/dpdata/formats/pwmat/atomconfig.py @@ -0,0 +1,95 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +from ...periodic_table import ELEMENTS + + +def _to_system_data_lower(lines): + system = {} + natoms = int(lines[0].split()[0]) + cell = [] + for idx, ii in enumerate(lines): + if "lattice" in ii or "Lattice" in ii or "LATTICE" in ii: + for kk in range(idx + 1, idx + 1 + 3): 
+ vector = [float(jj) for jj in lines[kk].split()[0:3]] + cell.append(vector) + system["cells"] = np.array([cell]) + coord = [] + atomic_number = [] + atom_numbs = [] + for idx, ii in enumerate(lines): + if "Position" in ii or "POSITION" in ii or "position" in ii: + for kk in range(idx + 1, idx + 1 + natoms): + min = kk + for jj in range(kk + 1, idx + 1 + natoms): + if int(lines[jj].split()[0]) < int(lines[min].split()[0]): + min = jj + lines[min], lines[kk] = lines[kk], lines[min] + for gg in range(idx + 1, idx + 1 + natoms): + tmpv = [float(jj) for jj in lines[gg].split()[1:4]] + tmpv = np.matmul(np.array(tmpv), system["cells"][0]) + coord.append(tmpv) + tmpn = int(lines[gg].split()[0]) + atomic_number.append(tmpn) + for ii in np.unique(sorted(atomic_number)): + atom_numbs.append(atomic_number.count(ii)) + system["atom_numbs"] = [int(ii) for ii in atom_numbs] + system["coords"] = np.array([coord]) + system["orig"] = np.zeros(3) + atom_types = [] + for idx, ii in enumerate(system["atom_numbs"]): + for jj in range(ii): + atom_types.append(idx) + system["atom_types"] = np.array(atom_types, dtype=int) + system["atom_names"] = [ELEMENTS[ii - 1] for ii in np.unique(sorted(atomic_number))] + return system + + +def to_system_data(lines): + return _to_system_data_lower(lines) + + +def from_system_data(system, f_idx=0, skip_zeros=True): + ret = "" + natoms = sum(system["atom_numbs"]) + ret += "%d" % natoms # noqa: UP031 + ret += "\n" + ret += "LATTICE" + ret += "\n" + for ii in system["cells"][f_idx]: + for jj in ii: + ret += f"{jj:.16e} " + ret += "\n" + ret += "POSITION" + ret += "\n" + atom_numbs = system["atom_numbs"] + atom_names = system["atom_names"] + atype = system["atom_types"] + posis = system["coords"][f_idx] + # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] + # sort_idx = np.argsort(atype, kind = 'mergesort') + sort_idx = np.lexsort((np.arange(len(atype)), atype)) + atype = atype[sort_idx] + posis = posis[sort_idx] + symbal = [] + for ii, jj in 
zip(atom_numbs, atom_names): + for kk in range(ii): + symbal.append(jj) + atomic_numbers = [] + for ii in symbal: + atomic_numbers.append(ELEMENTS.index(ii) + 1) + posi_list = [] + for jj, ii in zip(atomic_numbers, posis): + ii = np.matmul(ii, np.linalg.inv(system["cells"][0])) + posi_list.append("%d %15.10f %15.10f %15.10f 1 1 1" % (jj, ii[0], ii[1], ii[2])) # noqa: UP031 + for kk in range(len(posi_list)): + min = kk + for jj in range(kk, len(posi_list)): + if int(posi_list[jj].split()[0]) < int(posi_list[min].split()[0]): + min = jj + posi_list[min], posi_list[kk] = posi_list[kk], posi_list[min] + posi_list.append("") + ret += "\n".join(posi_list) + return ret diff --git a/dpdata/formats/pwmat/movement.py b/dpdata/formats/pwmat/movement.py new file mode 100644 index 000000000..a0f28e64b --- /dev/null +++ b/dpdata/formats/pwmat/movement.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import warnings + +import numpy as np + +from ...periodic_table import ELEMENTS + + +def system_info(lines, type_idx_zero=False): + atom_names = [] + atom_numbs = [] + nelm = 0 + natoms = int(lines[0].split()[0]) + iteration = float(lines[0].split("Etot")[0].split("=")[1].split(",")[0]) + # print(iteration) + if iteration > 0: + nelm = 40 + else: + nelm = 100 + atomic_number = [] + for idx, ii in enumerate(lines): + if ("Position" in ii) and ("nonperiodic_Position" not in ii): + for kk in range(idx + 1, idx + 1 + natoms): + min = kk + for jj in range(kk + 1, idx + 1 + natoms): + if int(lines[jj].split()[0]) < int(lines[min].split()[0]): + min = jj + lines[min], lines[kk] = lines[kk], lines[min] + for gg in range(idx + 1, idx + 1 + natoms): + tmpn = int(lines[gg].split()[0]) + atomic_number.append(tmpn) + for ii in np.unique(sorted(atomic_number)): + atom_numbs.append(atomic_number.count(ii)) + atom_types = [] + for idx, ii in enumerate(atom_numbs): + for jj in range(ii): + if type_idx_zero: + atom_types.append(idx) + else: + atom_types.append(idx + 1) + for ii in 
def get_movement_block(fp):
    """Return the next frame block from a MOVEMENT-style line source.

    Lines are accumulated (with trailing newlines stripped) until a
    separator line containing ``------------`` is reached; the separator
    itself is kept as the last element.  A falsy (empty) line or exhaustion
    of ``fp`` also terminates the block, so plain lists of lines work too.
    """
    blk = []
    for ii in fp:
        if not ii:
            return blk
        blk.append(ii.rstrip("\n"))
        if "------------" in ii:
            return blk
    return blk


# we assume that the force is printed ...
def get_frames(fname, begin=0, step=1, convergence_check=True):
    """Read sampled frames from a MOVEMENT trajectory file.

    Parameters
    ----------
    fname : str
        Path of the MOVEMENT file.
    begin : int
        Index of the first frame to collect.
    step : int
        Collect every ``step``-th frame starting from ``begin``.
    convergence_check : bool
        When True, frames whose SCF loop reached the iteration limit are
        dropped; either way their 1-based frame numbers are reported in a
        warning.

    Returns
    -------
    tuple
        ``(atom_names, atom_numbs, atom_types, cells, coords, energies,
        forces, virials)``; ``virials`` is None when the file carries no
        stress information.
    """
    all_coords = []
    all_cells = []
    all_energies = []
    all_forces = []
    all_virials = []
    rec_failed = []

    # context manager guarantees the handle is closed even if parsing
    # raises (the original used a bare open()/close() pair and leaked the
    # handle on error)
    with open(fname) as fp:
        blk = get_movement_block(fp)

        atom_names, atom_numbs, atom_types, nelm = system_info(
            blk, type_idx_zero=True
        )
        ntot = sum(atom_numbs)

        cc = 0
        while len(blk) > 0:
            if cc >= begin and (cc - begin) % step == 0:
                coord, cell, energy, force, virial, is_converge = analyze_block(
                    blk, ntot, nelm
                )
                if len(coord) == 0:
                    break
                if is_converge or not convergence_check:
                    all_coords.append(coord)
                    all_cells.append(cell)
                    all_energies.append(energy)
                    all_forces.append(force)
                    if virial is not None:
                        all_virials.append(virial)
                if not is_converge:
                    rec_failed.append(cc + 1)

            blk = get_movement_block(fp)
            cc += 1

    if len(rec_failed) > 0:
        prt = (
            "so they are not collected."
            if convergence_check
            else "but they are still collected due to the requirement for ignoring convergence checks."
        )
        warnings.warn(
            f"The following structures were unconverged: {rec_failed}; " + prt
        )

    all_virials = np.array(all_virials) if len(all_virials) > 0 else None
    return (
        atom_names,
        atom_numbs,
        atom_types,
        np.array(all_cells),
        np.array(all_coords),
        np.array(all_energies),
        np.array(all_forces),
        all_virials,
    )


def _sort_atom_lines(lines, start, natoms):
    """Reorder ``lines[start:start+natoms]`` in place by the atom index in
    column 0.

    Replaces the original O(n^2) selection sort (which also shadowed the
    builtin ``min``) with an O(n log n) keyed sort; ordering is identical
    for the well-formed case of distinct atom indices.
    """
    lines[start : start + natoms] = sorted(
        lines[start : start + natoms], key=lambda ln: int(ln.split()[0])
    )


def analyze_block(lines, ntot, nelm):
    """Parse one frame block of a MOVEMENT file.

    Parameters
    ----------
    lines : list of str
        One block as returned by ``get_movement_block``; atom data lines
        may be reordered in place.
    ntot : int
        Total number of atoms in the system.
    nelm : int
        SCF iteration limit; reaching it marks the frame unconverged.

    Returns
    -------
    tuple
        ``(coord, cell, energy, force, virial, is_converge)`` where
        ``virial`` is None unless a stress-carrying lattice section was
        found.
    """
    coord = []
    cell = []
    energy = None
    force = []
    virial = None
    is_converge = True
    for idx, ii in enumerate(lines):
        if "Iteration" in ii:
            sc_index = int(ii.split("SCF =")[1])
            if sc_index >= nelm:
                is_converge = False
            # use Ep, not Etot = Ep + Ek
            energy = float(ii.split("Etot,Ep,Ek (eV)")[1].split()[2])
        elif "----------" in ii:
            # end-of-block separator: everything mandatory must be present
            assert (force is not None) and len(coord) > 0 and len(cell) > 0
            return coord, cell, energy, force, virial, is_converge
        elif "Lattice vector" in ii:
            if "stress" in lines[idx + 1]:
                # lattice rows carry stress components in columns 5:8
                tmp_v = []
                for dd in range(3):
                    tmp_l = lines[idx + 1 + dd]
                    cell.append([float(ss) for ss in tmp_l.split()[0:3]])
                    tmp_v.append([float(stress) for stress in tmp_l.split()[5:8]])
                virial = np.array(tmp_v)
                volume = np.linalg.det(np.array(cell))
                # same scaling as the original code; presumably converts
                # the printed stress to eV via the cell volume — TODO confirm
                # the 160.2 * 10.0 factor against the MOVEMENT spec
                virial = virial * 160.2 * 10.0 / volume
            else:
                for dd in range(3):
                    tmp_l = lines[idx + 1 + dd]
                    cell.append([float(ss) for ss in tmp_l.split()[0:3]])
        elif ("Position" in ii) and ("nonperiodic_Position" not in ii):
            _sort_atom_lines(lines, idx + 1, ntot)
            for gg in range(idx + 1, idx + 1 + ntot):
                frac = [float(jj) for jj in lines[gg].split()[1:4]]
                # fractional -> cartesian
                coord.append(np.matmul(np.array(frac), np.array(cell)))
        elif "Force" in ii:
            _sort_atom_lines(lines, idx + 1, ntot)
            for gg in range(idx + 1, idx + 1 + ntot):
                # forces in MOVEMENT file are dE/dR, lacking a minus sign
                info = [-float(ss) for ss in lines[gg].split()]
                force.append(info[1:4])
    return coord, cell, energy, force, virial, is_converge
from collections import Counter

import numpy as np


def to_system_data(file_name, protect_layer=9):
    """Read a molecule file via pymatgen and convert it to dpdata's data dict.

    The molecule is treated as non-periodic; a fixed 10 Angstrom cubic box
    is attached as the cell.  ``protect_layer`` is accepted for interface
    compatibility but is not used here.
    """
    from pymatgen.core import Molecule

    molecule = Molecule.from_file(file_name)
    symbols = [str(site.species.elements[0]) for site in molecule.sites]
    counts = Counter(symbols)
    names = list(counts.keys())
    numbs = list(counts.values())
    types = [names.index(sym) for sym in symbols]
    natoms = np.sum(numbs)  # kept for parity with the original; unused below

    cart = np.copy(molecule.cart_coords)

    system = {}
    system["atom_names"] = names
    system["atom_numbs"] = numbs
    system["atom_types"] = np.array(types, dtype=int)
    system["orig"] = np.array([0, 0, 0])
    system["coords"] = np.array([cart])
    system["cells"] = np.array([10.0 * np.eye(3)])
    return system


def from_system_data(structure) -> dict:
    """Convert one pymatgen structure to dpdata's datadict."""
    symbols = [site.specie.symbol for site in structure]
    atom_names = list(structure.symbol_set)
    atom_numbs = [symbols.count(name) for name in atom_names]
    atom_types = np.array([atom_names.index(sym) for sym in symbols]).astype(int)

    # only fully periodic or fully non-periodic structures are representable
    flags = structure.pbc
    if all(flags):
        pbc = True
    elif not any(flags):
        pbc = False
    else:
        raise ValueError(f"Partial pbc condition {structure.pbc} is not supported")

    return {
        "atom_names": atom_names,
        "atom_numbs": atom_numbs,
        "atom_types": atom_types,
        "coords": np.array([structure.cart_coords]),
        "cells": np.array([structure.lattice.matrix]),
        "orig": np.zeros(3),
        "nopbc": not pbc,
    }
import os

import numpy as np

# NOTE(review): the original module additionally imports ``open_file`` from
# dpdata.utils and the unit constants ``ry2ev``, ``kbar2evperang3`` and
# ``bohr2ang`` (length_convert) from ``.traj``; they are required by
# get_cell (celldm branch), get_energy, get_force, get_stress and get_frame
# and must remain importable in the installed package.

# card keywords that terminate a free-form data block in a pw.x input
_QE_BLOCK_KEYWORDS = [
    "ATOMIC_SPECIES",
    "ATOMIC_POSITIONS",
    "K_POINTS",
    "ADDITIONAL_K_POINTS",
    "CELL_PARAMETERS",
    "CONSTRAINTS",
    "OCCUPATIONS",
    "ATOMIC_VELOCITIES",
    "ATOMIC_FORCES",
    "SOLVENTS",
    "HUBBARD",
]


def get_block(lines, keyword, skip=0):
    """Return the data lines that follow the first line containing ``keyword``.

    ``skip`` extra lines are ignored after the keyword line; leading blank
    lines are skipped; collection stops at a blank line, the next card
    keyword, or end of input.

    Bounds are now checked *before* indexing: the original tested
    ``blk_idx != len(lines)`` after evaluating ``lines[blk_idx]`` and so
    raised IndexError whenever a block ran to the end of the input.
    """
    ret = []
    for idx, ii in enumerate(lines):
        if keyword not in ii:
            continue
        blk_idx = idx + 1 + skip
        # skip leading blank lines, staying in bounds
        while blk_idx < len(lines) and len(lines[blk_idx].split()) == 0:
            blk_idx += 1
        # collect until blank line, next card keyword, or end of input
        while (
            blk_idx < len(lines)
            and len(lines[blk_idx].split()) != 0
            and lines[blk_idx].split()[0] not in _QE_BLOCK_KEYWORDS
        ):
            ret.append(lines[blk_idx])
            blk_idx += 1
        break
    return ret


def get_cell(lines):
    """Return the cell matrix (Angstrom) from a pw.x input.

    Supports ``ibrav == 0`` (explicit CELL_PARAMETERS in Angstrom) and the
    simple cubic ``ibrav == 1`` (via ``a`` or ``celldm(1)``); anything else
    raises RuntimeError.
    """
    ret = []
    for idx, ii in enumerate(lines):
        if "ibrav" in ii:
            break
    blk = lines[idx : idx + 2]
    ibrav = int(blk[0].replace(",", "").split("=")[-1])
    if ibrav == 0:
        for iline in lines:
            if "CELL_PARAMETERS" in iline and "angstrom" not in iline.lower():
                raise RuntimeError(
                    "CELL_PARAMETERS must be written in Angstrom. Other units are not supported yet."
                )
        blk = get_block(lines, "CELL_PARAMETERS")
        for ii in blk:
            ret.append([float(jj) for jj in ii.split()[0:3]])
        ret = np.array(ret)
    elif ibrav == 1:
        a = None
        for iline in lines:
            line = iline.replace("=", " ").replace(",", "").split()
            if len(line) >= 2 and "a" == line[0]:
                a = float(line[1])
            if len(line) >= 2 and "celldm(1)" == line[0]:
                # celldm(1) is given in Bohr
                a = float(line[1]) * bohr2ang
        # NOTE(review): ``not a`` also fires for a == 0.0, which would be a
        # zero-volume cell anyway
        if not a:
            raise RuntimeError("parameter 'a' or 'celldm(1)' cannot be found.")
        ret = np.array([[a, 0.0, 0.0], [0.0, a, 0.0], [0.0, 0.0, a]])
    else:
        raise RuntimeError("ibrav > 1 not supported yet.")
    return ret


def get_coords(lines, cell):
    """Parse ATOMIC_POSITIONS into names, counts, types and cartesian coords.

    Accepts positions in Angstrom or crystal (fractional) units; fractional
    coordinates are converted with ``cell``.  Atom-name order of first
    appearance is preserved.
    """
    coord = []
    atom_symbol_list = []
    for iline in lines:
        if "ATOMIC_POSITIONS" in iline and (
            "angstrom" not in iline.lower() and "crystal" not in iline.lower()
        ):
            raise RuntimeError(
                "ATOMIC_POSITIONS must be written in Angstrom or crystal. Other units are not supported yet."
            )
        if "ATOMIC_POSITIONS" in iline and "angstrom" in iline.lower():
            blk = get_block(lines, "ATOMIC_POSITIONS")
            for ii in blk:
                coord.append([float(jj) for jj in ii.split()[1:4]])
                atom_symbol_list.append(ii.split()[0])
            coord = np.array(coord)
        elif "ATOMIC_POSITIONS" in iline and "crystal" in iline.lower():
            blk = get_block(lines, "ATOMIC_POSITIONS")
            for ii in blk:
                coord.append([float(jj) for jj in ii.split()[1:4]])
                atom_symbol_list.append(ii.split()[0])
            coord = np.array(coord)
            # fractional -> cartesian
            coord = np.matmul(coord, cell)
    atom_symbol_list = np.array(atom_symbol_list)
    tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True)
    atom_types = []
    atom_numbs = []
    # preserve the atom_name order of first appearance
    atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")]
    for jj in atom_symbol_list:
        for idx, ii in enumerate(atom_names):
            if jj == ii:
                atom_types.append(idx)
    for idx in range(len(atom_names)):
        atom_numbs.append(atom_types.count(idx))
    atom_types = np.array(atom_types)

    return list(atom_names), atom_numbs, atom_types, coord


def get_energy(lines):
    """Return the last converged total energy (eV), or None if absent."""
    energy = None
    for ii in lines:
        if "! total energy" in ii:
            energy = ry2ev * float(ii.split("=")[1].split()[0])
    return energy


def get_force(lines, natoms):
    """Return per-atom forces (eV/Angstrom) from a pw.x output."""
    blk = get_block(lines, "Forces acting on atoms", skip=1)
    ret = []
    blk = blk[0 : sum(natoms)]
    for ii in blk:
        ret.append([float(jj) for jj in ii.split("=")[1].split()])
    ret = np.array(ret)
    ret *= ry2ev / bohr2ang
    return ret


def get_stress(lines):
    """Return the stress tensor (eV/Angstrom^3), or None if not printed."""
    blk = get_block(lines, "total stress")
    if len(blk) == 0:
        return None
    ret = []
    for ii in blk:
        ret.append([float(jj) for jj in ii.split()[3:6]])
    ret = np.array(ret)
    ret *= kbar2evperang3
    return ret


def get_frame(fname):
    """Parse one labeled frame from a pw.x scf input/output pair.

    ``fname`` is either the output path (the input path is derived by the
    'out' -> 'in' name convention) or a ``[input, output]`` pair.
    """
    if isinstance(fname, str):
        path_out = fname
        outname = os.path.basename(path_out)
        # the name of the input file is assumed to be different from the
        # output by 'in' and 'out'
        inname = outname.replace("out", "in")
        path_in = os.path.join(os.path.dirname(path_out), inname)
    elif isinstance(fname, list) and len(fname) == 2:
        path_in = fname[0]
        path_out = fname[1]
    else:
        raise RuntimeError("invalid input")
    with open_file(path_out) as fp:
        outlines = fp.read().split("\n")
    with open_file(path_in) as fp:
        inlines = fp.read().split("\n")
    cell = get_cell(inlines)
    atom_names, natoms, types, coords = get_coords(inlines, cell)
    energy = get_energy(outlines)
    force = get_force(outlines, natoms)
    stress = get_stress(outlines)
    if stress is not None:
        # stress -> virial: scale by the cell volume
        stress = (stress * np.linalg.det(cell))[np.newaxis, :, :]
    return (
        atom_names,
        natoms,
        types,
        cell[np.newaxis, :, :],
        coords[np.newaxis, :, :],
        np.array(energy)[np.newaxis],
        force[np.newaxis, :, :],
        stress,
    )
__future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +import os + +from ...unit import ( + EnergyConversion, + ForceConversion, + LengthConversion, + PressureConversion, +) + +ry2ev = EnergyConversion("rydberg", "eV").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() +gpa2evperbohr = PressureConversion("GPa", "eV/bohr^3").value() + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + + +def load_key(lines, key): + for ii in lines: + if key in ii: + words = ii.split(",") + for jj in words: + if key in jj: + return jj.split("=")[1] + return None + + +def load_block(lines, key, nlines): + for idx, ii in enumerate(lines): + if key in ii: + break + return lines[idx + 1 : idx + 1 + nlines] + + +def convert_celldm(ibrav, celldm): + if ibrav == 1: + return celldm[0] * np.eye(3) + elif ibrav == 2: + return celldm[0] * 0.5 * np.array([[-1, 0, 1], [0, 1, 1], [-1, 1, 0]]) + elif ibrav == 3: + return celldm[0] * 0.5 * np.array([[1, 1, 1], [-1, 1, 1], [-1, -1, 1]]) + elif ibrav == -3: + return celldm[0] * 0.5 * np.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]]) + else: + warnings.warn( + "unsupported ibrav " + + str(ibrav) + + " if no .cel file, the cell convertion may be wrong. 
" + ) + return np.eye(3) + # raise RuntimeError('unsupported ibrav ' + str(ibrav)) + + +def load_cell_parameters(lines): + blk = load_block(lines, "CELL_PARAMETERS", 3) + ret = [] + for ii in blk: + ret.append([float(jj) for jj in ii.split()[0:3]]) + return np.array(ret) + + +def load_atom_names(lines, ntypes): + blk = load_block(lines, "ATOMIC_SPECIES", ntypes) + return [ii.split()[0] for ii in blk] + + +def load_celldm(lines): + celldm = np.zeros(6) + for ii in range(6): + key = "celldm(%d)" % (ii + 1) # noqa: UP031 + val = load_key(lines, key) + if val is not None: + celldm[ii] = float(val) + return celldm + + +def load_atom_types(lines, natoms, atom_names): + blk = load_block(lines, "ATOMIC_POSITIONS", natoms) + ret = [] + for ii in blk: + ret.append(atom_names.index(ii.split()[0])) + return np.array(ret, dtype=int) + + +def load_param_file(fname: FileType): + with open_file(fname) as fp: + lines = fp.read().split("\n") + natoms = int(load_key(lines, "nat")) + ntypes = int(load_key(lines, "ntyp")) + atom_names = load_atom_names(lines, ntypes) + atom_types = load_atom_types(lines, natoms, atom_names) + atom_numbs = [] + for ii in range(ntypes): + atom_numbs.append(np.sum(atom_types == ii)) + ibrav = int(load_key(lines, "ibrav")) + celldm = load_celldm(lines) + if ibrav == 0: + cell = load_cell_parameters(lines) + else: + cell = convert_celldm(ibrav, celldm) + cell = cell * length_convert + # print(atom_names) + # print(atom_numbs) + # print(atom_types) + # print(cell) + return atom_names, atom_numbs, atom_types, cell + + +def _load_pos_block(fp, natoms): + head = fp.readline() + if not head: + # print('get None') + return None, None + else: + ss = head.split()[0] + blk = [] + for ii in range(natoms): + newline = fp.readline() + if not newline: + return None, None + blk.append([float(jj) for jj in newline.split()]) + return blk, ss + + +def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0): + coords = [] + steps = [] + cc = 0 + with 
def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0):
    """Load every ``step``-th frame (starting at ``begin``) from a cp.x
    trajectory file (.pos/.cel/.for/.str layout: one tag line then
    ``natoms`` numeric rows per frame).

    Returns
    -------
    tuple
        ``(frames, steps)`` where ``frames`` is an array scaled by
        ``convert`` and ``steps`` the per-frame step tags (strings).
    """
    coords = []
    steps = []
    cc = 0
    with open_file(fname) as fp:
        while True:
            blk, ss = _load_pos_block(fp, natoms)
            if blk is None:
                # EOF or truncated frame terminates the scan
                break
            else:
                if cc >= begin and (cc - begin) % step == 0:
                    coords.append(blk)
                    steps.append(ss)
            cc += 1
    coords = convert * np.array(coords)
    return coords, steps


def load_energy(fname, begin=0, step=1):
    """Load sampled energies (eV) from a .evp file.

    Returns
    -------
    tuple or None
        ``(energies, steps)`` where energies come from column 5; None when
        the file contains no data line (only comments).
    """
    data = np.loadtxt(fname, ndmin=2)
    steps = []
    for ii in data[begin::step, 0]:
        steps.append("%d" % ii)  # noqa: UP031
    # determine the column count from the first non-comment line so the
    # flat array can be reshaped into rows
    with open_file(fname) as fp:
        while True:
            line = fp.readline()
            if not line:
                return None
            if line.split()[0][0] != "#":
                nw = len(line.split())
                break
    data = np.reshape(data, [-1, nw])
    return energy_convert * data[begin::step, 5], steps


def to_system_data(input_name, prefix, begin=0, step=1):
    """Assemble an unlabeled dpdata system from a cp.x input file plus the
    ``prefix``-named trajectory files.

    ``prefix + ".pos"`` supplies coordinates; ``prefix + ".cel"`` the
    per-frame cells (falling back to the input-file cell when absent);
    ``prefix + ".str"`` the stress (optional, converted to virials).

    Raises
    ------
    RuntimeError
        When the step tags of .pos disagree with .cel or .str.
    """
    data = {}
    data["atom_names"], data["atom_numbs"], data["atom_types"], cell = load_param_file(
        input_name
    )
    data["coords"], csteps = load_data(
        prefix + ".pos",
        np.sum(data["atom_numbs"]),
        begin=begin,
        step=step,
        convert=length_convert,
    )
    data["orig"] = np.zeros(3)
    try:
        data["cells"], tmp_steps = load_data(
            prefix + ".cel", 3, begin=begin, step=step, convert=length_convert
        )
        # cp.x writes cell vectors as columns; transpose to rows
        data["cells"] = np.transpose(data["cells"], (0, 2, 1))
        if csteps != tmp_steps:
            # sentinel None guarantees the scan below finds a mismatch
            csteps.append(None)
            tmp_steps.append(None)
            for int_id in range(len(csteps)):
                if csteps[int_id] != tmp_steps[int_id]:
                    break
            step_id = begin + int_id * step
            # NOTE: raised inside the try but not caught below — only
            # FileNotFoundError is handled, so this propagates as intended
            raise RuntimeError(
                f"the step key between files are not consistent. "
                f"The difference locates at step: {step_id}, "
                f".pos is {csteps[int_id]}, .cel is {tmp_steps[int_id]}"
            )
    except FileNotFoundError:
        # no .cel file: replicate the static cell from the input file
        data["cells"] = np.tile(cell, (data["coords"].shape[0], 1, 1))

    # handle virial
    stress_fname = prefix + ".str"
    if os.path.exists(stress_fname):
        # 1. Read stress tensor (in GPa) for each structure
        stress, vsteps = load_data(stress_fname, 3, begin=begin, step=step, convert=1.0)
        if csteps != vsteps:
            csteps.append(None)
            vsteps.append(None)
            for int_id in range(len(csteps)):
                if csteps[int_id] != vsteps[int_id]:
                    break
            step_id = begin + int_id * step
            raise RuntimeError(
                f"the step key between files are not consistent. "
                f"The difference locates at step: {step_id}, "
                f".pos is {csteps[int_id]}, .str is {vsteps[int_id]}"
            )
        # 2. Calculate volume from cell. revert unit to bohr before taking det
        volumes = np.linalg.det(data["cells"] / length_convert).reshape(-1)
        # 3. Calculate virials for each structure, shape [nf x 3 x 3]
        data["virials"] = gpa2evperbohr * volumes[:, None, None] * stress

    return data, csteps


def to_system_label(input_name, prefix, begin=0, step=1):
    """Load the labels (energies and forces) matching ``to_system_data``'s
    sampling; asserts that .evp and .for agree on the step tags."""
    atom_names, atom_numbs, atom_types, cell = load_param_file(input_name)
    energy, esteps = load_energy(prefix + ".evp", begin=begin, step=step)
    force, fsteps = load_data(
        prefix + ".for",
        np.sum(atom_numbs),
        begin=begin,
        step=step,
        convert=force_convert,
    )
    assert esteps == fsteps, "the step key between files are not consistent "
    return energy, force, esteps


if __name__ == "__main__":
    # ad-hoc smoke test against local 'nacl' files
    # NOTE(review): load_data returns a (array, steps) tuple, so the
    # .shape accesses below raise AttributeError — this demo looks stale
    prefix = "nacl"
    atom_names, atom_numbs, atom_types, cell = load_param_file(prefix + ".in")
    coords = load_data(prefix + ".pos", np.sum(atom_numbs))
    cells = load_data(prefix + ".cel", 3)
    print(atom_names)
    print(atom_numbs)
    print(atom_types)
    print(cells)
    print(coords.shape)
    print(cells.shape)
import os
import time
from copy import deepcopy


def get_explicit_valence(atom, verbose=False):
    """Return the explicit valence of ``atom`` as the integer sum of its
    bond orders.

    rdkit's own explicit-valence bookkeeping (``GetValence`` on modern
    rdkit, ``GetExplicitValence`` on older releases or when rdkit is
    unavailable) is consulted only to warn about inconsistencies; the
    bond-order sum is what is returned.
    """
    bond_order_sum = int(sum(bond.GetBondTypeAsDouble() for bond in atom.GetBonds()))
    try:
        try:
            from rdkit import Chem

            reported = atom.GetValence(Chem.ValenceType.EXPLICIT)
            valence_method = "GetValence(Chem.ValenceType.EXPLICIT)"
        except (ImportError, AttributeError, TypeError):
            # older rdkit (or rdkit absent): fall back to the legacy getter
            reported = atom.GetExplicitValence()
            valence_method = "GetExplicitValence()"
        if reported != bond_order_sum and verbose:
            print(
                f"Explicit valence given by {valence_method} and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx() + 1}, using sum of bond order."
            )
        return bond_order_sum
    except Exception:
        # any failure in the diagnostic path: the bond-order sum still holds
        return bond_order_sum


def regularize_formal_charges(mol, sanitize=True, verbose=False):
    """Regularize formal charges of atoms."""
    from rdkit import Chem

    assert isinstance(mol, Chem.rdchem.Mol)
    for atom in mol.GetAtoms():
        assign_formal_charge_for_atom(atom, verbose)
    if not sanitize:
        return mol
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # sanitization rejected the charge assignment
        return None
    return mol


def assign_formal_charge_for_atom(atom, verbose=False):
    """Assign formal charge according to the 8-electron rule for elements
    B, C, N, O, S, P and As."""
    from rdkit import Chem

    assert isinstance(atom, Chem.rdchem.Atom)
    valence = get_explicit_valence(atom, verbose)
    symbol = atom.GetSymbol()
    if symbol == "B":
        atom.SetFormalCharge(3 - valence)
    elif symbol == "C":
        atom.SetFormalCharge(valence - 4)
        if valence == 3:
            print(
                f"Detect a valence of 3 on #C{atom.GetIdx() + 1}, the formal charge of this atom will be assigned to -1"
            )
        elif valence > 4:
            raise ValueError(f"#C{atom.GetIdx() + 1} has a valence larger than 4")
    elif symbol == "N":
        if valence > 4:
            raise ValueError(f"#N{atom.GetIdx() + 1} has a valence larger than 4")
        atom.SetFormalCharge(valence - 3)
    elif symbol == "O":
        atom.SetFormalCharge(valence - 2)
    elif symbol == "S":
        if valence == 1:
            atom.SetFormalCharge(-1)
        elif valence == 3:
            atom.SetFormalCharge(1)
        elif valence > 6:
            raise ValueError(f"#S{atom.GetIdx() + 1} has a valence larger than 6")
        else:
            atom.SetFormalCharge(0)
    elif symbol in ("P", "As"):
        if valence == 5:
            atom.SetFormalCharge(0)
        elif valence > 5:
            raise ValueError(
                f"#{atom.GetSymbol()}{atom.GetIdx() + 1} has a valence larger than 5"
            )
        else:
            atom.SetFormalCharge(valence - 3)
def print_bonds(mol):
    """Debug helper: print every bond as 'Sym1idx Sym2idx bondtype'."""
    for bond in mol.GetBonds():
        begin_atom = bond.GetBeginAtom()
        end_atom = bond.GetEndAtom()
        print(
            f"{begin_atom.GetSymbol()}{begin_atom.GetIdx() + 1} {end_atom.GetSymbol()}{end_atom.GetIdx() + 1} {bond.GetBondType()}"
        )


def print_atoms(mol):
    """Debug helper: print every atom as 'Symidx formal_charge valence'."""
    for atom in mol.GetAtoms():
        print(
            f"{atom.GetSymbol()}{atom.GetIdx() + 1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}"
        )


def is_terminal_oxygen(O_atom):
    """True when the oxygen has exactly one neighbor."""
    return len(O_atom.GetNeighbors()) == 1


def get_terminal_oxygens(atom):
    """Return the terminal O (or S) neighbors of ``atom``."""
    terminal_oxygens = []
    for nei in atom.GetNeighbors():
        if nei.GetSymbol() == "O" or nei.GetSymbol() == "S":
            if is_terminal_oxygen(nei):
                terminal_oxygens.append(nei)
    return terminal_oxygens


def is_terminal_NR2(N_atom):
    """True when the nitrogen has exactly three neighbors (an NR2 group)."""
    return len(N_atom.GetNeighbors()) == 3


def get_terminal_NR2s(atom):
    """Return the NR2-type nitrogen neighbors of ``atom``, sorted by their
    hydrogen count (fewest H first)."""
    terminal_NR2s = []
    for nei in atom.GetNeighbors():
        if nei.GetSymbol() == "N":
            if is_terminal_NR2(nei):
                terminal_NR2s.append(nei)
    terminal_NR2s.sort(
        key=lambda N_atom: len(
            [atom for atom in N_atom.GetNeighbors() if atom.GetSymbol() == "H"]
        )
    )
    return terminal_NR2s


def sanitize_phosphate_Patom(P_atom, verbose=True):
    """Fix a phosphate group in place: one P=O plus P-O(-1) singles."""
    from rdkit import Chem

    if P_atom.GetSymbol() == "P":
        terminal_oxygens = get_terminal_oxygens(P_atom)
        mol = P_atom.GetOwningMol()
        if len(terminal_oxygens) > 1:
            if verbose:
                print("Phospate group detected, sanitizing it...")
            # set one P=O and two P-O
            bond1 = mol.GetBondBetweenAtoms(
                P_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.DOUBLE)
            for ii in range(1, len(terminal_oxygens)):
                bond = mol.GetBondBetweenAtoms(
                    P_atom.GetIdx(), terminal_oxygens[ii].GetIdx()
                )
                bond.SetBondType(Chem.rdchem.BondType.SINGLE)
                terminal_oxygens[ii].SetFormalCharge(-1)


def sanitize_phosphate(mol):
    """Apply ``sanitize_phosphate_Patom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_phosphate_Patom(atom)
    return mol


def sanitize_sulfate_Satom(S_atom, verbose=True):
    """Fix a sulfate group in place: one S-O(-1) plus two S=O doubles."""
    from rdkit import Chem

    if S_atom.GetSymbol() == "S":
        terminal_oxygens = get_terminal_oxygens(S_atom)
        mol = S_atom.GetOwningMol()
        if len(terminal_oxygens) == 3:
            if verbose:
                print("Sulfate group detected, sanitizing it...")
            # set one S-O and two S=O
            bond1 = mol.GetBondBetweenAtoms(
                S_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)
            for ii in range(1, len(terminal_oxygens)):
                bond = mol.GetBondBetweenAtoms(
                    S_atom.GetIdx(), terminal_oxygens[ii].GetIdx()
                )
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)


def sanitize_sulfate(mol):
    """Apply ``sanitize_sulfate_Satom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_sulfate_Satom(atom)
    return mol


def sanitize_carboxyl_Catom(C_atom, verbose=True):
    """Fix a carboxyl group in place: one C-O(-1) plus one C=O."""
    from rdkit import Chem

    if C_atom.GetSymbol() == "C":
        terminal_oxygens = get_terminal_oxygens(C_atom)
        mol = C_atom.GetOwningMol()
        if len(terminal_oxygens) == 2:
            if verbose:
                print("Carbonxyl group detected, sanitizing it...")
            # set one C-O and one C=O
            bond1 = mol.GetBondBetweenAtoms(
                C_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(
                C_atom.GetIdx(), terminal_oxygens[1].GetIdx()
            )
            bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_oxygens[1].SetFormalCharge(0)


def sanitize_carboxyl(mol):
    """Apply ``sanitize_carboxyl_Catom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_carboxyl_Catom(atom)
    return mol


def sanitize_guanidine_Catom(C_atom, verbose=True):
    """Fix a guanidyl group in place: two C-N singles plus one C=N(+1)."""
    from rdkit import Chem

    if C_atom.GetSymbol() == "C":
        terminal_NR2s = get_terminal_NR2s(C_atom)
        mol = C_atom.GetOwningMol()
        if len(terminal_NR2s) == 3:
            if verbose:
                print("Guanidyl group detected, sanitizing it...")
            # set two C-N and one C=N+
            bond1 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[0].GetIdx())
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_NR2s[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[1].GetIdx())
            bond2.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_NR2s[1].SetFormalCharge(0)

            bond3 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[2].GetIdx())
            bond3.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_NR2s[2].SetFormalCharge(1)


def sanitize_guanidine(mol):
    """Apply ``sanitize_guanidine_Catom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_guanidine_Catom(atom)
    return mol


def sanitize_nitro_Natom(N_atom, verbose=True):
    """Fix a nitro group in place: one N-O(-1) plus one N=O."""
    from rdkit import Chem

    if N_atom.GetSymbol() == "N":
        terminal_oxygens = get_terminal_oxygens(N_atom)
        mol = N_atom.GetOwningMol()
        if len(terminal_oxygens) == 2:
            if verbose:
                print("Nitro group detected, sanitizing it...")
            # set one N-O and one N=O
            bond1 = mol.GetBondBetweenAtoms(
                N_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(
                N_atom.GetIdx(), terminal_oxygens[1].GetIdx()
            )
            bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_oxygens[1].SetFormalCharge(0)


def sanitize_nitro(mol):
    """Apply ``sanitize_nitro_Natom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_nitro_Natom(atom)
    return mol


def is_terminal_nitrogen(N_atom):
    """True for a nitrogen with exactly one neighbor."""
    if N_atom.GetSymbol() == "N" and len(N_atom.GetNeighbors()) == 1:
        return True
    else:
        return False


def sanitize_nitrine_Natom(atom, verbose=True):
    """Fix an azide-like (X-N=N=N) group around the central nitrogen in
    place: both N=N bonds become DOUBLE, terminal N gets -1, center +1."""
    from rdkit import Chem

    if atom.GetSymbol() == "N" and len(atom.GetNeighbors()) == 2:
        mol = atom.GetOwningMol()
        nei1, nei2 = atom.GetNeighbors()[0], atom.GetNeighbors()[1]
        if nei1.GetSymbol() == "N" and nei2.GetSymbol() == "N":
            if is_terminal_nitrogen(nei1):
                N_terminal = nei1
                N_non_terminal = nei2
            elif is_terminal_nitrogen(nei2):
                N_terminal = nei2
                N_non_terminal = nei1
            else:
                N_terminal = None
                N_non_terminal = None
            if (N_terminal is not None) and (N_non_terminal is not None):
                # set X-N=[N+]=[N-]
                if verbose:
                    print("Detecting nitrine group, fixing it...")
                bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_terminal.GetIdx())
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
                N_terminal.SetFormalCharge(-1)

                bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_non_terminal.GetIdx())
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
                atom.SetFormalCharge(1)


def contain_hetero_aromatic(mol):
    """True when any aromatic atom is not a carbon."""
    flag = False
    for atom in mol.GetAtoms():
        if atom.GetSymbol() != "C" and atom.GetIsAromatic():
            flag = True
            break
    return flag


# for carbon with explicit valence > 4
def regularize_carbon_bond_order(atom, verbose=True):
    """Demote all but one double bond on an over-valent carbon to singles."""
    from rdkit import Chem

    if atom.GetSymbol() == "C" and get_explicit_valence(atom) > 4:
        if verbose:
            print("Detecting carbon with explicit valence > 4, fixing it...")
        mol = atom.GetOwningMol()
        double_bond_idx = -1
        # keep the first double bond found, single-ize the rest
        for nei in atom.GetNeighbors():
            bond = mol.GetBondBetweenAtoms(atom.GetIdx(), nei.GetIdx())
            if bond.GetBondTypeAsDouble() == 2:
                double_bond_idx = bond.GetIdx()
                break
        if double_bond_idx != -1:
            for bond in atom.GetBonds():
                if bond.GetIdx() != double_bond_idx:
                    bond.SetBondType(Chem.rdchem.BondType.SINGLE)


# for nitrogen with explicit valence > 4
def regularize_nitrogen_bond_order(atom, verbose=True):
    """Convert N=O bonds on an over-valent nitrogen to N-O(-1) singles."""
    from rdkit import Chem

    mol = atom.GetOwningMol()
    if atom.GetSymbol() == "N" and get_explicit_valence(atom) > 4:
        O_atoms = get_terminal_oxygens(atom)
        for O_atom in O_atoms:
            bond = mol.GetBondBetweenAtoms(atom.GetIdx(), O_atom.GetIdx())
            if bond.GetBondTypeAsDouble() == 2:
                bond.SetBondType(Chem.rdchem.BondType.SINGLE)
                O_atom.SetFormalCharge(-1)


def sanitize_mol(mol, verbose=False):
    """Run every functional-group fixer over every atom, in a fixed order,
    mutating ``mol`` in place; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_carboxyl_Catom(atom, verbose)
        sanitize_guanidine_Catom(atom, verbose)
        sanitize_phosphate_Patom(atom, verbose)
        sanitize_sulfate_Satom(atom, verbose)
        sanitize_nitro_Natom(atom, verbose)
        sanitize_nitrine_Natom(atom, verbose)
        regularize_carbon_bond_order(atom, verbose)
        regularize_nitrogen_bond_order(atom, verbose)
    return mol
# copy from FEprep
def mol_edit_log(mol, i, j):
    """Append the edited bond (i_j) to the mol's 'edit' property log."""
    if not mol.HasProp("edit"):
        mol.SetProp("edit", "%d_%d" % (i, j))  # noqa: UP031
    else:
        edited = mol.GetProp("edit")
        mol.SetProp("edit", edited + ",%d_%d" % (i, j))  # noqa: UP031


def kekulize_aromatic_heterocycles(mol_in, assign_formal_charge=True, sanitize=True):
    """Manually kekulize aromatic heterocycles that rdkit cannot sanitize.

    Walks each hetero-aromatic ring and rewrites AROMATIC bonds into
    alternating SINGLE/DOUBLE bonds, starting from existing double bonds,
    ring-fusion bonds and hetero atoms (in that order); all-carbon aromatic
    rings are restored to AROMATIC afterwards.  Returns the edited mol (or
    ``mol_in`` unchanged when no heterocycle is present).

    Raises
    ------
    RuntimeError
        When ``sanitize`` is True and the kekulized mol still fails
        ``Chem.SanitizeMol``.
    """
    from rdkit import Chem
    from rdkit.Chem.rdchem import BondType

    mol = Chem.RWMol(mol_in)
    rings = Chem.rdmolops.GetSymmSSSR(mol)
    rings = [list(i) for i in list(rings)]
    rings.sort(key=lambda r: len(r))

    def search_and_assign_ring(
        mol, ring, hetero, start, forward=True, start_switch=True
    ):
        # Walk the ring from ``start`` assigning alternating SINGLE/DOUBLE
        # to still-AROMATIC bonds; stop at a non-aromatic bond or at a
        # hetero atom about to receive a double bond.
        # NOTE(review): this returns (n_edit, n_double) but every caller
        # unpacks it as ``d, e`` and subtracts ``d`` from n_targetDouble and
        # ``e`` from n_targetEdit — the two counters appear swapped; confirm
        # against upstream before relying on the target bookkeeping.
        j = start
        switch = start_switch
        lring = len(ring)
        delta = 1 if forward else -1
        n_edit = 0
        n_double = 0
        while not ((j in hetero) & (not switch)):
            btype = BondType.SINGLE if switch else BondType.DOUBLE
            bond = mol.GetBondBetweenAtoms(ring[j], ring[(j + delta) % lring])
            if bond.GetBondType() == BondType.AROMATIC:
                bond.SetBondType(btype)
                mol_edit_log(mol, ring[j], ring[(j + delta) % lring])
                if btype == BondType.DOUBLE:
                    n_double += 1
                n_edit += 1
            else:
                break
            j = (j + delta) % lring
            switch = not switch
        return n_edit, n_double

    def print_bondtypes(mol, rings):
        # debug dump of every ring's atoms and bond types
        for ring in rings:
            lring = len(ring)
            btype = []
            for i in range(lring):
                btype.append(
                    mol.GetBondBetweenAtoms(
                        ring[i], ring[(i + 1) % lring]
                    ).GetBondType()
                )
            atoms = [mol.GetAtomWithIdx(i).GetSymbol() for i in ring]
            print(ring)
            print(atoms)
            print(btype)

    def hetero_priority(idx, mol):
        # Lower value = handled earlier: divalent O/S (0), trivalent
        # N/P/As/B (1), divalent N/P/As/B (2).
        # NOTE(review): implicitly returns None for any other combination,
        # which would break the sort below — presumably unreachable for
        # aromatic ring atoms; confirm.
        atom = mol.GetAtomWithIdx(idx)
        sym = atom.GetSymbol()
        valence = len(atom.GetBonds())

        if (sym in ["O", "S"]) & (valence == 2):
            return 0
        elif sym in ["N", "P", "As", "B"]:
            if valence == 3:
                return 1
            elif valence == 2:
                return 2

    # save carbon/hetero aromatic rings
    CAr = []
    HAr = []
    for ring in rings:
        lring = len(ring)
        bAllAr = True
        bAllC = True
        for i in range(lring):
            atom = mol.GetAtomWithIdx(ring[i])
            if atom.GetSymbol() != "C":
                bAllC = False

            bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])
            if bond.GetBondType() != BondType.AROMATIC:
                bAllAr = False
        if bAllAr and bAllC:
            CAr.append(ring)
        elif bAllAr and not bAllC:
            HAr.append(ring)

    if len(HAr) == 0:
        # no heterocycles: nothing to do
        return mol_in
    else:
        # edit heterocycles
        for ring in HAr:
            lring = len(ring)
            cring = len(CAr)
            hetero = []          # ring-local indices of hetero atoms
            hasDouble = []       # ring-local indices already carrying a double bond
            fuseCAr = []         # per-position index of the fused carbon ring, or -1
            fuseDouble = []      # ring-local indices whose backward bond is a fusion bond
            for i in range(lring):
                fuseCAr.append(-1)
                for j in range(cring):
                    if ring[i] in CAr[j]:
                        fuseCAr[i] = j
                        break
                if i > 1:
                    if (fuseCAr[i] == fuseCAr[i - 1]) & (fuseCAr[i] >= 0):
                        fuseDouble.append(i)
                atom = mol.GetAtomWithIdx(ring[i])
                if atom.GetSymbol() != "C":
                    hetero.append(i)
                atom_bonds = atom.GetBonds()
                btype = [bond.GetBondType() for bond in atom_bonds]
                if BondType.DOUBLE in btype:
                    hasDouble.append(i)
                bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])

            # wrap-around fusion between last and first ring position
            if (fuseCAr[0] == fuseCAr[lring - 1]) & (fuseCAr[0] >= 0):
                fuseDouble.append(0)

            if (len(hetero) > 0) | (len(hasDouble) > 0):
                n_targetDouble = lring // 2
                n_targetEdit = lring
                hetero_prior = {i: hetero_priority(ring[i], mol) for i in hetero}
                hetero.sort(key=lambda i: hetero_prior[i])
                # 1) propagate from pre-existing double bonds
                for i in hasDouble:
                    d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True)
                    d2, e2 = search_and_assign_ring(mol, ring, hetero, i, forward=False)
                    n_targetDouble -= d1 + d2 + 1
                    n_targetEdit -= e1 + e2
                # 2) force fusion bonds to DOUBLE and propagate outwards
                for i in fuseDouble:
                    bond = mol.GetBondBetweenAtoms(ring[i], ring[(i - 1) % lring])
                    if bond.GetBondType() == BondType.AROMATIC:
                        bond.SetBondType(BondType.DOUBLE)
                        mol_edit_log(mol, ring[i], ring[(i - 1) % lring])
                    d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True)
                    d2, e2 = search_and_assign_ring(
                        mol, ring, hetero, (i - 1) % lring, forward=False
                    )
                    n_targetDouble -= d1 + d2 + 1
                    n_targetEdit -= e1 + e2 + 1
                # 3) finish from hetero atoms, respecting already-fixed bonds
                for i in hetero:
                    atom = mol.GetAtomWithIdx(ring[i])
                    if (hetero_prior[i] == 2) | (n_targetDouble * 2 >= n_targetEdit):
                        forward_btype = mol.GetBondBetweenAtoms(
                            ring[i], ring[(i + 1) % lring]
                        ).GetBondType()
                        backward_btype = mol.GetBondBetweenAtoms(
                            ring[i], ring[(i - 1) % lring]
                        ).GetBondType()
                        if forward_btype != BondType.AROMATIC:
                            switch = forward_btype == BondType.DOUBLE
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=False, start_switch=switch
                            )
                            d2 = e2 = 0
                        elif backward_btype != BondType.AROMATIC:
                            switch = backward_btype == BondType.DOUBLE
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=True, start_switch=switch
                            )
                            d2 = e2 = 0
                        else:
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=True, start_switch=True
                            )
                            d2, e2 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=False, start_switch=False
                            )
                        n_targetDouble -= d1 + d2
                        n_targetEdit -= e1 + e2
                    else:
                        d1, e1 = search_and_assign_ring(
                            mol, ring, hetero, i, forward=True, start_switch=True
                        )
                        d2, e2 = search_and_assign_ring(
                            mol, ring, hetero, i, forward=False, start_switch=True
                        )
                        n_targetDouble -= d1 + d2
                        n_targetEdit -= e1 + e2

        # restore pure-carbon aromatic rings
        for ring in CAr:
            lring = len(ring)
            for i in range(lring):
                bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])
                bond.SetBondType(BondType.AROMATIC)
        print("Manual kekulization for aromatic heterocycles:")
        print_bondtypes(mol, rings)

        # rebuild atoms to drop stale aromaticity/valence caches
        atoms = mol.GetAtoms()
        for i in range(len(atoms)):
            mol.ReplaceAtom(i, Chem.Atom(atoms[i].GetSymbol()))
        mol_edited = mol.GetMol()
        # charge assignment
        if assign_formal_charge:
            mol_edited = regularize_formal_charges(mol_edited, sanitize=False)
        if not sanitize:
            return mol_edited
        else:
            try:
                Chem.SanitizeMol(mol_edited)
                return mol_edited
            except Exception as e:
                raise RuntimeError(
                    f"Manual kekulization for aromatic heterocycles failed, below are errors:\n\t{e}"
                )


def convert_by_obabel(
    mol, cache_dir=os.path.join(os.getcwd(), ".cache"), obabel_path="obabel"
):
    """Round-trip ``mol`` through openbabel via .mol files in ``cache_dir``.

    NOTE(review): the ``cache_dir`` default is evaluated once at import
    time with ``os.getcwd()``; processes that chdir later still cache in
    the original working directory.  ``obabel_path`` is accepted but the
    python openbabel bindings are used instead of the CLI.
    """
    from openbabel import openbabel
    from rdkit import Chem

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    if mol.HasProp("_Name"):
        name = mol.GetProp("_Name")
    else:
        name = f"mol{int(time.time())}"
    mol_file_in = os.path.join(cache_dir, f"{name}.mol")
    mol_file_out = os.path.join(cache_dir, f"{name}_obabel.mol")
    Chem.MolToMolFile(mol, mol_file_in, kekulize=False)
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("mol", "mol")
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, mol_file_in)
    obConversion.WriteFile(mol, mol_file_out)
    mol_obabel = Chem.MolFromMolFile(mol_file_out, removeHs=False, sanitize=False)
    return mol_obabel


def super_sanitize_mol(mol, name=None, verbose=True):
    """Two-stage sanitization: the in-house fixers first, then an
    openbabel round-trip plus manual kekulization as a fallback.

    Returns the sanitized mol, or None when both stages fail.
    """
    from rdkit import Chem

    if name is None:
        if mol.HasProp("_Name"):
            name = mol.GetProp("_Name")
        else:
            name = "mol"
    try:
        if verbose:
            print("=====Stage 1: use Hermite procedure=====")
        # use our procedure
        mol = sanitize_mol(mol, verbose)
        mol = regularize_formal_charges(mol, sanitize=False)
        # sanitize a copy so the un-sanitized mol survives for stage 2
        mol_copy = deepcopy(mol)
        Chem.SanitizeMol(mol_copy)
        if verbose:
            print(name, "Success.")
        return mol_copy
    except Exception as e:  # noqa: F841  # NOTE(review): stage-1 error is discarded
        try:
            if verbose:
                print(
                    "Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel"
                )
                print("=====Stage 2: re-try with obabel=====")
            mol = convert_by_obabel(mol)
            mol = sanitize_mol(mol, verbose)
            mol = kekulize_aromatic_heterocycles(
                mol, assign_formal_charge=False, sanitize=False
            )  # aromatic heterocycles
            mol = regularize_formal_charges(mol, sanitize=False)
            mol_copy = deepcopy(mol)
            Chem.SanitizeMol(mol_copy)
            if verbose:
                print(name, "Success.")
            return mol_copy
        except Exception as e:
            if verbose:
                print(e)
                print(name, "Failed!")
            return None
__init__(self, level="medium", raise_errors=True, verbose=False): + """Set up sanitizer. + --------. + + Parameters + ---------- + level : 'low', 'medium' or 'high'. + `low` - use rdkit.Chem.SanitizeMol() to sanitize + `medium` - before using rdkit, assign formal charges of each atom first, which requires + the rightness of bond order information + `high` - try to regularize bond order of nitro, phosphate, sulfate, nitrine, guanidine, + pyridine-oxide function groups and aromatic heterocycles. If failed, the program + will call obabel to pre-process the mol object and re-try the procedure. + raise_errors : bool, default=True + If True, raise SanitizeError when failed. + verbose : bool, default=False + If True, print error information when failed. + """ + self._check_level(level) + self.level = level + self.raise_errors = raise_errors + self.verbose = verbose + + def _check_level(self, level): + if level not in ["low", "medium", "high"]: + raise ValueError( + f"Invalid level '{level}', please set to 'low', 'medium' or 'high'" + ) + + def _handle_exception(self, error_info): + if self.raise_errors: + raise SanitizeError(error_info) + elif self.verbose: + print(error_info) + + def sanitize(self, mol): + """Sanitize mol according to `self.level`. If failed, return None.""" + from rdkit import Chem + + if self.level == "low": + try: + Chem.SanitizeMol(mol) + return mol + except Exception as e: + error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'medium' or 'high'. The error occurs:\n\t{e}" + self._handle_exception(error_info) + return None + elif self.level == "medium": + try: + mol = regularize_formal_charges(mol, sanitize=False) + Chem.SanitizeMol(mol) + return mol + except Exception as e: + error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'high'. 
The error occurs:\n\t{e}" + self._handle_exception(error_info) + return None + elif self.level == "high": + mol = super_sanitize_mol(mol, verbose=self.verbose) + error_info = "Sanitization Failed. Please check your molecule file." + if mol is None: + self._handle_exception(error_info) + return mol + + +class SanitizeError(Exception): + def __init__(self, content="Sanitization Failed."): + self.content = content + + def __str__(self): + return self.content + + def __repr__(self): + return self.__str__() diff --git a/dpdata/formats/rdkit/utils.py b/dpdata/formats/rdkit/utils.py new file mode 100644 index 000000000..efeef6070 --- /dev/null +++ b/dpdata/formats/rdkit/utils.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import numpy as np + + +def mol_to_system_data(mol): + from rdkit import Chem + + if not isinstance(mol, Chem.rdchem.Mol): + raise TypeError(f"rdkit.Chem.Mol required, not {type(mol)}") + + num_confs = mol.GetNumConformers() + if num_confs: + atom_symbols = [at.GetSymbol() for at in mol.GetAtoms()] + atom_names, atom_types, atom_numbs = np.unique( + atom_symbols, return_inverse=True, return_counts=True + ) + coords = np.array([conf.GetPositions() for conf in mol.GetConformers()]) + bonds = np.array( + [ + [ + bond.GetBeginAtomIdx(), + bond.GetEndAtomIdx(), + bond.GetBondTypeAsDouble(), + ] + for bond in mol.GetBonds() + ] + ) + formal_charges = np.array( + [at.GetFormalCharge() for at in mol.GetAtoms()], dtype=np.int32 + ) + data = {} + data["atom_numbs"] = list(atom_numbs) + data["atom_names"] = list(atom_names) + data["atom_types"] = atom_types + data["cells"] = np.array( + [ + [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] + for _ in range(num_confs) + ] + ) + data["coords"] = coords + data["bonds"] = bonds + data["formal_charges"] = formal_charges + data["orig"] = np.array([0.0, 0.0, 0.0]) + # other properties + if mol.HasProp("_Name"): + data["_name"] = mol.GetProp("_Name") + return data + else: + raise ValueError("The 
moleclue does not contain 3-D conformers") + + +def system_data_to_mol(data): + from rdkit import Chem + + mol_ed = Chem.RWMol() + atom_symbols = [data["atom_names"][i] for i in data["atom_types"]] + # add atoms + for atom_type in data["atom_types"]: + symbol = data["atom_names"][atom_type] + atom = Chem.Atom(symbol) + mol_ed.AddAtom(atom) + # add bonds + for bond_info in data["bonds"]: + if bond_info[2] == 1: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.SINGLE) + elif bond_info[2] == 2: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.DOUBLE) + elif bond_info[2] == 3: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.TRIPLE) + elif bond_info[2] == 1.5: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.AROMATIC) + # set conformers + for frame_idx in range(data["coords"].shape[0]): + conf = Chem.rdchem.Conformer(len(data["atom_types"])) + for atom_idx in range(len(data["atom_types"])): + conf.SetAtomPosition(atom_idx, data["coords"][frame_idx][atom_idx]) + mol_ed.AddConformer(conf, assignId=True) + mol = mol_ed.GetMol() + # set formal charges + for idx, atom in enumerate(mol.GetAtoms()): + atom.SetFormalCharge(int(data["formal_charges"][idx])) + # set mol name + if "_name" in list(data.keys()): + mol.SetProp("_Name", data["_name"]) + # sanitize + Chem.SanitizeMol(mol_ed) + return mol + + +def check_same_atom(atom_1, atom_2): + if atom_1.GetIdx() != atom_2.GetIdx(): + return False + elif atom_1.GetSymbol() != atom_2.GetSymbol(): + return False + else: + return True + + +def check_same_molecule(mol_1, mol_2): + flag = True + for bond_1, bond_2 in zip(mol_1.GetBonds(), mol_2.GetBonds()): + begin_atom_1, end_atom_1 = bond_1.GetBeginAtom(), bond_1.GetEndAtom() + begin_atom_2, end_atom_2 = bond_2.GetBeginAtom(), bond_2.GetEndAtom() + if not check_same_atom(begin_atom_1, begin_atom_2): + flag = False + break + elif not check_same_atom(end_atom_1, end_atom_2): + flag = False + break + return 
flag + + +def check_molecule_list(mols): + flag = True + for mol in mols[1:]: + if not check_same_molecule(mol, mols[0]): + flag = False + break + return flag + + +def combine_molecules(mols): + if check_molecule_list(mols): + for mol in mols[1:]: + for conf in mol.GetConformers(): + mols[0].AddConformer(conf, assignId=True) + return mols[0] + else: + raise ValueError("molecules are not of the same topology.") diff --git a/dpdata/formats/siesta/__init__.py b/dpdata/formats/siesta/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/siesta/aiMD_output.py b/dpdata/formats/siesta/aiMD_output.py new file mode 100644 index 000000000..daa4f6a25 --- /dev/null +++ b/dpdata/formats/siesta/aiMD_output.py @@ -0,0 +1,187 @@ +# !/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +ev2ev = 1 +ang2ang = 1 + + +#############################read output##################################### +def get_single_line_tail(fin, keyword, num=1): + file = open(fin) + part_res = [] + for value in file: + if keyword in value: + temp = len(value.split()) - num + part_res.append(float(value.split()[temp])) + + file.close() + return part_res + + +## atomnum: number of atoms, row numbers +## begin_column: begin column num +## read_column_num: read column num +## column_num: the column number in nxet reading line +def extract_keyword( + fout, + keyword, + down_line_num, + begin_column, + read_column_num, + is_repeated_read, + column_num, +): + file = open(fout) + ret = [] + part_ret = [] + flag = 0 + idx = 0 + extr_frame = 0 + length = obtain_nframe(fout) + # for (num,value) in enumerate(file): + for value in file: + if keyword in value: + flag = 1 + continue + if flag == 1: + if idx < down_line_num: + idx += 1 + else: + flag = 0 + part_ret.append(np.array(ret)) + ret = [] + extr_frame += 1 + if extr_frame == length: + file.close() + return part_ret + ## is_repeated_read = 0: only read 1 time for SCF + ## is_repeated_read = 1: read all for 
aiMD --> get all frames + if is_repeated_read: + idx = 0 + continue + + for i in range(begin_column, read_column_num): + if len(value.split()) == column_num: + if not value.split()[i].isalpha(): + ret.append(float(value.strip().split()[i])) + else: + ret.append(value.strip().split()[i]) + continue + file.close() + return part_ret + + +def obtain_nframe(fname): + fp = open(fname) + flag = False + idx = 0 + temp = 0 + for ii in fp: + if "siesta: Stress tensor (static) (eV/Ang**3):" in ii: + flag = True + continue + if flag: + if "siesta: Pressure (static):" not in ii: + if len(ii.split()) == 3: + temp += 1 + if temp == 3: + idx += 1 + # print(idx) + flag = False + temp = 0 + fp.close() + return idx + + +def get_atom_types(fout, atomnums): + covert_type = extract_keyword( + fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4, 0, 6 + )[0] + atomtype = [] + # print(covert_type) + for i in range(0, len(covert_type)): + atomtype.append(int(covert_type[i]) - 1) + return atomtype + + +def get_atom_name(fout): + file = open(fout) + ret = [] + for value in file: + if "Species number:" in value: + for j in range(len(value.split())): + if value.split()[j] == "Label:": + ret.append(value.split()[j + 1]) + break + file.close() + return ret + + +def get_atom_numbs(atomtypes): + atom_numbs = [] + for i in set(atomtypes): + atom_numbs.append(atomtypes.count(i)) + return atom_numbs + + +def get_virial(fout, cell): + viri = extract_keyword( + fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 0, 3, 1, 3 + ) + vols = [] + length = obtain_nframe(fout) + for ii in range(length): + vols.append(np.linalg.det(cell[ii].reshape([3, 3]))) + for jj in range(len(viri[ii])): + ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa + # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) + viri[ii][jj] *= vols[ii] + return viri + + +def covert_dimension(arr, num): + arr = np.array(arr) + frames = len(arr) + ret = np.zeros((frames, num, 3)) + for i in range(frames): + ret[i] = 
arr[i].reshape(num, 3) + return ret + + +def get_aiMD_frame(fname): + NumberOfSpecies = int( + get_single_line_tail(fname, "redata: Number of Atomic Species")[0] + ) + atom_names = get_atom_name(fname) + tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) + + atom_types = get_atom_types(fname, tot_natoms) + atom_numbs = get_atom_numbs(atom_types) + assert max(atom_types) + 1 == NumberOfSpecies + + cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3, 1, 3) + coord = extract_keyword( + fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3, 1, 6 + ) + energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") + force = extract_keyword( + fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4, 1, 4 + ) + virial = get_virial(fname, cell) + + cells = covert_dimension(np.array(cell), 3) + coords = covert_dimension(np.array(coord), tot_natoms) + forces = covert_dimension(np.array(force), tot_natoms) + virials = covert_dimension(np.array(virial), 3) + return ( + atom_names, + atom_numbs, + np.array(atom_types), + cells, + coords, + np.array(energy), + forces, + virials, + ) diff --git a/dpdata/formats/siesta/output.py b/dpdata/formats/siesta/output.py new file mode 100644 index 000000000..0c944d5b5 --- /dev/null +++ b/dpdata/formats/siesta/output.py @@ -0,0 +1,142 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +ev2ev = 1 +ang2ang = 1 + + +#############################read output##################################### +def get_single_line_tail(fin, keyword, num=1): + file = open(fin) + res = [] + for value in file: + if keyword in value: + temp = len(value.split()) - num + res.append(float(value.split()[temp])) + file.close() + return res + return res + + +## atomnum: number of atoms, row numbers +## begin_column: begin column num +## column_num: read column num +def extract_keyword(fout, keyword, down_line_num, begin_column, column_num): + file = open(fout) + ret = [] + flag = 0 + idx = 
0 + # for (num,value) in enumerate(file): + for value in file: + if keyword in value: + flag = 1 + continue + if flag == 1: + if idx < down_line_num: + idx += 1 + else: + flag = 0 + continue + if len(value.split()) >= column_num: + for i in range(begin_column, column_num): + if not value.split()[i].isalpha(): + ret.append(float(value.strip().split()[i])) + else: + ret.append(value.strip().split()[i]) + ## compatible siesta-4.0.2 and siesta-4.1-b4 + else: + flag = 0 + idx = 0 + file.close() + return ret + + +def get_atom_types(fout, atomnums): + covert_type = extract_keyword( + fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4 + ) + atomtype = [] + for i in range(0, len(covert_type)): + atomtype.append(int(covert_type[i]) - 1) + return atomtype + + +def get_atom_name(fout): + file = open(fout) + ret = [] + for value in file: + if "Species number:" in value: + for j in range(len(value.split())): + if value.split()[j] == "Label:": + ret.append(value.split()[j + 1]) + break + file.close() + return ret + + +def get_atom_numbs(atomtypes): + atom_numbs = [] + for i in set(atomtypes): + atom_numbs.append(atomtypes.count(i)) + return atom_numbs + + +def get_virial(fout, cells): + vols = [] + for ii in cells: + ### calucate vol + vols.append(np.linalg.det(ii.reshape([3, 3]))) + ret = extract_keyword(fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 1, 4) + ret = np.array([ret]) + for idx, ii in enumerate(ret): + ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa + # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) + ii *= vols[idx] + return ret + + +def obtain_frame(fname): + NumberOfSpecies = int( + get_single_line_tail(fname, "redata: Number of Atomic Species")[0] + ) + atom_names = get_atom_name(fname) + tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) + atom_types = get_atom_types(fname, tot_natoms) + atom_numbs = get_atom_numbs(atom_types) + assert max(atom_types) + 1 == NumberOfSpecies + cell = extract_keyword(fname, 
"outcell: Unit cell vectors (Ang):", 3, 0, 3) + coord = extract_keyword( + fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3 + ) + energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") + force = extract_keyword(fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4) + virial = get_virial(fname, np.array([cell])) + + cell = np.array(cell).reshape(3, 3) + coord = np.array(coord).reshape(tot_natoms, 3) + force = np.array(force).reshape(tot_natoms, 3) + virial = np.array(virial).reshape(3, 3) + + # data = {} + # data['orig'] = np.array([0, 0, 0]) + # data['atom_names'] = atom_names + # data['atom_numbs'] = atom_numbs + # data['atom_types'] = np.array(atom_types) + # data['cells'] = np.array([cell]) + # data['coords'] = np.array([coord]) + # data['energies'] = np.array([energy]) + # data['forces'] = np.array([force]) + # data['virials'] = virial + # return data + return ( + atom_names, + atom_numbs, + np.array(atom_types), + np.array([cell]), + np.array([coord]), + np.array(energy), + np.array([force]), + np.array([virial]), + ) diff --git a/dpdata/formats/vasp/__init__.py b/dpdata/formats/vasp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/vasp/outcar.py b/dpdata/formats/vasp/outcar.py new file mode 100644 index 000000000..a16fd6f9f --- /dev/null +++ b/dpdata/formats/vasp/outcar.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import re +import warnings + +import numpy as np + + +def atom_name_from_potcar_string(instr: str) -> str: + """Get atom name from a potcar element name. + + e.g. 
Sn_d -> Sn + + Parameters + ---------- + instr : str + input potcar element name + + Returns + ------- + name: str + name of atoms + """ + if "_" in instr: + # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 + return instr.split("_")[0] + else: + return instr + + +def system_info( + lines: list[str], + type_idx_zero: bool = False, +) -> tuple[list[str], list[int], np.ndarray, int | None, int | None]: + """Get system information from lines of an OUTCAR file. + + Parameters + ---------- + lines : list[str] + the lines of the OUTCAR file + type_idx_zero : bool + if true atom types starts from 0 otherwise from 1. + + Returns + ------- + atom_names: list[str] + name of atoms + atom_numbs: list[int] + number of atoms that have a certain name. same length as atom_names + atom_types: np.ndarray + type of each atom, the array has same length as number of atoms + nelm: optional[int] + the value of NELM parameter + nwrite: optional[int] + the value of NWRITE parameter + """ + atom_names = [] + atom_names_potcar = [] + atom_numbs = None + nelm = None + nwrite = None + for ii in lines: + if "TITEL" in ii: + # get atom names from POTCAR info, tested only for PAW_PBE ... + # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 + _ii = ii.split()[3] + atom_names.append(atom_name_from_potcar_string(_ii)) + elif "POTCAR:" in ii: + # get atom names from POTCAR info, tested only for PAW_PBE ... 
+ # for case like : POTCAR: PAW_PBE Ti 08Apr2002 + _ii = ii.split()[2] + atom_names_potcar.append(atom_name_from_potcar_string(_ii)) + # a stricker check for "NELM"; compatible with distingct formats in different versions(6 and older, newers_expect-to-work) of vasp + elif nelm is None: + m = re.search(r"NELM\s*=\s*(\d+)", ii) + if m: + nelm = int(m.group(1)) + elif nwrite is None: + m = re.search(r"NWRITE\s*=\s*(\d+)", ii) + if m: + nwrite = int(m.group(1)) + if "ions per type" in ii: + atom_numbs_ = [int(s) for s in ii.split()[4:]] + if atom_numbs is None: + atom_numbs = atom_numbs_ + else: + assert atom_numbs == atom_numbs_, "in consistent numb atoms in OUTCAR" + if len(atom_names) == 0: + # try to use atom_names_potcar + if len(atom_names_potcar) == 0: + raise ValueError("cannot get atom names from potcar") + nnames = len(atom_names_potcar) + # the names are repeated. check if it is the case + assert atom_names_potcar[: nnames // 2] == atom_names_potcar[nnames // 2 :] + atom_names = atom_names_potcar[: nnames // 2] + assert nelm is not None, "cannot find maximum steps for each SC iteration" + assert atom_numbs is not None, "cannot find ion type info in OUTCAR" + if len(atom_numbs) != len(atom_names): + raise RuntimeError( + f"The number of the atom numbers per each type ({len(atom_numbs)}) " + f"does not match that of the atom types ({len(atom_names)}) detected " + f"from the OUTCAR. This issue may be cause by a bug in vasp <= 6.3. " + f"Please try to convert data from vasprun.xml instead." 
+ ) + atom_names = atom_names[: len(atom_numbs)] + atom_types = [] + for idx, ii in enumerate(atom_numbs): + for jj in range(ii): + if type_idx_zero: + atom_types.append(idx) + else: + atom_types.append(idx + 1) + return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm, nwrite + + +def get_outcar_block(fp, ml=False): + blk = [] + energy_token = ["free energy TOTEN", "free energy ML TOTEN"] + ml_index = int(ml) + for ii in fp: + if not ii: + return blk + blk.append(ii.rstrip("\n")) + if energy_token[ml_index] in ii: + return blk + return blk + + +def check_outputs(coord, cell, force): + if len(force) == 0: + raise ValueError("cannot find forces in OUTCAR block") + if len(coord) == 0: + raise ValueError("cannot find coordinates in OUTCAR block") + if len(cell) == 0: + raise ValueError("cannot find cell in OUTCAR block") + return True + + +# we assume that the force is printed ... +def get_frames(fname, begin=0, step=1, ml=False, convergence_check=True): + with open(fname) as fp: + return _get_frames_lower( + fp, + fname, + begin=begin, + step=step, + ml=ml, + convergence_check=convergence_check, + ) + + +def _get_frames_lower(fp, fname, begin=0, step=1, ml=False, convergence_check=True): + blk = get_outcar_block(fp) + + atom_names, atom_numbs, atom_types, nelm, nwrite = system_info( + blk, type_idx_zero=True + ) + ntot = sum(atom_numbs) + + all_coords = [] + all_cells = [] + all_energies = [] + all_forces = [] + all_virials = [] + + cc = 0 + rec_failed = [] + while len(blk) > 0: + if cc >= begin and (cc - begin) % step == 0: + coord, cell, energy, force, virial, is_converge = analyze_block( + blk, ntot, nelm, ml + ) + if energy is None: + break + if nwrite == 0: + has_label = len(force) > 0 and len(coord) > 0 and len(cell) > 0 + if not has_label: + warnings.warn("cannot find labels in the frame, ingore") + else: + has_label = check_outputs(coord, cell, force) + if (is_converge or not convergence_check) and has_label: + all_coords.append(coord) + 
all_cells.append(cell) + all_energies.append(energy) + all_forces.append(force) + if virial is not None: + all_virials.append(virial) + if not is_converge: + rec_failed.append(cc + 1) + + blk = get_outcar_block(fp, ml) + cc += 1 + + if len(rec_failed) > 0: + prt = ( + "so they are not collected." + if convergence_check + else "but they are still collected due to the requirement for ignoring convergence checks." + ) + warnings.warn( + f"The following structures were unconverged: {rec_failed}; " + prt + ) + + if len(all_virials) == 0: + all_virials = None + else: + all_virials = np.array(all_virials) + return ( + atom_names, + atom_numbs, + atom_types, + np.array(all_cells), + np.array(all_coords), + np.array(all_energies), + np.array(all_forces), + all_virials, + ) + + +def analyze_block(lines, ntot, nelm, ml=False): + coord = [] + cell = [] + energy = None + force = [] + virial = None + is_converge = True + sc_index = 0 + # select different searching tokens based on the ml label + energy_token = ["free energy TOTEN", "free energy ML TOTEN"] + energy_index = [4, 5] + virial_token = ["FORCE on cell =-STRESS in cart. coord. 
units", "ML FORCE"] + virial_index = [14, 4] + cell_token = ["VOLUME and BASIS", "ML FORCE"] + cell_index = [5, 12] + ml_index = int(ml) + for idx, ii in enumerate(lines): + # if set ml == True, is_converged will always be True + if ("Iteration" in ii) and (not ml): + sc_index = int(ii.split()[3][:-1]) + if sc_index >= nelm: + is_converge = False + elif energy_token[ml_index] in ii: + energy = float(ii.split()[energy_index[ml_index]]) + return coord, cell, energy, force, virial, is_converge + elif cell_token[ml_index] in ii: + for dd in range(3): + tmp_l = lines[idx + cell_index[ml_index] + dd] + cell.append([float(ss) for ss in tmp_l.replace("-", " -").split()[0:3]]) + elif virial_token[ml_index] in ii: + in_kB_index = virial_index[ml_index] + while idx + in_kB_index < len(lines) and ( + not lines[idx + in_kB_index].split()[0:2] == ["in", "kB"] + ): + in_kB_index += 1 + assert idx + in_kB_index < len(lines), ( + 'ERROR: "in kB" is not found in OUTCAR. Unable to extract virial.' + ) + tmp_v = [float(ss) for ss in lines[idx + in_kB_index].split()[2:8]] + virial = np.zeros([3, 3]) + virial[0][0] = tmp_v[0] + virial[1][1] = tmp_v[1] + virial[2][2] = tmp_v[2] + virial[0][1] = tmp_v[3] + virial[1][0] = tmp_v[3] + virial[1][2] = tmp_v[4] + virial[2][1] = tmp_v[4] + virial[0][2] = tmp_v[5] + virial[2][0] = tmp_v[5] + elif "TOTAL-FORCE" in ii and (("ML" in ii) == ml): + for jj in range(idx + 2, idx + 2 + ntot): + tmp_l = lines[jj] + info = [float(ss) for ss in tmp_l.split()] + coord.append(info[:3]) + force.append(info[3:6]) + return coord, cell, energy, force, virial, is_converge diff --git a/dpdata/formats/vasp/poscar.py b/dpdata/formats/vasp/poscar.py new file mode 100644 index 000000000..78b8dbbeb --- /dev/null +++ b/dpdata/formats/vasp/poscar.py @@ -0,0 +1,134 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + + +def _to_system_data_lower(lines, cartesian=True, selective_dynamics=False): + def move_flag_mapper(flag): + if flag == "T": + 
return True + elif flag == "F": + return False + else: + raise RuntimeError(f"Invalid move flag: {flag}") + + """Treat as cartesian poscar.""" + system = {} + system["atom_names"] = [str(ii) for ii in lines[5].split()] + system["atom_numbs"] = [int(ii) for ii in lines[6].split()] + scale = float(lines[1]) + cell = [] + move_flags = [] + for ii in range(2, 5): + boxv = [float(jj) for jj in lines[ii].split()] + boxv = np.array(boxv) * scale + cell.append(boxv) + system["cells"] = [np.array(cell)] + natoms = sum(system["atom_numbs"]) + coord = [] + for ii in range(8, 8 + natoms): + tmp = lines[ii].split() + tmpv = [float(jj) for jj in tmp[:3]] + if cartesian: + tmpv = np.array(tmpv) * scale + else: + tmpv = np.matmul(np.array(tmpv), system["cells"][0]) + coord.append(tmpv) + if selective_dynamics: + if len(tmp) == 6: + move_flags.append(list(map(move_flag_mapper, tmp[3:]))) + else: + raise RuntimeError( + f"Invalid move flags, should be 6 columns, got {tmp}" + ) + + system["coords"] = [np.array(coord)] + system["orig"] = np.zeros(3) + atom_types = [] + for idx, ii in enumerate(system["atom_numbs"]): + for jj in range(ii): + atom_types.append(idx) + system["atom_types"] = np.array(atom_types, dtype=int) + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + if move_flags: + move_flags = np.array(move_flags, dtype=bool) + move_flags = move_flags.reshape((1, natoms, 3)) + system["move"] = np.array(move_flags, dtype=bool) + return system + + +def to_system_data(lines): + # remove the line that has 'selective dynamics' + selective_dynamics = False + if lines[7][0] == "S" or lines[7][0] == "s": + selective_dynamics = True + lines.pop(7) + is_cartesian = lines[7][0] in ["C", "c", "K", "k"] + if not is_cartesian: + if lines[7][0] not in ["d", "D"]: + raise RuntimeError( + "seem not to be a valid POSCAR of vasp 5.x, may be a POSCAR of vasp 4.x?" 
+ ) + return _to_system_data_lower(lines, is_cartesian, selective_dynamics) + + +def from_system_data(system, f_idx=0, skip_zeros=True): + ret = "" + for ii, name in zip(system["atom_numbs"], system["atom_names"]): + if ii == 0: + continue + ret += "%s%d " % (name, ii) # noqa: UP031 + ret += "\n" + ret += "1.0\n" + for ii in system["cells"][f_idx]: + for jj in ii: + ret += f"{jj:.16e} " + ret += "\n" + for idx, ii in enumerate(system["atom_names"]): + if system["atom_numbs"][idx] == 0: + continue + ret += f"{ii} " + ret += "\n" + for ii in system["atom_numbs"]: + if ii == 0: + continue + ret += "%d " % ii # noqa: UP031 + ret += "\n" + move = system.get("move", None) + if move is not None and len(move) > 0: + ret += "Selective Dynamics\n" + + # should use Cartesian for VESTA software + ret += "Cartesian\n" + atype = system["atom_types"] + posis = system["coords"][f_idx] + # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] + # sort_idx = np.argsort(atype, kind = 'mergesort') + sort_idx = np.lexsort((np.arange(len(atype)), atype)) + atype = atype[sort_idx] + posis = posis[sort_idx] + if move is not None and len(move) > 0: + move = move[f_idx][sort_idx] + + if isinstance(move, np.ndarray): + move = move.tolist() + + posi_list = [] + for idx in range(len(posis)): + ii_posi = posis[idx] + line = f"{ii_posi[0]:15.10f} {ii_posi[1]:15.10f} {ii_posi[2]:15.10f}" + if move is not None and len(move) > 0: + move_flags = move[idx] + if not isinstance(move_flags, list) or len(move_flags) != 3: + raise RuntimeError( + f"Invalid move flags: {move_flags}, should be a list of 3 bools" + ) + line += " " + " ".join("T" if flag else "F" for flag in move_flags) + + posi_list.append(line) + + posi_list.append("") + ret += "\n".join(posi_list) + return ret diff --git a/dpdata/formats/vasp/xml.py b/dpdata/formats/vasp/xml.py new file mode 100755 index 000000000..1b407c254 --- /dev/null +++ b/dpdata/formats/vasp/xml.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +from __future__ import 
annotations + +import xml.etree.ElementTree as ET +from typing import Any + +import numpy as np + + +def check_name(item, name): + assert item.attrib["name"] == name, ( + "item attrib '{}' dose not math required '{}'".format(item.attrib["name"], name) + ) + + +def get_varray(varray): + array = [] + for vv in varray.findall("v"): + array.append([float(ii) for ii in vv.text.split()]) + return np.array(array) + + +def analyze_atominfo(atominfo_xml): + check_name(atominfo_xml.find("array"), "atoms") + eles = [] + types = [] + visited = set() + for ii in atominfo_xml.find("array").find("set"): + atom_type = int(ii.findall("c")[1].text) + if atom_type not in visited: + eles.append(ii.findall("c")[0].text.strip()) + visited.add(atom_type) + types.append(atom_type) + return eles, types + + +def analyze_calculation( + cc: Any, + nelm: int | None, +) -> tuple[np.ndarray, np.ndarray, float, np.ndarray, np.ndarray | None, bool | None]: + """Analyze a calculation block. + + Parameters + ---------- + cc : xml.etree.ElementTree.Element + The xml element for a ion step calculation + nelm : Optional[int] + The number nelm, if it is not None, convergence check is performed. + + Returns + ------- + posi : np.ndarray + The positions + cell : np.ndarray + The cell + ener : float + The energy + force : np.ndarray + The forces + str : Optional[np.ndarray] + The stress + is_converged: Optional[bool] + If the scf calculation is converged. Only return boolean when + nelm is not None. Otherwise return None. 
+ + """ + structure_xml = cc.find("structure") + check_name(structure_xml.find("crystal").find("varray"), "basis") + check_name(structure_xml.find("varray"), "positions") + cell = get_varray(structure_xml.find("crystal").find("varray")) + posi = get_varray(structure_xml.find("varray")) + strs = None + is_converged = None + if nelm is not None: + niter = len(cc.findall(".//scstep")) + is_converged = niter < nelm + for vv in cc.findall("varray"): + if vv.attrib["name"] == "forces": + forc = get_varray(vv) + elif vv.attrib["name"] == "stress": + strs = get_varray(vv) + for ii in cc.find("energy").findall("i"): + if ii.attrib["name"] == "e_fr_energy": + ener = float(ii.text) + return posi, cell, ener, forc, strs, is_converged + + +def formulate_config(eles, types, posi, cell, ener, forc, strs_): + strs = strs_ / 1602 + natoms = len(types) + ntypes = len(eles) + ret = "" + ret += "#N %d %d\n" % (natoms, ntypes - 1) # noqa: UP031 + ret += "#C " + for ii in eles: + ret += " " + ii + ret += "\n" + ret += "##\n" + ret += f"#X {cell[0][0]:13.8f} {cell[0][1]:13.8f} {cell[0][2]:13.8f}\n" + ret += f"#Y {cell[1][0]:13.8f} {cell[1][1]:13.8f} {cell[1][2]:13.8f}\n" + ret += f"#Z {cell[2][0]:13.8f} {cell[2][1]:13.8f} {cell[2][2]:13.8f}\n" + ret += "#W 1.0\n" + ret += "#E %.10f\n" % (ener / natoms) + ret += f"#S {strs[0][0]:.9e} {strs[1][1]:.9e} {strs[2][2]:.9e} {strs[0][1]:.9e} {strs[1][2]:.9e} {strs[0][2]:.9e}\n" + ret += "#F\n" + for ii in range(natoms): + sp = np.matmul(cell.T, posi[ii]) + ret += "%d" % (types[ii] - 1) # noqa: UP031 + ret += f" {sp[0]:12.6f} {sp[1]:12.6f} {sp[2]:12.6f}" + ret += f" {forc[ii][0]:12.6f} {forc[ii][1]:12.6f} {forc[ii][2]:12.6f}" + ret += "\n" + return ret + + +def analyze(fname, type_idx_zero=False, begin=0, step=1, convergence_check=True): + """Deal with broken xml file.""" + all_posi = [] + all_cell = [] + all_ener = [] + all_forc = [] + all_strs = [] + cc = 0 + if convergence_check: + tree = ET.parse(fname) + root = tree.getroot() + parameters = 
root.find(".//parameters") + nelm = parameters.find(".//i[@name='NELM']") + # will check convergence + nelm = int(nelm.text) + else: + # not checking convergence + nelm = None + try: + for event, elem in ET.iterparse(fname): + if elem.tag == "atominfo": + eles, types = analyze_atominfo(elem) + types = np.array(types, dtype=int) + if type_idx_zero: + types = types - 1 + if elem.tag == "calculation": + posi, cell, ener, forc, strs, is_converged = analyze_calculation( + elem, nelm + ) + # record when not checking convergence or is_converged + # and the step criteria is satisfied + if ( + (nelm is None or is_converged) + and cc >= begin + and (cc - begin) % step == 0 + ): + all_posi.append(posi) + all_cell.append(cell) + all_ener.append(ener) + all_forc.append(forc) + if strs is not None: + all_strs.append(strs) + cc += 1 + except ET.ParseError: + return ( + eles, + types, + np.array(all_cell), + np.array(all_posi), + np.array(all_ener), + np.array(all_forc), + np.array(all_strs), + ) + return ( + eles, + types, + np.array(all_cell), + np.array(all_posi), + np.array(all_ener), + np.array(all_forc), + np.array(all_strs), + ) diff --git a/dpdata/formats/xyz/__init__.py b/dpdata/formats/xyz/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/xyz/quip_gap_xyz.py b/dpdata/formats/xyz/quip_gap_xyz.py new file mode 100644 index 000000000..71e976de6 --- /dev/null +++ b/dpdata/formats/xyz/quip_gap_xyz.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# %% +from __future__ import annotations + +import re +from collections import OrderedDict + +import numpy as np + +from dpdata.periodic_table import Element + + +class QuipGapxyzSystems: + """deal with QuipGapxyzFile.""" + + def __init__(self, file_name): + self.file_object = open(file_name) + self.block_generator = self.get_block_generator() + + def __iter__(self): + return self + + def __next__(self): + return self.handle_single_xyz_frame(next(self.block_generator)) + + def __del__(self): + 
self.file_object.close() + + def get_block_generator(self): + p3 = re.compile(r"^\s*(\d+)\s*") + while True: + line = self.file_object.readline() + if not line: + break + if p3.match(line): + atom_num = int(p3.match(line).group(1)) + lines = [] + lines.append(line) + for ii in range(atom_num + 1): + lines.append(self.file_object.readline()) + if not lines[-1]: + raise RuntimeError( + f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" + ) + yield lines + + @staticmethod + def handle_single_xyz_frame(lines): + atom_num = int(lines[0].strip("\n").strip()) + if len(lines) != atom_num + 2: + raise RuntimeError( + f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" + ) + data_format_line = lines[1].strip("\n").strip() + " " + field_value_pattern = re.compile( + r"(?P\S+)=(?P[\'\"]?)(?P.*?)(?P=quote)\s+" + ) + prop_pattern = re.compile( + r"(?P\w+?):(?P[a-zA-Z]):(?P\d+)" + ) + + data_format_list = [ + kv_dict.groupdict() + for kv_dict in field_value_pattern.finditer(data_format_line) + ] + field_dict = {} + for item in data_format_list: + field_dict[item["key"]] = item["value"] + + Properties = field_dict["Properties"] + prop_list = [ + kv_dict.groupdict() for kv_dict in prop_pattern.finditer(Properties) + ] + + data_lines = [] + for line in lines[2:]: + data_lines.append(list(filter(bool, line.strip().split()))) + data_array = np.array(data_lines) + used_colomn = 0 + + type_array = None + coords_array = None + Z_array = None + force_array = None + virials = None + for kv_dict in prop_list: + if kv_dict["key"] == "species": + if kv_dict["datatype"] != "S": + raise RuntimeError( + "datatype for species must be 'S' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + type_array = data_array[ + :, used_colomn : used_colomn + field_length + ].flatten() + used_colomn += field_length + continue + elif kv_dict["key"] == "pos": + if kv_dict["datatype"] != "R": + raise RuntimeError( + "datatype for pos 
must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + coords_array = data_array[:, used_colomn : used_colomn + field_length] + used_colomn += field_length + continue + elif kv_dict["key"] == "Z": + if kv_dict["datatype"] != "I": + raise RuntimeError( + "datatype for pos must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + Z_array = data_array[ + :, used_colomn : used_colomn + field_length + ].flatten() + used_colomn += field_length + continue + elif kv_dict["key"] == "force": + if kv_dict["datatype"] != "R": + raise RuntimeError( + "datatype for pos must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + force_array = data_array[:, used_colomn : used_colomn + field_length] + used_colomn += field_length + continue + else: + raise RuntimeError("unknown field {}".format(kv_dict["key"])) + + type_num_dict = OrderedDict() + atom_type_list = [] + type_map = {} + temp_atom_max_index = 0 + if type_array is None: + raise RuntimeError("type_array can't be None type, check .xyz file") + for ii in type_array: + if ii not in type_map: + type_map[ii] = temp_atom_max_index + temp_atom_max_index += 1 + temp_atom_index = type_map[ii] + atom_type_list.append(temp_atom_index) + type_num_dict[ii] = 1 + else: + temp_atom_index = type_map[ii] + atom_type_list.append(temp_atom_index) + type_num_dict[ii] += 1 + type_num_list = [] + for atom_type, atom_num in type_num_dict.items(): + type_num_list.append((atom_type, atom_num)) + type_num_array = np.array(type_num_list) + if field_dict.get("virial", None): + virials = np.array( + [ + np.array( + list(filter(bool, field_dict["virial"].split(" "))) + ).reshape(3, 3) + ] + ).astype(np.float64) + else: + virials = None + + info_dict = {} + info_dict["atom_names"] = list(type_num_array[:, 0]) + info_dict["atom_numbs"] = list(type_num_array[:, 1].astype(int)) + info_dict["atom_types"] = 
np.array(atom_type_list).astype(int) + info_dict["cells"] = np.array( + [ + np.array(list(filter(bool, field_dict["Lattice"].split(" ")))).reshape( + 3, 3 + ) + ] + ).astype(np.float64) + info_dict["coords"] = np.array([coords_array]).astype(np.float64) + info_dict["energies"] = np.array([field_dict["energy"]]).astype(np.float64) + info_dict["forces"] = np.array([force_array]).astype(np.float64) + if virials is not None: + info_dict["virials"] = virials + info_dict["orig"] = np.zeros(3) + return info_dict + + +def format_single_frame(data, frame_idx): + """Format a single frame of system data into QUIP/GAP XYZ format lines. + + Parameters + ---------- + data : dict + system data + frame_idx : int + frame index + + Returns + ------- + list[str] + lines for the frame + """ + # Number of atoms + natoms = len(data["atom_types"]) + + # Build header line with metadata + header_parts = [] + + # Energy + energy = data["energies"][frame_idx] + header_parts.append(f"energy={energy:.12e}") + + # Virial (if present) + if "virials" in data: + virial = data["virials"][frame_idx] + virial_str = " ".join(f"{v:.12e}" for v in virial.flatten()) + header_parts.append(f'virial="{virial_str}"') + + # Lattice + cell = data["cells"][frame_idx] + lattice_str = " ".join(f"{c:.12e}" for c in cell.flatten()) + header_parts.append(f'Lattice="{lattice_str}"') + + # Properties + header_parts.append("Properties=species:S:1:pos:R:3:Z:I:1:force:R:3") + + header_line = " ".join(header_parts) + + # Format atom lines + atom_lines = [] + coords = data["coords"][frame_idx] + forces = data["forces"][frame_idx] + atom_names = np.array(data["atom_names"]) + atom_types = data["atom_types"] + + for i in range(natoms): + atom_type_idx = atom_types[i] + species = atom_names[atom_type_idx] + x, y, z = coords[i] + fx, fy, fz = forces[i] + atomic_number = Element(species).Z + + atom_line = f"{species} {x:.11e} {y:.11e} {z:.11e} {atomic_number} {fx:.11e} {fy:.11e} {fz:.11e}" + atom_lines.append(atom_line) + + # 
Combine all lines for this frame + frame_lines = [str(natoms), header_line] + atom_lines + return frame_lines diff --git a/dpdata/formats/xyz/xyz.py b/dpdata/formats/xyz/xyz.py new file mode 100644 index 000000000..0c36ac32b --- /dev/null +++ b/dpdata/formats/xyz/xyz.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import numpy as np + + +def coord_to_xyz(coord: np.ndarray, types: list) -> str: + """Convert coordinates and types to xyz format. + + Parameters + ---------- + coord : np.ndarray + coordinates, Nx3 array + types : list + list of types + + Returns + ------- + str + xyz format string + + Examples + -------- + >>> coord_to_xyz(np.ones((1,3)), ["C"]) + 1 + + C 1.000000 1.000000 1.000000 + """ + buff = [str(len(types)), ""] + for at, cc in zip(types, coord): + buff.append("{} {:.6f} {:.6f} {:.6f}".format(at, *cc)) + return "\n".join(buff) + + +def xyz_to_coord(xyz: str) -> tuple[np.ndarray, list]: + """Convert xyz format to coordinates and types. + + Parameters + ---------- + xyz : str + xyz format string + + Returns + ------- + coords : np.ndarray + coordinates, Nx3 array + types : list + list of types + """ + symbols = [] + coords = [] + for ii, line in enumerate(xyz.split("\n")): + if ii == 0: + natoms = int(line.strip()) + elif 2 <= ii <= 1 + natoms: + # symbol x y z + symbol, x, y, z = line.split() + coords.append((float(x), float(y), float(z))) + symbols.append(symbol) + return np.array(coords), symbols diff --git a/dpdata/gaussian/__init__.py b/dpdata/gaussian/__init__.py index e69de29bb..7b2ae19b5 100644 --- a/dpdata/gaussian/__init__.py +++ b/dpdata/gaussian/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.gaussian import * # noqa: F403 diff --git a/dpdata/gaussian/fchk.py b/dpdata/gaussian/fchk.py index 816a999ce..b41d94ec1 100644 --- a/dpdata/gaussian/fchk.py +++ b/dpdata/gaussian/fchk.py @@ -1,175 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - 
-from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..periodic_table import ELEMENTS -from ..unit import ( - EnergyConversion, - ForceConversion, - HessianConversion, - LengthConversion, -) - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() -hessian_convert = HessianConversion("hartree/bohr^2", "eV/angstrom^2").value() - - -def create_full_hessian(hessian_raw: list | np.ndarray, natoms: int) -> np.ndarray: - """ - Reconstructs the full, symmetric Hessian matrix from a 1D array - containing its lower triangular elements. - - Args: - hessian_raw (list | np.ndarray): A 1D list or NumPy array containing the - lower triangular elements (including the - diagonal) of the Hessian matrix. - natoms (int): The number of atoms in the system. - - Returns - ------- - np.ndarray: A full, symmetric (3*natoms, 3*natoms) Hessian matrix. - - Raises - ------ - ValueError: If the number of elements in `hessian_raw` does not match - the expected number for the lower triangle of a - (3*natoms, 3*natoms) matrix. 
- """ - # Convert input to a NumPy array in case it's a list - hessian_block = np.array(hessian_raw) - - # Calculate the dimension of the final matrix - dim = 3 * natoms - - # Validate that the input data has the correct length - # A lower triangle of an n x n matrix has n*(n+1)/2 elements - expected_length = dim * (dim + 1) // 2 - if hessian_block.size != expected_length: - raise ValueError( - f"Input length {hessian_block.size} != expected {expected_length}" - ) - - # Create a zero matrix, then fill the lower triangle - hessian_full = np.zeros((dim, dim), dtype=hessian_block.dtype) - lower_triangle_indices = np.tril_indices(dim) - hessian_full[lower_triangle_indices] = hessian_block - - # This is done by copying the lower triangle to the upper triangle - # M_full = M_lower + M_lower.T - diag(M_lower) - hessian_full = hessian_full + hessian_full.T - np.diag(np.diag(hessian_full)) - - return hessian_full - - -def to_system_data(file_name: FileType, has_forces=True, has_hessian=True): - """Read Gaussian fchk file. 
- - Parameters - ---------- - file_name : str - file name - has_forces : bool, default True - whether to read force - Note: Cartesian Gradient in fchk file is converted to forces by taking negative sign - has_hessian : bool, default True - whether to read hessian - - Returns - ------- - data : dict - system data, including hessian if has_hessian is True - """ - data = {} - natoms = 0 - atom_numbers = [] - coords_t = [] - energy_t = [] - forces_t = [] - hessian_t = [] - # Read fchk file - with open_file(file_name) as fp: - for line in fp: - if isinstance(line, bytes): - line = line.decode(errors="ignore") - if "Number of atoms" in line: - natoms = int(line.split()[-1]) - elif "Atomic numbers" in line and "I" in line: - n = int(line.split()[-1]) - atom_numbers = [] - while len(atom_numbers) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - atom_numbers += [int(x) for x in next_line.split()] - elif "Current cartesian coordinates" in line and "R" in line: - n = int(line.split()[-1]) - coords_raw = [] - while len(coords_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - coords_raw += [float(x) for x in next_line.split()] - coords = np.array(coords_raw).reshape(-1, 3) * length_convert - coords_t.append(coords) - elif "Total Energy" in line: - energy = float(line.split()[-1]) * energy_convert - energy_t.append(energy) - elif "Cartesian Gradient" in line: - n = int(line.split()[-1]) - forces_raw = [] - while len(forces_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - forces_raw += [float(x) for x in next_line.split()] - # Cartesian Gradient is the negative of forces: F = -∇E - forces = -np.array(forces_raw).reshape(-1, 3) * force_convert - forces_t.append(forces) - elif "Cartesian Force Constants" in line and "R" in line: - n = int(line.split()[-1]) - hessian_raw = [] - while 
len(hessian_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - hessian_raw += [float(x) for x in next_line.split()] - hessian_full = ( - create_full_hessian(hessian_raw, natoms) * hessian_convert - ) - # store as (natoms, 3, natoms, 3) to align with registered shape - hessian_t.append(hessian_full.reshape(natoms, 3, natoms, 3)) - # Assert key data - assert coords_t, "cannot find coords" - assert energy_t, "cannot find energy" - if has_forces: - assert forces_t, "cannot find forces" - if has_hessian: - assert hessian_t, "cannot find hessian" - # Assemble data - atom_symbols = [ELEMENTS[z - 1] for z in atom_numbers] - atom_names, atom_types, atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["atom_types"] = atom_types - data["coords"] = np.array(coords_t).reshape(-1, natoms, 3) - data["orig"] = np.zeros(3) - data["cells"] = np.array([np.eye(3) * 100]) - data["nopbc"] = True - if energy_t: - data["energies"] = np.array(energy_t) - if has_forces and forces_t: - data["forces"] = np.array(forces_t) - if has_hessian and hessian_t: - data["hessian"] = np.array(hessian_t) - return data +from dpdata.formats.gaussian.fchk import * # noqa: F403 diff --git a/dpdata/gaussian/gjf.py b/dpdata/gaussian/gjf.py index 419ec354c..c0e3600f2 100644 --- a/dpdata/gaussian/gjf.py +++ b/dpdata/gaussian/gjf.py @@ -1,335 +1,3 @@ -# The initial code of this file is based on -# https://github.com/deepmodeling/dpgen/blob/0767dce7cad29367edb2e4a55fd0d8724dbda642/dpgen/generator/lib/gaussian.py#L1-L190 -# under LGPL 3.0 license -"""Generate Gaussian input file.""" - from __future__ import annotations -import itertools -import re -import uuid -import warnings - -import numpy as np - -from dpdata.periodic_table import Element - - -def _crd2frag(symbols: list[str], crds: np.ndarray) -> tuple[int, list[int]]: - 
"""Detect fragments from coordinates. - - Parameters - ---------- - symbols : list[str] - element symbols; virtual elements are not supported - crds : np.ndarray - atomic coordinates, shape: (N, 3) - - Returns - ------- - frag_numb : int - number of fragments - frag_index : list[int] - frament index that each atom belongs to - - Notes - ----- - In this method, Open Babel is used to detect bond connectivity. The threshold - is the sum of covalent radii with a slight tolerance (0.45 A). Note that - this threshold has errors. - - PBC support is removed from this method as Gaussian does not support PBC calculation. - - Raises - ------ - ImportError - if Open Babel is not installed - """ - from scipy.sparse import csr_matrix - from scipy.sparse.csgraph import connected_components - - try: - from openbabel import openbabel - except ImportError: - import openbabel - atomnumber = len(symbols) - # Use openbabel to connect atoms - mol = openbabel.OBMol() - mol.BeginModify() - for idx, (symbol, position) in enumerate(zip(symbols, crds.astype(np.float64))): - num = Element(symbol).Z - atom = mol.NewAtom(idx) - atom.SetAtomicNum(int(num)) - atom.SetVector(*position) - mol.ConnectTheDots() - mol.PerceiveBondOrders() - mol.EndModify() - bonds = [] - for ii in range(mol.NumBonds()): - bond = mol.GetBond(ii) - a = bond.GetBeginAtom().GetId() - b = bond.GetEndAtom().GetId() - bo = bond.GetBondOrder() - bonds.extend([[a, b, bo], [b, a, bo]]) - bonds = np.array(bonds, ndmin=2).reshape((-1, 3)) - graph = csr_matrix( - (bonds[:, 2], (bonds[:, 0], bonds[:, 1])), shape=(atomnumber, atomnumber) - ) - frag_numb, frag_index = connected_components(graph, 0) - return frag_numb, frag_index - - -def detect_multiplicity(symbols: np.ndarray) -> int: - """Find the minimal multiplicity of the given molecules. 
- - Parameters - ---------- - symbols : np.ndarray - element symbols; virtual elements are not supported - - Returns - ------- - int - spin multiplicity - """ - # currently only support charge=0 - # oxygen -> 3 - if np.count_nonzero(symbols == ["O"]) == 2 and symbols.size == 2: - return 3 - # calculates the total number of electrons, assumes they are paired as much as possible - n_total = sum([Element(s).Z for s in symbols]) - return n_total % 2 + 1 - - -def make_gaussian_input( - sys_data: dict, - keywords: str | list[str], - multiplicity: str | int = "auto", - charge: int = 0, - fragment_guesses: bool = False, - basis_set: str | None = None, - keywords_high_multiplicity: str | None = None, - nproc: int = 1, -) -> str: - """Make gaussian input file. - - Parameters - ---------- - sys_data : dict - system data - keywords : str or list[str] - Gaussian keywords, e.g. force b3lyp/6-31g**. If a list, - run multiple steps - multiplicity : str or int, default=auto - spin multiplicity state. It can be a number. If auto, - multiplicity will be detected automatically, with the - following rules: - fragment_guesses=True - multiplicity will +1 for each radical, and +2 - for each oxygen molecule - fragment_guesses=False - multiplicity will be 1 or 2, but +2 for each - oxygen molecule - charge : int, default=0 - molecule charge. Only used when charge is not provided - by the system - fragment_guesses : bool, default=False - initial guess generated from fragment guesses. If True, - multiplicity should be auto - basis_set : str, default=None - custom basis set - keywords_high_multiplicity : str, default=None - keywords for points with multiple raicals. multiplicity - should be auto. 
If not set, fallback to normal keywords - nproc : int, default=1 - Number of CPUs to use - - Returns - ------- - str - gjf output string - """ - coordinates = sys_data["coords"][0] - atom_names = sys_data["atom_names"] - atom_numbs = sys_data["atom_numbs"] - atom_types = sys_data["atom_types"] - # get atom symbols list - symbols = [atom_names[atom_type] for atom_type in atom_types] - - # assume default charge is zero and default spin multiplicity is 1 - if "charge" in sys_data.keys(): - charge = sys_data["charge"] - - use_fragment_guesses = False - if isinstance(multiplicity, int): - mult_auto = False - elif multiplicity == "auto": - mult_auto = True - else: - raise RuntimeError('The keyword "multiplicity" is illegal.') - - if fragment_guesses: - # Initial guess generated from fragment guesses - # New feature of Gaussian 16 - use_fragment_guesses = True - if not mult_auto: - warnings.warn("Automatically set multiplicity to auto!") - mult_auto = True - - if mult_auto: - frag_numb, frag_index = _crd2frag(symbols, coordinates) - if frag_numb == 1: - use_fragment_guesses = False - mult_frags = [] - for i in range(frag_numb): - idx = frag_index == i - mult_frags.append(detect_multiplicity(np.array(symbols)[idx])) - if use_fragment_guesses: - multiplicity = sum(mult_frags) - frag_numb + 1 - charge % 2 - chargekeywords_frag = "%d %d" % (charge, multiplicity) + "".join( # noqa: UP031 - [" %d %d" % (charge, mult_frag) for mult_frag in mult_frags] # noqa: UP031 - ) - else: - multi_frags = np.array(mult_frags) - multiplicity = ( - 1 - + np.count_nonzero(multi_frags == 2) % 2 - + np.count_nonzero(multi_frags == 3) * 2 - - charge % 2 - ) - - if ( - keywords_high_multiplicity is not None - and np.count_nonzero(multi_frags == 2) >= 2 - ): - # at least 2 radicals - keywords = keywords_high_multiplicity - - if isinstance(keywords, str): - keywords = [keywords] - else: - keywords = keywords.copy() - - buff = [] - # keywords, e.g., force b3lyp/6-31g** - if use_fragment_guesses: - 
keywords[0] = f"{keywords[0]} guess=fragment={frag_numb}" - - chkkeywords = [] - if len(keywords) > 1: - chkkeywords.append(f"%chk={str(uuid.uuid1())}.chk") - - nprockeywords = f"%nproc={nproc:d}" - # use formula as title - titlekeywords = "".join( - [f"{symbol}{numb}" for symbol, numb in zip(atom_names, atom_numbs)] - ) - chargekeywords = f"{charge} {multiplicity}" - - buff = [ - *chkkeywords, - nprockeywords, - f"#{keywords[0]}", - "", - titlekeywords, - "", - (chargekeywords_frag if use_fragment_guesses else chargekeywords), - ] - - for ii, (symbol, coordinate) in enumerate(zip(symbols, coordinates)): - if use_fragment_guesses: - buff.append( - "%s(Fragment=%d) %f %f %f" % (symbol, frag_index[ii] + 1, *coordinate) # noqa: UP031 - ) - else: - buff.append("{} {:f} {:f} {:f}".format(symbol, *coordinate)) # noqa: UP031 - if not sys_data.get("nopbc", False): - # PBC condition - cell = sys_data["cells"][0] - for ii in range(3): - # use TV as atomic symbol, see https://gaussian.com/pbc/ - buff.append("TV {:f} {:f} {:f}".format(*cell[ii])) - if basis_set is not None: - # custom basis set - buff.extend(["", basis_set, ""]) - for kw in itertools.islice(keywords, 1, None): - buff.extend( - [ - "\n--link1--", - *chkkeywords, - nprockeywords, - f"#{kw}", - "", - titlekeywords, - "", - chargekeywords, - "", - ] - ) - buff.append("\n") - return "\n".join(buff) - - -def read_gaussian_input(inp: str): - """Read Gaussian input. 
- - Parameters - ---------- - inp : str - Gaussian input str - - Returns - ------- - dict - system data - """ - flag = 0 - coords = [] - elements = [] - cells = [] - for line in inp.split("\n"): - if not line.strip(): - # empty line - flag += 1 - elif flag == 0: - # keywords - if line.startswith("#"): - # setting - keywords = line.split() - elif line.startswith("%"): - pass - elif flag == 1: - # title - pass - elif flag == 2: - # multi and coords - s = line.split() - if len(s) == 2: - pass - elif len(s) == 4: - if s[0] == "TV": - cells.append(list(map(float, s[1:4]))) - else: - # element - elements.append(re.sub("\\(.*?\\)|\\{.*?}|\\[.*?]", "", s[0])) - coords.append(list(map(float, s[1:4]))) - elif flag == 3: - # end - break - atom_names, atom_types, atom_numbs = np.unique( - elements, return_inverse=True, return_counts=True - ) - if len(cells): - nopbc = False - else: - nopbc = True - cells = np.array([np.eye(3)]) * 100 - return { - "atom_names": list(atom_names), - "atom_numbs": list(atom_numbs), - "atom_types": atom_types, - "cells": np.array(cells).reshape(1, 3, 3), - "nopbc": nopbc, - "coords": np.array(coords).reshape(1, -1, 3), - "orig": np.zeros(3), - } +from dpdata.formats.gaussian.gjf import * # noqa: F403 diff --git a/dpdata/gaussian/log.py b/dpdata/gaussian/log.py index 08a65b9dc..750343df4 100644 --- a/dpdata/gaussian/log.py +++ b/dpdata/gaussian/log.py @@ -1,136 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..periodic_table import ELEMENTS -from ..unit import EnergyConversion, ForceConversion, LengthConversion - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - -symbols = ["X"] + ELEMENTS - - -def to_system_data(file_name: FileType, md=False): - """Read 
Gaussian log file. - - Parameters - ---------- - file_name : str - file name - md : bool, default False - whether to read multiple frames - - Returns - ------- - data : dict - system data - - Raises - ------ - RuntimeError - if the input orientation is not found - """ - data = {} - # read from log lines - flag = 0 - energy_t = [] - coords_t = [] - atom_symbols = [] - forces_t = [] - cells_t = [] - nopbc = True - coords = None - - with open_file(file_name) as fp: - for line in fp: - if line.startswith(" SCF Done"): - # energies - energy = float(line.split()[4]) - elif line.startswith( - " Center Atomic Forces (Hartrees/Bohr)" - ): - flag = 1 - forces = [] - elif line.startswith( - " Input orientation:" - ) or line.startswith(" Z-Matrix orientation:"): - flag = 5 - coords = [] - atom_symbols = [] - cells = [] - - if 1 <= flag <= 3 or 5 <= flag <= 9: - flag += 1 - elif flag == 4: - # forces - if line.startswith(" -------"): - if coords is None: - raise RuntimeError( - "Input orientation is not found. Using Gaussian keyword " - "`Geom=PrintInputOrient` to always print the input orientation. " - "See https://gaussian.com/geom/ for more details." 
- ) - forces_t.append(forces) - energy_t.append(energy) - coords_t.append(coords) - if cells: - nopbc = False - cells_t.append(cells) - else: - cells_t.append( - [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] - ) - flag = 0 - coords = None - else: - s = line.split() - if line[14:16] == "-2": - # PBC - pass - else: - forces.append( - [float(line[23:38]), float(line[38:53]), float(line[53:68])] - ) - elif flag == 10: - # atom_symbols and coords - if line.startswith(" -------"): - flag = 0 - else: - s = line.split() - if int(s[1]) == -2: - # PBC cells, see https://gaussian.com/pbc/ - cells.append([float(x) for x in s[3:6]]) - else: - coords.append([float(x) for x in s[3:6]]) - atom_symbols.append(symbols[int(s[1])]) - - assert coords_t, "cannot find coords" - assert energy_t, "cannot find energies" - assert forces_t, "cannot find forces" - - atom_names, data["atom_types"], atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - if not md: - forces_t = forces_t[-1:] - energy_t = energy_t[-1:] - coords_t = coords_t[-1:] - cells_t = cells_t[-1:] - data["forces"] = np.array(forces_t) * force_convert - data["energies"] = np.array(energy_t) * energy_convert - data["coords"] = np.array(coords_t) - data["orig"] = np.array([0, 0, 0]) - data["cells"] = np.array(cells_t) - data["nopbc"] = nopbc - return data +from dpdata.formats.gaussian.log import * # noqa: F403 diff --git a/dpdata/gromacs/__init__.py b/dpdata/gromacs/__init__.py index e69de29bb..7251bb787 100644 --- a/dpdata/gromacs/__init__.py +++ b/dpdata/gromacs/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.gromacs import * # noqa: F403 diff --git a/dpdata/gromacs/gro.py b/dpdata/gromacs/gro.py index fe83e0c5c..8878ce8ff 100644 --- a/dpdata/gromacs/gro.py +++ b/dpdata/gromacs/gro.py @@ -1,112 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations 
-import re -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..unit import LengthConversion - -nm2ang = LengthConversion("nm", "angstrom").value() -ang2nm = LengthConversion("angstrom", "nm").value() -cell_idx_gmx2dp = [0, 4, 8, 1, 2, 3, 5, 6, 7] - - -def _format_atom_name(atom_name): - patt = re.compile("[a-zA-Z]*") - match = re.search(patt, atom_name) - fmt_name = match.group().capitalize() - return fmt_name - - -def _get_line(line, fmt_atom_name=True): - atom_name = line[10:15].split()[0] - if fmt_atom_name: - atom_name = _format_atom_name(atom_name) - atom_idx = int(line[15:20].split()[0]) - posis = [float(line[ii : ii + 8]) for ii in range(20, 44, 8)] - posis = np.array(posis) * nm2ang - return atom_name, atom_idx, posis - - -def _get_cell(line): - cell = np.zeros([3, 3]) - lengths = [float(ii) for ii in line.split()] - if len(lengths) >= 3: - for dd in range(3): - cell[dd][dd] = lengths[dd] - else: - raise RuntimeError("wrong box format: ", line) - if len(lengths) == 9: - cell[0][1] = lengths[3] - cell[0][2] = lengths[4] - cell[1][0] = lengths[5] - cell[1][2] = lengths[6] - cell[2][0] = lengths[7] - cell[2][1] = lengths[8] - cell = cell * nm2ang - return cell - - -def file_to_system_data(fname: FileType, format_atom_name=True, **kwargs): - system = {"coords": [], "cells": []} - with open_file(fname) as fp: - frame = 0 - while True: - flag = fp.readline() - if not flag: - break - else: - frame += 1 - names = [] - idxs = [] - posis = [] - natoms = int(fp.readline()) - for ii in range(natoms): - n, i, p = _get_line(fp.readline(), fmt_atom_name=format_atom_name) - names.append(n) - idxs.append(i) - posis.append(p) - cell = _get_cell(fp.readline()) - posis = np.array(posis) - if frame == 1: - system["orig"] = np.zeros(3) - system["atom_names"] = list(set(names)) - system["atom_numbs"] = [ - names.count(ii) for ii in system["atom_names"] - ] - 
system["atom_types"] = [ - system["atom_names"].index(ii) for ii in names - ] - system["atom_types"] = np.array(system["atom_types"], dtype=int) - system["coords"].append(posis) - system["cells"].append(cell) - system["coords"] = np.array(system["coords"]) - system["cells"] = np.array(system["cells"]) - return system - - -def from_system_data(system, f_idx=0, **kwargs): - resname = kwargs.get("resname", "MOL") - shift = kwargs.get("shift", 0) - ret = "" - ret += " molecule" + "\n" - n_atoms = sum(system["atom_numbs"]) - ret += " " + str(n_atoms) + "\n" - for i in range(n_atoms): - atom_type = system["atom_types"][i] - atom_name = system["atom_names"][atom_type] - coords = system["coords"][f_idx] * ang2nm - ret += "{:>5d}{:<5s}{:>5s}{:5d}{:8.3f}{:8.3f}{:8.3f}\n".format( - 1, resname, atom_name, i + shift + 1, *tuple(coords[i]) - ) - cell = (system["cells"][f_idx].flatten() * ang2nm)[cell_idx_gmx2dp] - ret += " " + " ".join([f"{x:.3f}" for x in cell]) - - return ret +from dpdata.formats.gromacs.gro import * # noqa: F403 diff --git a/dpdata/lammps/__init__.py b/dpdata/lammps/__init__.py index e69de29bb..3069f9b62 100644 --- a/dpdata/lammps/__init__.py +++ b/dpdata/lammps/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.lammps import * # noqa: F403 diff --git a/dpdata/lammps/dump.py b/dpdata/lammps/dump.py index 89e75e4de..0ac2b31d7 100644 --- a/dpdata/lammps/dump.py +++ b/dpdata/lammps/dump.py @@ -1,433 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import os -import sys -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -lib_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(lib_path) -import warnings - -import lmp - - -class UnwrapWarning(UserWarning): - pass - - -warnings.simplefilter("once", UnwrapWarning) - - -def _get_block(lines, key): - for idx in range(len(lines)): - if ("ITEM: " + 
key) in lines[idx]: - break - idx_s = idx + 1 - for idx in range(idx_s, len(lines)): - if ("ITEM: ") in lines[idx]: - break - idx_e = idx - if idx_e == len(lines) - 1: - idx_e += 1 - return lines[idx_s:idx_e], lines[idx_s - 1] - - -def get_atype(lines, type_idx_zero=False): - blk, head = _get_block(lines, "ATOMS") - keys = head.split() - id_idx = keys.index("id") - 2 - tidx = keys.index("type") - 2 - atype = [] - for ii in blk: - atype.append([int(ii.split()[id_idx]), int(ii.split()[tidx])]) - atype.sort() - atype = np.array(atype, dtype=int) - if type_idx_zero: - return atype[:, 1] - 1 - else: - return atype[:, 1] - - -def get_natoms(lines): - blk, head = _get_block(lines, "NUMBER OF ATOMS") - return int(blk[0]) - - -def get_natomtypes(lines): - atype = get_atype(lines) - return max(atype) - - -def get_natoms_vec(lines): - atype = get_atype(lines) - natoms_vec = [] - natomtypes = get_natomtypes(lines) - for ii in range(natomtypes): - natoms_vec.append(sum(atype == ii + 1)) - assert sum(natoms_vec) == get_natoms(lines) - return natoms_vec - - -def get_coordtype_and_scalefactor(keys): - # 4 types in total,with different scaling factor - key_pc = ["x", "y", "z"] # plain cartesian, sf = 1 - key_uc = ["xu", "yu", "zu"] # unwraped cartesian, sf = 1 - key_s = ["xs", "ys", "zs"] # scaled by lattice parameter, sf = lattice parameter - key_su = ["xsu", "ysu", "zsu"] # scaled and unfolded,sf = lattice parameter - lmp_coor_type = [key_pc, key_uc, key_s, key_su] - sf = [0, 0, 1, 1] - uw = [0, 1, 0, 1] # unwraped or not - for k in range(4): - if all(i in keys for i in lmp_coor_type[k]): - return lmp_coor_type[k], sf[k], uw[k] - - -def safe_get_posi(lines, cell, orig=np.zeros(3), unwrap=False): - blk, head = _get_block(lines, "ATOMS") - keys = head.split() - coord_tp_and_sf = get_coordtype_and_scalefactor(keys) - assert coord_tp_and_sf is not None, "Dump file does not contain atomic coordinates!" 
- coordtype, sf, uw = coord_tp_and_sf - id_idx = keys.index("id") - 2 - xidx = keys.index(coordtype[0]) - 2 - yidx = keys.index(coordtype[1]) - 2 - zidx = keys.index(coordtype[2]) - 2 - posis = [] - for ii in blk: - words = ii.split() - posis.append( - [ - float(words[id_idx]), - float(words[xidx]), - float(words[yidx]), - float(words[zidx]), - ] - ) - posis.sort() - posis = np.array(posis)[:, 1:4] - if not sf: - posis = (posis - orig) @ np.linalg.inv( - cell - ) # Convert to scaled coordinates for unscaled coordinates - if uw and unwrap: - return ( - posis @ cell - ) # convert scaled coordinates back to Cartesien coordinates unwrap at the periodic boundaries - else: - if uw and not unwrap: - warnings.warn( - message="Your dump file contains unwrapped coordinates, but you did not specify unwrapping (unwrap = True). The default is wrapping at periodic boundaries (unwrap = False).\n", - category=UnwrapWarning, - ) - return ( - (posis % 1) @ cell - ) # Convert scaled coordinates back to Cartesien coordinates with wraping at periodic boundary conditions - - -def get_dumpbox(lines): - blk, h = _get_block(lines, "BOX BOUNDS") - bounds = np.zeros([3, 2]) - tilt = np.zeros([3]) - load_tilt = "xy xz yz" in h - for dd in range(3): - info = [float(jj) for jj in blk[dd].split()] - bounds[dd][0] = info[0] - bounds[dd][1] = info[1] - if load_tilt: - tilt[dd] = info[2] - return bounds, tilt - - -def dumpbox2box(bounds, tilt): - xy = tilt[0] - xz = tilt[1] - yz = tilt[2] - xlo = bounds[0][0] - min(0.0, xy, xz, xy + xz) - xhi = bounds[0][1] - max(0.0, xy, xz, xy + xz) - ylo = bounds[1][0] - min(0.0, yz) - yhi = bounds[1][1] - max(0.0, yz) - zlo = bounds[2][0] - zhi = bounds[2][1] - info = [[xlo, xhi], [ylo, yhi], [zlo, zhi]] - return lmp.lmpbox2box(info, tilt) - - -def box2dumpbox(orig, box): - lohi, tilt = lmp.box2lmpbox(orig, box) - xy = tilt[0] - xz = tilt[1] - yz = tilt[2] - bounds = np.zeros([3, 2]) - bounds[0][0] = lohi[0][0] + min(0.0, xy, xz, xy + xz) - bounds[0][1] = 
lohi[0][1] + max(0.0, xy, xz, xy + xz) - bounds[1][0] = lohi[1][0] + min(0.0, yz) - bounds[1][1] = lohi[1][1] + max(0.0, yz) - bounds[2][0] = lohi[2][0] - bounds[2][1] = lohi[2][1] - return bounds, tilt - - -def load_file(fname: FileType, begin=0, step=1): - lines = [] - buff = [] - cc = -1 - with open_file(fname) as fp: - while True: - line = fp.readline().rstrip("\n") - if not line: - if cc >= begin and (cc - begin) % step == 0: - lines += buff - buff = [] - cc += 1 - return lines - if "ITEM: TIMESTEP" in line: - if cc >= begin and (cc - begin) % step == 0: - lines += buff - buff = [] - cc += 1 - if cc >= begin and (cc - begin) % step == 0: - buff.append(line) - - -def get_spin_keys(inputfile): - """ - Read input file and get the keys for spin info in dump. - - Parameters - ---------- - inputfile : str - Path to the input file. - - Returns - ------- - list or None - List of spin info keys if found, None otherwise. - """ - if inputfile is None: - return None - - if not os.path.isfile(inputfile): - warnings.warn(f"Input file {inputfile} not found.") - return None - - with open(inputfile) as f: - for line in f.readlines(): - ls = line.split() - if ( - len(ls) > 7 - and ls[0] == "compute" - and all(key in ls for key in ["sp", "spx", "spy", "spz"]) - ): - compute_name = ls[1] - return [ - f"c_{compute_name}[{ls.index(key) - 3}]" - for key in ["sp", "spx", "spy", "spz"] - ] - - return None - - -def get_spin(lines, spin_keys): - """ - Get the spin info from the dump file. - - Parameters - ---------- - lines : list - The content of the dump file. - spin_keys : list - The keys for spin info in dump file. 
- the spin info is stored in sp, spx, spy, spz or spin_keys, which is the spin norm and the spin vector - 1 1 0.00141160 5.64868599 0.01005602 1.54706291 0.00000000 0.00000000 1.00000000 -1.40772100 -2.03739417 -1522.64797384 -0.00397809 -0.00190426 -0.00743976 - """ - blk, head = _get_block(lines, "ATOMS") - heads = head.split() - - if spin_keys is not None and all(i in heads for i in spin_keys): - key = spin_keys - else: - return None - - try: - idx_id = heads.index("id") - 2 - idx_sp, idx_spx, idx_spy, idx_spz = (heads.index(k) - 2 for k in key) - - norm = [] - vec = [] - atom_ids = [] - for line in blk: - words = line.split() - norm.append([float(words[idx_sp])]) - vec.append( - [float(words[idx_spx]), float(words[idx_spy]), float(words[idx_spz])] - ) - atom_ids.append(int(words[idx_id])) - - spin = np.array(norm) * np.array(vec) - atom_ids, spin = zip(*sorted(zip(atom_ids, spin))) - return np.array(spin) - except (ValueError, IndexError) as e: - warnings.warn(f"Error processing spin data: {str(e)}") - return None - - -def system_data( - lines, type_map=None, type_idx_zero=True, unwrap=False, input_file=None -): - array_lines = split_traj(lines) - lines = array_lines[0] - system = {} - system["atom_numbs"] = get_natoms_vec(lines) - system["atom_names"] = [] - if type_map is None: - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append("TYPE_%d" % ii) # noqa: UP031 - else: - assert len(type_map) >= len(system["atom_numbs"]) - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append(type_map[ii]) - bounds, tilt = get_dumpbox(lines) - orig, cell = dumpbox2box(bounds, tilt) - system["orig"] = np.array(orig) - np.array(orig) - system["cells"] = [np.array(cell)] - system["atom_types"] = get_atype(lines, type_idx_zero=type_idx_zero) - system["coords"] = [safe_get_posi(lines, cell, np.array(orig), unwrap)] - spin_keys = get_spin_keys(input_file) - spin = get_spin(lines, spin_keys) - has_spin = False - if spin is not None: - 
system["spins"] = [spin] - has_spin = True - for ii in range(1, len(array_lines)): - bounds, tilt = get_dumpbox(array_lines[ii]) - orig, cell = dumpbox2box(bounds, tilt) - system["cells"].append(cell) - atype = get_atype(array_lines[ii], type_idx_zero=type_idx_zero) - # map atom type; a[as[a][as[as[b]]]] = b[as[b][as^{-1}[b]]] = b[id] - idx = np.argsort(atype, kind="stable")[ - np.argsort(np.argsort(system["atom_types"], kind="stable"), kind="stable") - ] - system["coords"].append( - safe_get_posi(array_lines[ii], cell, np.array(orig), unwrap)[idx] - ) - if has_spin: - spin = get_spin(array_lines[ii], spin_keys) - if spin is not None: - system["spins"].append(spin[idx]) - else: - warnings.warn( - f"Warning: spin info is not found in frame {ii}, remove spin info." - ) - system.pop("spins") - has_spin = False - if has_spin: - system["spins"] = np.array(system["spins"]) - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - return system - - -def split_traj(dump_lines): - marks = [] - for idx, ii in enumerate(dump_lines): - if "ITEM: TIMESTEP" in ii: - marks.append(idx) - if len(marks) == 0: - return None - elif len(marks) == 1: - return [dump_lines] - else: - block_size = marks[1] - marks[0] - ret = [] - for ii in marks: - ret.append(dump_lines[ii : ii + block_size]) - # for ii in range(len(marks)-1): - # assert(marks[ii+1] - marks[ii] == block_size) - return ret - return None - - -def from_system_data(system, f_idx=0, timestep=0): - """Convert system data to LAMMPS dump format string. - - Parameters - ---------- - system : dict - System data dictionary containing atoms, coordinates, cell, etc. 
- f_idx : int, optional - Frame index to dump (default: 0) - timestep : int, optional - Timestep number for the dump (default: 0) - - Returns - ------- - str - LAMMPS dump format string - """ - ret = "" - - # Get basic system info - natoms = sum(system["atom_numbs"]) - coords = system["coords"][f_idx] - cell = system["cells"][f_idx] - atom_types = system["atom_types"] - orig = system.get("orig", np.zeros(3)) - - # Convert cell to dump format (bounds and tilt) - bounds, tilt = box2dumpbox(orig, cell) - - # Write timestep - ret += "ITEM: TIMESTEP\n" - ret += f"{timestep}\n" - - # Write number of atoms - ret += "ITEM: NUMBER OF ATOMS\n" - ret += f"{natoms}\n" - - # Write box bounds - ret += "ITEM: BOX BOUNDS xy xz yz pp pp pp\n" - ret += f"{bounds[0][0]:.10f} {bounds[0][1]:.10f} {tilt[0]:.10f}\n" - ret += f"{bounds[1][0]:.10f} {bounds[1][1]:.10f} {tilt[1]:.10f}\n" - ret += f"{bounds[2][0]:.10f} {bounds[2][1]:.10f} {tilt[2]:.10f}\n" - - # Write atoms header - ret += "ITEM: ATOMS id type x y z\n" - - # Write atom data - for ii in range(natoms): - atom_id = ii + 1 # LAMMPS uses 1-based indexing - atom_type = atom_types[ii] + 1 # LAMMPS uses 1-based type indexing - x, y, z = coords[ii] - ret += f"{atom_id} {atom_type} {x:.10f} {y:.10f} {z:.10f}\n" - - return ret - - -if __name__ == "__main__": - # fname = 'dump.hti' - # lines = open(fname).read().split('\n') - # # print(get_natoms(lines)) - # # print(get_natomtypes(lines)) - # # print(get_natoms_vec(lines)) - # posi = get_posi(lines) - # dbox1, tilt1 = box2dumpbox(orig, box) - # print(dbox - dbox1) - # print(tilt - tilt1) - # print(orig) - # print(box) - # np.savetxt('tmp.out', posi - orig, fmt='%.6f') - # print(system_data(lines)) - lines = load_file("conf_unfold.dump", begin=0, step=1) - al = split_traj(lines) - s = system_data(lines, ["O", "H"]) - # l = np.linalg.norm(s['cells'][1],axis=1) - # p = s['coords'][0] + l - # np.savetxt('p',p,fmt='%1.10f') +from dpdata.formats.lammps.dump import * # noqa: F403 diff --git 
a/dpdata/lammps/lmp.py b/dpdata/lammps/lmp.py index c9d60ec53..30225fcae 100644 --- a/dpdata/lammps/lmp.py +++ b/dpdata/lammps/lmp.py @@ -1,649 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import numpy as np - -from dpdata.periodic_table import ELEMENTS, Element - -ptr_float_fmt = "%15.10f" -ptr_int_fmt = "%6d" -ptr_key_fmt = "%15s" - -# Mapping of LAMMPS atom styles to their column layouts -# Format: (atom_id_col, atom_type_col, x_col, y_col, z_col, has_molecule_id, has_charge, charge_col) -ATOM_STYLE_COLUMNS = { - "atomic": (0, 1, 2, 3, 4, False, False, None), - "angle": (0, 2, 3, 4, 5, True, False, None), - "bond": (0, 2, 3, 4, 5, True, False, None), - "charge": (0, 1, 3, 4, 5, False, True, 2), - "full": (0, 2, 4, 5, 6, True, True, 3), - "molecular": (0, 2, 3, 4, 5, True, False, None), - "dipole": (0, 1, 3, 4, 5, False, True, 2), - "sphere": (0, 1, 4, 5, 6, False, False, None), -} - - -def detect_atom_style(lines: list[str]) -> str | None: - """Detect LAMMPS atom style from data file content. 
- - Parameters - ---------- - lines : list - Lines from LAMMPS data file - - Returns - ------- - str or None - Detected atom style, or None if not detected - """ - # Look for atom style in comments after "Atoms" section header - atom_lines = get_atoms(lines) - if not atom_lines: - return None - - # Find the "Atoms" line - for idx, line in enumerate(lines): - if "Atoms" in line: - # Check if there's a comment with atom style after "Atoms" - if "#" in line: - comment_part = line.split("#")[1].strip().lower() - for style in ATOM_STYLE_COLUMNS: - if style in comment_part: - return style - break - - # If no explicit style found, try to infer from first data line - if atom_lines: - first_line = atom_lines[0].split() - num_cols = len(first_line) - - # Try to match based on number of columns and content patterns - # This is a heuristic approach - if num_cols == 5: - # Could be atomic style: atom-ID atom-type x y z - return "atomic" - elif num_cols == 6: - # Could be charge or bond/molecular style - # Try to determine if column 2 (index 2) looks like a charge (float) or type (int) - try: - val = float(first_line[2]) - # If it's a small float, likely a charge - if abs(val) < 10 and val != int(val): - return "charge" - else: - # Likely molecule ID (integer), so bond/molecular style - return "bond" - except ValueError: - return "atomic" # fallback - elif num_cols == 7: - # Could be full style: atom-ID molecule-ID atom-type charge x y z - return "full" - elif num_cols >= 8: - # Could be dipole or sphere style - # For now, default to dipole if we have enough columns - return "dipole" - - return None # Unable to detect - - -def _get_block(lines, keys): - for idx in range(len(lines)): - if keys in lines[idx]: - break - if idx == len(lines) - 1: - return None - idx_s = idx + 2 - idx = idx_s - ret = [] - while True: - if idx == len(lines) or len(lines[idx].split()) == 0: - break - else: - ret.append(lines[idx]) - idx += 1 - return ret - - -def lmpbox2box(lohi, tilt): - xy = tilt[0] 
- xz = tilt[1] - yz = tilt[2] - orig = np.array([lohi[0][0], lohi[1][0], lohi[2][0]]) - lens = [] - for dd in range(3): - lens.append(lohi[dd][1] - lohi[dd][0]) - xx = [lens[0], 0, 0] - yy = [xy, lens[1], 0] - zz = [xz, yz, lens[2]] - return orig, np.array([xx, yy, zz]) - - -def box2lmpbox(orig, box): - lohi = np.zeros([3, 2]) - for dd in range(3): - lohi[dd][0] = orig[dd] - tilt = np.zeros(3) - tilt[0] = box[1][0] - tilt[1] = box[2][0] - tilt[2] = box[2][1] - lens = np.zeros(3) - lens[0] = box[0][0] - lens[1] = box[1][1] - lens[2] = box[2][2] - for dd in range(3): - lohi[dd][1] = lohi[dd][0] + lens[dd] - return lohi, tilt - - -def get_atoms(lines): - return _get_block(lines, "Atoms") - - -def get_natoms(lines): - for ii in lines: - if "atoms" in ii: - return int(ii.split()[0]) - return None - - -def get_natomtypes(lines): - for ii in lines: - if "atom types" in ii: - return int(ii.split()[0]) - return None - - -def _atom_info_mol(line): - vec = line.split() - # idx, mole_type, atom_type, charge, x, y, z - return ( - int(vec[0]), - int(vec[1]), - int(vec[2]), - float(vec[3]), - float(vec[4]), - float(vec[5]), - float(vec[6]), - ) - - -def _atom_info_atom(line): - vec = line.split() - # idx, atom_type, x, y, z - return int(vec[0]), int(vec[1]), float(vec[2]), float(vec[3]), float(vec[4]) - - -def _atom_info_style(line: str, atom_style: str = "atomic") -> dict[str, int | float]: - """Parse atom information based on the specified atom style. - - Parameters - ---------- - line : str - The atom line from LAMMPS data file - atom_style : str - The LAMMPS atom style (atomic, full, charge, etc.) - - Returns - ------- - dict - Dictionary containing parsed atom information with keys: - 'atom_id', 'atom_type', 'x', 'y', 'z', 'molecule_id' (if present), 'charge' (if present) - """ - if atom_style not in ATOM_STYLE_COLUMNS: - raise ValueError( - f"Unsupported atom style: {atom_style}. 
Supported styles: {list(ATOM_STYLE_COLUMNS.keys())}" - ) - - vec = line.split() - columns = ATOM_STYLE_COLUMNS[atom_style] - - result = { - "atom_id": int(vec[columns[0]]), - "atom_type": int(vec[columns[1]]), - "x": float(vec[columns[2]]), - "y": float(vec[columns[3]]), - "z": float(vec[columns[4]]), - } - - # Add molecule ID if present - if columns[5]: # has_molecule_id - result["molecule_id"] = int( - vec[1] - ) # molecule ID is always in column 1 when present - - # Add charge if present - if columns[6]: # has_charge - result["charge"] = float(vec[columns[7]]) # charge_col - - return result - - -def get_natoms_vec(lines: list[str], atom_style: str = "atomic") -> list[int]: - """Get number of atoms for each atom type. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - list - Number of atoms for each atom type - """ - atype = get_atype(lines, atom_style=atom_style) - natoms_vec = [] - natomtypes = get_natomtypes(lines) - for ii in range(natomtypes): - natoms_vec.append(sum(atype == ii + 1)) - assert sum(natoms_vec) == get_natoms(lines) - return natoms_vec - - -def get_atype( - lines: list[str], type_idx_zero: bool = False, atom_style: str = "atomic" -) -> np.ndarray: - """Get atom types from LAMMPS data file. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray - Array of atom types - """ - alines = get_atoms(lines) - atype = [] - for ii in alines: - atom_info = _atom_info_style(ii, atom_style) - at = atom_info["atom_type"] - if type_idx_zero: - atype.append(at - 1) - else: - atype.append(at) - return np.array(atype, dtype=int) - - -def get_posi(lines: list[str], atom_style: str = "atomic") -> np.ndarray: - """Get atomic positions from LAMMPS data file. 
- - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray - Array of atomic positions - """ - atom_lines = get_atoms(lines) - posis = [] - for ii in atom_lines: - atom_info = _atom_info_style(ii, atom_style) - posis.append([atom_info["x"], atom_info["y"], atom_info["z"]]) - return np.array(posis) - - -def get_charges(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: - """Get atomic charges from LAMMPS data file if the atom style supports charges. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray or None - Array of atomic charges if atom style has charges, None otherwise - """ - if atom_style not in ATOM_STYLE_COLUMNS: - raise ValueError(f"Unsupported atom style: {atom_style}") - - # Check if this atom style has charges - if not ATOM_STYLE_COLUMNS[atom_style][6]: # has_charge - return None - - atom_lines = get_atoms(lines) - charges = [] - for ii in atom_lines: - atom_info = _atom_info_style(ii, atom_style) - charges.append(atom_info["charge"]) - return np.array(charges) - - -def get_spins(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: - atom_lines = get_atoms(lines) - if len(atom_lines[0].split()) < 8: - return None - spins_ori = [] - spins_norm = [] - for ii in atom_lines: - iis = ii.split() - spins_ori.append([float(jj) for jj in iis[5:8]]) - spins_norm.append([float(iis[-1])]) - return np.array(spins_ori) * np.array(spins_norm) - - -def get_lmpbox(lines): - box_info = [] - tilt = np.zeros(3) - for ii in lines: - if "xlo" in ii and "xhi" in ii: - box_info.append([float(ii.split()[0]), float(ii.split()[1])]) - break - for ii in lines: - if "ylo" in ii and "yhi" in ii: - box_info.append([float(ii.split()[0]), float(ii.split()[1])]) - break - for ii in lines: - if "zlo" in ii and "zhi" in ii: - box_info.append([float(ii.split()[0]), 
float(ii.split()[1])]) - break - for ii in lines: - if "xy" in ii and "xz" in ii and "yz" in ii: - tilt = np.array([float(jj) for jj in ii.split()[0:3]]) - return box_info, tilt - - -def system_data( - lines: list[str], - type_map: list[str] | None = None, - type_idx_zero: bool = True, - atom_style: str = "atomic", -) -> dict: - """Parse LAMMPS data file to system data format. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_map : list, optional - Mapping from atom types to element names - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style (atomic, full, charge, etc.) - - Returns - ------- - dict - System data dictionary - """ - system = {} - system["atom_numbs"] = get_natoms_vec(lines, atom_style=atom_style) - system["atom_names"] = [] - if type_map is None: - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append("Type_%d" % ii) # noqa: UP031 - else: - assert len(type_map) >= len(system["atom_numbs"]) - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append(type_map[ii]) - lohi, tilt = get_lmpbox(lines) - orig, cell = lmpbox2box(lohi, tilt) - system["orig"] = np.array(orig) - system["cells"] = [np.array(cell)] - natoms = sum(system["atom_numbs"]) - system["atom_types"] = get_atype( - lines, type_idx_zero=type_idx_zero, atom_style=atom_style - ) - system["coords"] = [get_posi(lines, atom_style=atom_style)] - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - - # Add charges if the atom style supports them - charges = get_charges(lines, atom_style=atom_style) - if charges is not None: - system["charges"] = np.array([charges]) - - spins = get_spins(lines, atom_style=atom_style) - if spins is not None: - system["spins"] = np.array([spins]) - - return system - - -def to_system_data( - lines: list[str], - type_map: list[str] | None = None, - type_idx_zero: bool = True, - atom_style: str = 
"atomic", -) -> dict: - """Parse LAMMPS data file to system data format. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_map : list, optional - Mapping from atom types to element names - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style. If "auto", attempts to detect automatically - from file. Default is "atomic". - - Returns - ------- - dict - System data dictionary - """ - # Attempt automatic detection if requested - if atom_style == "auto": - detected_style = detect_atom_style(lines) - if detected_style: - atom_style = detected_style - else: - atom_style = "atomic" # fallback to default - - return system_data( - lines, type_map=type_map, type_idx_zero=type_idx_zero, atom_style=atom_style - ) - - -def rotate_to_lower_triangle( - cell: np.ndarray, coord: np.ndarray -) -> tuple[np.ndarray, np.ndarray]: - """Rotate the cell to lower triangular and ensure the diagonal elements are non-negative. - - Args: - cell (np.ndarray): The original cell matrix. - coord (np.ndarray): The coordinates of the atoms. - - Returns - ------- - tuple[np.ndarray, np.ndarray]: The rotated cell and adjusted coordinates. - """ - q, _ = np.linalg.qr(cell.T) - cell = np.matmul(cell, q) - coord = np.matmul(coord, q) - - # Ensure the diagonal elements of the cell are non-negative - rot = np.eye(3) - if cell[0][0] < 0: - rot[0][0] = -1 - if cell[1][1] < 0: - rot[1][1] = -1 - if cell[2][2] < 0: - rot[2][2] = -1 - cell = np.matmul(cell, rot) - coord = np.matmul(coord, rot) - return cell, coord - - -def _get_lammps_masses(system) -> np.ndarray | None: - """Get masses for the LAMMPS ``Masses`` section. - - Prefer explicitly stored masses when available. Otherwise, infer masses from - ``atom_names`` when all names are valid chemical element symbols. 
- - Parameters - ---------- - system : dict - System data dictionary - - Returns - ------- - np.ndarray or None - Per-type masses aligned with ``atom_names``. Returns ``None`` when the - masses cannot be determined safely. - - Raises - ------ - ValueError - If explicit ``system["masses"]`` is present but does not match the - length of ``atom_names``. - """ - atom_names = system["atom_names"] - masses = system.get("masses") - if masses is not None: - masses = np.asarray(masses, dtype=float) - if masses.ndim != 1 or len(masses) != len(atom_names): - raise ValueError( - 'Explicit system["masses"] must be a 1D array with the same ' - 'length as system["atom_names"] to write the LAMMPS Masses ' - "section." - ) - return masses - - if not all(name in ELEMENTS for name in atom_names): - return None - - return np.array([Element(name).mass for name in atom_names], dtype=float) - - -def from_system_data(system, f_idx=0): - ret = "" - ret += "\n" - natoms = sum(system["atom_numbs"]) - ntypes = len(system["atom_numbs"]) - cell, coord = rotate_to_lower_triangle( - system["cells"][f_idx], system["coords"][f_idx] - ) - ret += "%d atoms\n" % natoms # noqa: UP031 - ret += "%d atom types\n" % ntypes # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " xlo xhi\n") % ( - 0, - cell[0][0], - ) # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " ylo yhi\n") % ( - 0, - cell[1][1], - ) # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " zlo zhi\n") % ( - 0, - cell[2][2], - ) # noqa: UP031 - ret += ( - ptr_float_fmt + " " + ptr_float_fmt + " " + ptr_float_fmt + " xy xz yz\n" - ) % ( - cell[1][0], - cell[2][0], - cell[2][1], - ) # noqa: UP031 - ret += "\n" - - masses = _get_lammps_masses(system) - if masses is not None: - ret += "Masses\n" - ret += "\n" - mass_fmt = ptr_int_fmt + " " + ptr_float_fmt + " # %s\n" # noqa: UP031 - for ii, (mass, atom_name) in enumerate(zip(masses, system["atom_names"])): - ret += mass_fmt % (ii + 1, mass, atom_name) - ret += 
"\n" - - ret += "Atoms # atomic\n" - ret += "\n" - coord_fmt = ( - ptr_int_fmt - + " " - + ptr_int_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + "\n" - ) # noqa: UP031 - - if "spins" in system: - coord_fmt = ( - coord_fmt.strip("\n") - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + "\n" - ) # noqa: UP031 - spins_norm = np.linalg.norm(system["spins"][f_idx], axis=1) - for ii in range(natoms): - if "spins" in system: - if spins_norm[ii] != 0: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - system["spins"][f_idx][ii][0] / spins_norm[ii], - system["spins"][f_idx][ii][1] / spins_norm[ii], - system["spins"][f_idx][ii][2] / spins_norm[ii], - spins_norm[ii], - ) # noqa: UP031 - else: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - system["spins"][f_idx][ii][0], - system["spins"][f_idx][ii][1], - system["spins"][f_idx][ii][2] + 1, - spins_norm[ii], - ) # noqa: UP031 - else: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - ) # noqa: UP031 - return ret - - -if __name__ == "__main__": - fname = "water-SPCE.data" - lines = open(fname).read().split("\n") - bonds, tilt = get_lmpbox(lines) - # print(bonds, tilt) - orig, box = lmpbox2box(bonds, tilt) - # print(orig, box) - bonds1, tilt1 = box2lmpbox(orig, box) - # print(bonds1, tilt1) - print(bonds1 - bonds) - print(tilt1 - tilt) - print(box) - print(get_atype(lines)) - print(get_posi(lines)) +from dpdata.formats.lammps.lmp import * # noqa: F403 diff --git a/dpdata/lmdb/__init__.py b/dpdata/lmdb/__init__.py index 53a3e8f0e..dd764bd55 100644 --- 
a/dpdata/lmdb/__init__.py +++ b/dpdata/lmdb/__init__.py @@ -1,5 +1,3 @@ from __future__ import annotations -from .format import LMDBFormat - -__all__ = ["LMDBFormat"] +from dpdata.formats.lmdb import * # noqa: F403 diff --git a/dpdata/lmdb/format.py b/dpdata/lmdb/format.py index 9b518be6b..3db613b44 100644 --- a/dpdata/lmdb/format.py +++ b/dpdata/lmdb/format.py @@ -1,286 +1,3 @@ from __future__ import annotations -import os - -import lmdb -import msgpack -import msgpack_numpy as m -import numpy as np - -from dpdata.format import Format - -m.patch() - - -class LMDBError(Exception): - """Base class for LMDB errors.""" - - -class LMDBMetadataError(LMDBError): - """Metadata not found in LMDB.""" - - -class LMDBFrameError(LMDBError): - """Frame data not found in LMDB.""" - - -class LMDBFormat(Format): - """ - Class for handling the LMDB format, which stores atomic configurations in a - Lightning Memory-Mapped Database (LMDB). - - This format is optimized for machine learning workflows where fast, random - access to a large number of frames is required. All frames from multiple - systems (with potentially different numbers of atoms) are stored in a - single LMDB database file. - - Both single systems and multiple systems are supported via the standard - ``dpdata`` APIs. 
- - Examples - -------- - **Saving a single LabeledSystem** - - >>> import dpdata - >>> system = dpdata.LabeledSystem("path/to/input.vasp", fmt="vasp/outcar") - >>> system.to("lmdb", "my_single_system.lmdb") - - **Loading a single LabeledSystem** - - >>> loaded_system = dpdata.LabeledSystem("my_single_system.lmdb", fmt="lmdb") - - **Saving multiple systems to a single LMDB database** - - >>> import dpdata - >>> system_1 = dpdata.LabeledSystem("path/to/system1/OUTCAR", fmt="vasp/outcar") - >>> system_2 = dpdata.LabeledSystem("path/to/system2/OUTCAR", fmt="vasp/outcar") - >>> multi_systems_obj = dpdata.MultiSystems(system_1, system_2) - >>> multi_systems_obj.to("lmdb", "my_multi_system_db.lmdb") - - **Loading multiple systems from a single LMDB database** - - >>> import dpdata - >>> loaded_multi_systems = dpdata.MultiSystems.from_file("my_multi_system_db.lmdb", fmt="lmdb") - """ - - def to_multi_systems( - self, formulas, directory, map_size=1000000000, frame_idx_fmt="012d", **kwargs - ): - """Implement MultiSystems.to for LMDB format. - - Parameters - ---------- - formulas : list[str] - list of formulas - directory : str - directory of system - map_size : int, optional - Maximum size of the LMDB database in bytes. Default is 1GB. - frame_idx_fmt : str, optional - The format string used to encode the frame index as a key. Default is "012d". 
- **kwargs : dict - other parameters - - Yields - ------ - tuple - (self, formula) to be used by to_system - """ - self._frame_idx_fmt = frame_idx_fmt - self._global_frame_idx = 0 - self._system_info = [] - os.makedirs(directory, exist_ok=True) - with lmdb.open(directory, map_size=map_size) as env: - with env.begin(write=True) as txn: - self._txn = txn - for ff in formulas: - yield (self, ff) - # Finalize metadata - metadata = { - "nframes": self._global_frame_idx, - "system_info": self._system_info, - "frame_idx_fmt": self._frame_idx_fmt, - } - txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) - self._txn = None - - def _dump_to_txn(self, data, txn, formula, dtypes): - from dpdata.data_type import Axis - - nframes = data["coords"].shape[0] - - # Identify symbolic shapes and frame-dependent keys - data_shapes = {} - frame_dependent_keys = [] - for dt in dtypes: - if dt.name in data: - if dt.shape is not None: - data_shapes[dt.name] = [ - s.value if isinstance(s, Axis) else s for s in dt.shape - ] - if Axis.NFRAMES in dt.shape: - frame_dependent_keys.append(dt.name) - else: - data_shapes[dt.name] = None - - # Record system info - # natoms needs to be extracted from data - if "atom_numbs" in data: - natoms_list = data["atom_numbs"] - else: - # Fallback for systems without atom_numbs (should not happen in valid dpdata systems) - natoms_list = [] - - self._system_info.append( - { - "formula": formula, - "natoms": natoms_list, - "nframes": nframes, - "start_idx": self._global_frame_idx, - "data_shapes": data_shapes, - "frame_dependent_keys": frame_dependent_keys, - } - ) - - for i in range(nframes): - frame_data = {} - for key, val in data.items(): - if key in frame_dependent_keys: - frame_data[key] = val[i] - else: - frame_data[key] = val - - key = f"{self._global_frame_idx:{self._frame_idx_fmt}}".encode("ascii") - value = msgpack.packb(frame_data, use_bin_type=True) - txn.put(key, value) - self._global_frame_idx += 1 - - def to_labeled_system(self, 
data, file_name, **kwargs): - """Save a single LabeledSystem to an LMDB database.""" - from dpdata.system import LabeledSystem - - if isinstance(file_name, tuple) and file_name[0] is self: - txn, formula = self._txn, file_name[1] - self._dump_to_txn(data, txn, formula, LabeledSystem.DTYPES) - else: - # Single system call: use to_multi_systems logic - # Infer formula from data if possible, or use default - formula = kwargs.get("formula", "unknown") - gen = self.to_multi_systems([formula], file_name, **kwargs) - handle = next(gen) - self.to_labeled_system(data, handle, **kwargs) - try: - next(gen) - except StopIteration: - pass - - def to_system(self, data, file_name, **kwargs): - """Save a single System to an LMDB database.""" - from dpdata.system import System - - if isinstance(file_name, tuple) and file_name[0] is self: - txn, formula = self._txn, file_name[1] - self._dump_to_txn(data, txn, formula, System.DTYPES) - else: - # Single system call - formula = kwargs.get("formula", "unknown") - gen = self.to_multi_systems([formula], file_name, **kwargs) - handle = next(gen) - self.to_system(data, handle, **kwargs) - try: - next(gen) - except StopIteration: - pass - - def from_multi_systems(self, file_name, map_size=1000000000, **kwargs): - """Load multiple systems from a single LMDB database. - - Parameters - ---------- - file_name : str - The path to the LMDB database directory. - map_size : int, optional - Maximum size of the LMDB database in bytes. 
- **kwargs : dict - other parameters - - Yields - ------ - dict - data dictionary for each system - """ - from dpdata.data_type import Axis, DataType - from dpdata.system import LabeledSystem, System - - with lmdb.open(file_name, readonly=True) as env: - with env.begin() as txn: - metadata_packed = txn.get(b"__metadata__") - if metadata_packed is None: - raise LMDBMetadataError("LMDB database does not contain metadata.") - metadata = msgpack.unpackb(metadata_packed, raw=False) - frame_idx_fmt = metadata.get("frame_idx_fmt", "012d") - - for sys_info in metadata["system_info"]: - system_frames = [] - start_idx = sys_info["start_idx"] - nframes = sys_info["nframes"] - data_shapes = sys_info.get("data_shapes", {}) - frame_dependent_keys = sys_info.get("frame_dependent_keys", []) - - for i in range(start_idx, start_idx + nframes): - key = f"{i:{frame_idx_fmt}}".encode("ascii") - value = txn.get(key) - if value is None: - raise LMDBFrameError(f"Frame data not found for key: {key}") - frame_data = msgpack.unpackb(value, raw=False) - system_frames.append(frame_data) - - # Aggregate data for one system - first_frame = system_frames[0] - is_labeled = "energies" in first_frame - cls = LabeledSystem if is_labeled else System - - # Auto-register unknown data types - existing_dt_names = [dt.name for dt in cls.DTYPES] - new_dts = [] - axis_map = {a.value: a for a in Axis} - for key, val in first_frame.items(): - if key not in existing_dt_names and key in data_shapes: - shape_raw = data_shapes[key] - if shape_raw is not None: - shape = tuple([axis_map.get(s, s) for s in shape_raw]) - else: - shape = None - - v_arr = np.array(val) - new_dts.append( - DataType(key, type(v_arr), shape=shape, required=False) - ) - - if new_dts: - cls.register_data_type(*new_dts) - - agg_data = {} - for key, val in first_frame.items(): - if key in frame_dependent_keys: - agg_data[key] = np.array([d[key] for d in system_frames]) - else: - agg_data[key] = val - - yield agg_data - - def 
from_labeled_system(self, file_name, **kwargs): - """Load data for a single LabeledSystem from an LMDB database.""" - if isinstance(file_name, dict): - return file_name - # from_multi_systems returns a generator of dicts - gen = self.from_multi_systems(file_name, **kwargs) - return next(gen) - - def from_system(self, file_name, **kwargs): - """Load data for a single System from an LMDB database.""" - if isinstance(file_name, dict): - return file_name - # from_multi_systems returns a generator of dicts - gen = self.from_multi_systems(file_name, **kwargs) - return next(gen) +from dpdata.formats.lmdb.format import * # noqa: F403 diff --git a/dpdata/openmx/__init__.py b/dpdata/openmx/__init__.py index e69de29bb..16d07d584 100644 --- a/dpdata/openmx/__init__.py +++ b/dpdata/openmx/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.openmx import * # noqa: F403 diff --git a/dpdata/openmx/omx.py b/dpdata/openmx/omx.py index 89f853687..4b0635a96 100644 --- a/dpdata/openmx/omx.py +++ b/dpdata/openmx/omx.py @@ -1,200 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) - -ry2ev = EnergyConversion("rydberg", "eV").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - -import warnings -from collections import OrderedDict - - -def load_atom(lines): - atom_names = [] - atom_names_mode = False - for line in lines: - if "" in line: - atom_names_mode = False - elif atom_names_mode: - parts = line.split() - atom_names.append(parts[1]) - atom_names_original = 
atom_names - atom_names = list(OrderedDict.fromkeys(set(atom_names))) - atom_names = sorted( - atom_names, key=atom_names_original.index - ) # Unique ordering of atomic species - ntypes = len(atom_names) - atom_numbs = [0] * ntypes - atom_types = [] - atom_types_mode = False - for line in lines: - if "" in line: - atom_types_mode = False - elif atom_types_mode: - parts = line.split() - for i, atom_name in enumerate(atom_names): - if parts[1] == atom_name: - atom_numbs[i] += 1 - atom_types.append(i) - atom_types = np.array(atom_types) - return atom_names, atom_types, atom_numbs - - -def load_cells(lines): - cells = [] - for line in lines: - if "Cell_Vectors=" in line: - part = line.split("Cell_Vectors=")[1] - parts = part.split() - values = list(map(float, parts[:9])) - cell = [values[0:3], values[3:6], values[6:9]] - cells.append(cell) - # Checking SCF converged or not - for token in line.split(): - if token.startswith("scf_conv="): - scf_conv = int(token.split("=")[1]) - if scf_conv == 0: - warnings.warn("SCF not converged!", stacklevel=2) - cells = np.array(cells) - return cells - - -# load atom_names, atom_numbs, atom_types, cells -def load_param_file(fname: FileType, mdname: FileType): - with open_file(fname) as dat_file: - lines = dat_file.readlines() - atom_names, atom_types, atom_numbs = load_atom(lines) - - with open_file(mdname) as md_file: - lines = md_file.readlines() - cells = load_cells(lines) - return atom_names, atom_numbs, atom_types, cells - - -def load_coords(lines, atom_names, natoms): - cnt = 0 - coord, coords = [], [] - for line in lines: - if "time=" in line: - continue - for atom_name in atom_names: - atom_name += " " - if atom_name in line: - cnt += 1 - parts = line.split() - for_line = [float(parts[1]), float(parts[2]), float(parts[3])] - coord.append(for_line) - if cnt == natoms: - coords.append(coord) - cnt = 0 - coord = [] - coords = np.array(coords) - return coords - - -def load_data(mdname: FileType, atom_names, natoms): - with 
open_file(mdname) as md_file: - lines = md_file.readlines() - coords = load_coords(lines, atom_names, natoms) - steps = [str(i) for i in range(1, coords.shape[0] + 1)] - return coords, steps - - -def to_system_data(fname: FileType, mdname: FileType): - data = {} - ( - data["atom_names"], - data["atom_numbs"], - data["atom_types"], - data["cells"], - ) = load_param_file(fname, mdname) - data["coords"], steps = load_data( - mdname, - data["atom_names"], - np.sum(data["atom_numbs"]), - ) - data["orig"] = np.zeros(3) - return data, steps - - -def load_energy(lines): - energy = [] - for line in lines: - if "time=" in line: - parts = line.split() - ene_line = float(parts[4]) # Hartree - energy.append(ene_line) - continue - energy = energy_convert * np.array(energy) # Hartree -> eV - return energy - - -def load_force(lines, atom_names, atom_numbs): - cnt = 0 - field, fields = [], [] - for line in lines: - if "time=" in line: - continue - for atom_name in atom_names: - atom_name += " " - if atom_name in line: - cnt += 1 - parts = line.split() - for_line = [float(parts[4]), float(parts[5]), float(parts[6])] - field.append(for_line) - if cnt == np.sum(atom_numbs): - fields.append(field) - cnt = 0 - field = [] - force = force_convert * np.array(fields) - return force - - -# load energy, force -def to_system_label(fname, mdname): - atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) - with open_file(mdname) as md_file: - lines = md_file.readlines() - energy = load_energy(lines) - force = load_force(lines, atom_names, atom_numbs) - return energy, force - - -if __name__ == "__main__": - file_name = "Au111Surface" - fname = f"{file_name}.dat" - mdname = f"{file_name}.md" - atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) - coords, steps = load_data(mdname, atom_names, np.sum(atom_numbs)) - data, steps = to_system_data(fname, mdname) - energy, force = to_system_label(fname, mdname) - print(atom_names) - print(atom_numbs) - 
print(atom_types) -# print(cells.shape) -# print(coords.shape) -# print(len(energy)) -# print(force.shape) +from dpdata.formats.openmx.omx import * # noqa: F403 diff --git a/dpdata/orca/__init__.py b/dpdata/orca/__init__.py index e69de29bb..ff93fd88b 100644 --- a/dpdata/orca/__init__.py +++ b/dpdata/orca/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.orca import * # noqa: F403 diff --git a/dpdata/orca/output.py b/dpdata/orca/output.py index a0915162b..be44132cc 100644 --- a/dpdata/orca/output.py +++ b/dpdata/orca/output.py @@ -1,73 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_orca_sp_output( - fn: FileType, -) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]: - """Read from ORCA output. - - Note that both the energy and the gradient should be printed. - - Parameters - ---------- - fn : str - file name - - Returns - ------- - np.ndarray - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - """ - coord = None - symbols = None - forces = None - energy = None - with open_file(fn) as f: - flag = 0 - for line in f: - if flag in (1, 3, 4): - flag += 1 - elif flag == 2: - s = line.split() - if not len(s): - flag = 0 - else: - symbols.append(s[0].capitalize()) - coord.append([float(s[1]), float(s[2]), float(s[3])]) - elif flag == 5: - s = line.split() - if not len(s): - flag = 0 - else: - forces.append([float(s[3]), float(s[4]), float(s[5])]) - elif line.startswith("CARTESIAN COORDINATES (ANGSTROEM)"): - # coord - flag = 1 - coord = [] - symbols = [] - elif line.startswith("CARTESIAN GRADIENT"): - flag = 3 - forces = [] - elif line.startswith("FINAL SINGLE POINT ENERGY"): - energy = float(line.split()[-1]) - symbols = np.array(symbols) - forces = -np.array(forces) - coord = np.array(coord) - assert coord.shape 
== forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.orca.output import * # noqa: F403 diff --git a/dpdata/plugins/3dmol.py b/dpdata/plugins/3dmol.py index 56ec25161..d3ce0e9a7 100644 --- a/dpdata/plugins/3dmol.py +++ b/dpdata/plugins/3dmol.py @@ -3,7 +3,7 @@ import numpy as np from dpdata.format import Format -from dpdata.xyz.xyz import coord_to_xyz +from dpdata.formats.xyz.xyz import coord_to_xyz @Format.register("3dmol") diff --git a/dpdata/plugins/abacus.py b/dpdata/plugins/abacus.py index 0423bda58..c8736fe39 100644 --- a/dpdata/plugins/abacus.py +++ b/dpdata/plugins/abacus.py @@ -5,12 +5,12 @@ import numpy as np -import dpdata.abacus.md -import dpdata.abacus.relax -import dpdata.abacus.scf -from dpdata.abacus.stru import get_frame_from_stru, make_unlabeled_stru +import dpdata.formats.abacus.md +import dpdata.formats.abacus.relax +import dpdata.formats.abacus.scf from dpdata.data_type import Axis, DataType from dpdata.format import Format +from dpdata.formats.abacus.stru import get_frame_from_stru, make_unlabeled_stru from dpdata.utils import open_file if TYPE_CHECKING: @@ -90,7 +90,7 @@ def register_move_data(data): class AbacusSCFFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, **kwargs): - data = dpdata.abacus.scf.get_frame(file_name) + data = dpdata.formats.abacus.scf.get_frame(file_name) register_mag_data(data) register_move_data(data) return data @@ -102,7 +102,7 @@ def from_labeled_system(self, file_name, **kwargs): class AbacusMDFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, **kwargs): - data = dpdata.abacus.md.get_frame(file_name) + data = dpdata.formats.abacus.md.get_frame(file_name) register_mag_data(data) register_move_data(data) return data @@ -114,7 +114,7 @@ def from_labeled_system(self, file_name, **kwargs): class AbacusRelaxFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, 
**kwargs): - data = dpdata.abacus.relax.get_frame(file_name) + data = dpdata.formats.abacus.relax.get_frame(file_name) register_mag_data(data) register_move_data(data) return data diff --git a/dpdata/plugins/amber.py b/dpdata/plugins/amber.py index c51af3465..60cfc393c 100644 --- a/dpdata/plugins/amber.py +++ b/dpdata/plugins/amber.py @@ -4,8 +4,8 @@ import subprocess as sp import tempfile -import dpdata.amber.md -import dpdata.amber.sqm +import dpdata.formats.amber.md +import dpdata.formats.amber.sqm from dpdata.driver import Driver, Minimizer from dpdata.format import Format from dpdata.utils import open_file @@ -26,7 +26,7 @@ def from_system( parm7_file = file_name + ".parm7" if nc_file is None: nc_file = file_name + ".nc" - return dpdata.amber.md.read_amber_traj( + return dpdata.formats.amber.md.read_amber_traj( parm7_file=parm7_file, nc_file=nc_file, use_element_symbols=use_element_symbols, @@ -55,7 +55,7 @@ def from_labeled_system( mden_file = file_name + ".mden" if mdout_file is None: mdout_file = file_name + ".mdout" - return dpdata.amber.md.read_amber_traj( + return dpdata.formats.amber.md.read_amber_traj( parm7_file, nc_file, mdfrc_file, mden_file, mdout_file, use_element_symbols ) @@ -64,11 +64,11 @@ def from_labeled_system( class SQMOutFormat(Format): def from_system(self, fname, **kwargs): """Read from ambertools sqm.out.""" - return dpdata.amber.sqm.parse_sqm_out(fname) + return dpdata.formats.amber.sqm.parse_sqm_out(fname) def from_labeled_system(self, fname, **kwargs): """Read from ambertools sqm.out.""" - data = dpdata.amber.sqm.parse_sqm_out(fname) + data = dpdata.formats.amber.sqm.parse_sqm_out(fname) assert "forces" in list(data.keys()), f"No forces in {fname}" return data @@ -104,7 +104,7 @@ def to_system(self, data, fname=None, frame_idx=0, **kwargs): mult : int, default=1 multiplicity. Only 1 is allowed. 
""" - return dpdata.amber.sqm.make_sqm_in(data, fname, frame_idx, **kwargs) + return dpdata.formats.amber.sqm.make_sqm_in(data, fname, frame_idx, **kwargs) @Driver.register("sqm") diff --git a/dpdata/plugins/cp2k.py b/dpdata/plugins/cp2k.py index f5c1b5394..61f2eaf9d 100644 --- a/dpdata/plugins/cp2k.py +++ b/dpdata/plugins/cp2k.py @@ -2,9 +2,9 @@ import glob -import dpdata.cp2k.output -from dpdata.cp2k.output import Cp2kSystems +import dpdata.formats.cp2k.output from dpdata.format import Format +from dpdata.formats.cp2k.output import Cp2kSystems string_warning = """ Hi, you got an error from dpdata, @@ -42,7 +42,7 @@ def from_labeled_system(self, file_name, restart=False, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.cp2k.output.get_frames(file_name) + ) = dpdata.formats.cp2k.output.get_frames(file_name) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 860f52d02..99bd9b237 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -6,10 +6,10 @@ import numpy as np import dpdata -import dpdata.deepmd.comp -import dpdata.deepmd.hdf5 -import dpdata.deepmd.mixed -import dpdata.deepmd.raw +import dpdata.formats.deepmd.comp +import dpdata.formats.deepmd.hdf5 +import dpdata.formats.deepmd.mixed +import dpdata.formats.deepmd.raw from dpdata.data_type import Axis, DataType from dpdata.driver import Driver from dpdata.format import Format @@ -45,17 +45,17 @@ def register_spin(): class DeePMDRawFormat(Format): def from_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.raw.to_system_data( + return dpdata.formats.deepmd.raw.to_system_data( file_name, type_map=type_map, labels=False ) def to_system(self, data, file_name, **kwargs): """Dump the system in deepmd raw format to directory `file_name`.""" - dpdata.deepmd.raw.dump(file_name, data) + dpdata.formats.deepmd.raw.dump(file_name, data) def 
from_labeled_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.raw.to_system_data( + return dpdata.formats.deepmd.raw.to_system_data( file_name, type_map=type_map, labels=True ) @@ -67,7 +67,7 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): class DeePMDCompFormat(Format): def from_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.comp.to_system_data( + return dpdata.formats.deepmd.comp.to_system_data( file_name, type_map=type_map, labels=False ) @@ -92,11 +92,13 @@ def to_system(self, data, file_name, set_size=5000, prec=np.float64, **kwargs): **kwargs : dict other parameters """ - dpdata.deepmd.comp.dump(file_name, data, set_size=set_size, comp_prec=prec) + dpdata.formats.deepmd.comp.dump( + file_name, data, set_size=set_size, comp_prec=prec + ) def from_labeled_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.comp.to_system_data( + return dpdata.formats.deepmd.comp.to_system_data( file_name, type_map=type_map, labels=True ) @@ -130,7 +132,7 @@ class DeePMDMixedFormat(Format): """ def from_system_mix(self, file_name, type_map=None, **kwargs): - return dpdata.deepmd.mixed.to_system_data( + return dpdata.formats.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=False ) @@ -155,10 +157,12 @@ def to_system( **kwargs : dict other parameters """ - dpdata.deepmd.mixed.dump(file_name, data, set_size=set_size, comp_prec=prec) + dpdata.formats.deepmd.mixed.dump( + file_name, data, set_size=set_size, comp_prec=prec + ) def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): - return dpdata.deepmd.mixed.to_system_data( + return dpdata.formats.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=True ) @@ -193,7 +197,7 @@ def mix_system(self, *system, type_map, atom_numb_pad=None, **kwargs): >>> import dpdata >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) """ 
- return dpdata.deepmd.mixed.mix_system( + return dpdata.formats.deepmd.mixed.mix_system( *system, type_map=type_map, atom_numb_pad=atom_numb_pad, **kwargs ) @@ -257,14 +261,14 @@ def _from_system( register_spin() if isinstance(file_name, (h5py.Group, h5py.File)): - return dpdata.deepmd.hdf5.to_system_data( + return dpdata.formats.deepmd.hdf5.to_system_data( file_name, "", type_map=type_map, labels=labels ) elif isinstance(file_name, str): s = file_name.split("#") name = s[1] if len(s) > 1 else "" with h5py.File(s[0], "r") as f: - return dpdata.deepmd.hdf5.to_system_data( + return dpdata.formats.deepmd.hdf5.to_system_data( f, name, type_map=type_map, labels=labels ) else: @@ -357,14 +361,14 @@ def to_system( import h5py if isinstance(file_name, (h5py.Group, h5py.File)): - dpdata.deepmd.hdf5.dump( + dpdata.formats.deepmd.hdf5.dump( file_name, "", data, set_size=set_size, comp_prec=comp_prec ) elif isinstance(file_name, str): s = file_name.split("#") name = s[1] if len(s) > 1 else "" with h5py.File(s[0], "w") as f: - dpdata.deepmd.hdf5.dump( + dpdata.formats.deepmd.hdf5.dump( f, name, data, set_size=set_size, comp_prec=comp_prec ) else: diff --git a/dpdata/plugins/dftbplus.py b/dpdata/plugins/dftbplus.py index 247fedc9e..191576e05 100644 --- a/dpdata/plugins/dftbplus.py +++ b/dpdata/plugins/dftbplus.py @@ -2,8 +2,8 @@ import numpy as np -from dpdata.dftbplus.output import read_dftb_plus from dpdata.format import Format +from dpdata.formats.dftbplus.output import read_dftb_plus from dpdata.unit import EnergyConversion, ForceConversion energy_convert = EnergyConversion("hartree", "eV").value() diff --git a/dpdata/plugins/fhi_aims.py b/dpdata/plugins/fhi_aims.py index 3c198aff6..310496e2a 100644 --- a/dpdata/plugins/fhi_aims.py +++ b/dpdata/plugins/fhi_aims.py @@ -1,6 +1,6 @@ from __future__ import annotations -import dpdata.fhi_aims.output +import dpdata.formats.fhi_aims.output from dpdata.format import Format @@ -20,7 +20,7 @@ def from_labeled_system( 
data["energies"], data["forces"], tmp_virial, - ) = dpdata.fhi_aims.output.get_frames( + ) = dpdata.formats.fhi_aims.output.get_frames( file_name, md=md, begin=begin, @@ -45,7 +45,9 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.fhi_aims.output.get_frames(file_name, md=False, begin=0, step=1) + ) = dpdata.formats.fhi_aims.output.get_frames( + file_name, md=False, begin=0, step=1 + ) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/gaussian.py b/dpdata/plugins/gaussian.py index d2c0f7237..bfc8b273d 100644 --- a/dpdata/plugins/gaussian.py +++ b/dpdata/plugins/gaussian.py @@ -7,9 +7,9 @@ import numpy as np -import dpdata.gaussian.fchk -import dpdata.gaussian.gjf -import dpdata.gaussian.log +import dpdata.formats.gaussian.fchk +import dpdata.formats.gaussian.gjf +import dpdata.formats.gaussian.log from dpdata.data_type import Axis, DataType from dpdata.driver import Driver from dpdata.format import Format @@ -35,7 +35,7 @@ def register_hessian_data(data): class GaussianLogFormat(Format): def from_labeled_system(self, file_name: FileType, md=False, **kwargs): try: - return dpdata.gaussian.log.to_system_data(file_name, md=md) + return dpdata.formats.gaussian.log.to_system_data(file_name, md=md) except AssertionError: return {"energies": [], "forces": [], "nopbc": True} @@ -46,7 +46,7 @@ def from_labeled_system( self, file_name: FileType, has_forces=True, has_hessian=True, **kwargs ): try: - data = dpdata.gaussian.fchk.to_system_data( + data = dpdata.formats.gaussian.fchk.to_system_data( file_name, has_forces=has_forces, has_hessian=has_hessian ) register_hessian_data(data) @@ -77,7 +77,7 @@ def from_system(self, file_name: FileType, **kwargs): """ with open_file(file_name) as fp: text = fp.read() - return dpdata.gaussian.gjf.read_gaussian_input(text) + return dpdata.formats.gaussian.gjf.read_gaussian_input(text) def to_system(self, data: dict, file_name: 
FileType, **kwargs): """Generate Gaussian input file. @@ -89,9 +89,9 @@ def to_system(self, data: dict, file_name: FileType, **kwargs): file_name : str file name **kwargs : dict - Other parameters to make input files. See :meth:`dpdata.gaussian.gjf.make_gaussian_input` + Other parameters to make input files. See :meth:`dpdata.formats.gaussian.gjf.make_gaussian_input` """ - text = dpdata.gaussian.gjf.make_gaussian_input(data, **kwargs) + text = dpdata.formats.gaussian.gjf.make_gaussian_input(data, **kwargs) with open_file(file_name, "w") as fp: fp.write(text) @@ -108,7 +108,7 @@ class GaussianDriver(Driver): gaussian_exec : str, default=g16 path to gaussian program **kwargs : dict - other arguments to make input files. See :meth:`dpdata.gaussian.gjf.make_gaussian_input` + other arguments to make input files. See :meth:`dpdata.formats.gaussian.gjf.make_gaussian_input` Examples -------- diff --git a/dpdata/plugins/gromacs.py b/dpdata/plugins/gromacs.py index a7066bbcc..837b6f6d8 100644 --- a/dpdata/plugins/gromacs.py +++ b/dpdata/plugins/gromacs.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -import dpdata.gromacs.gro +import dpdata.formats.gromacs.gro from dpdata.format import Format from dpdata.utils import open_file @@ -25,7 +25,7 @@ def from_system(self, file_name, format_atom_name=True, **kwargs): **kwargs : dict other parameters """ - return dpdata.gromacs.gro.file_to_system_data( + return dpdata.formats.gromacs.gro.file_to_system_data( file_name, format_atom_name=format_atom_name, **kwargs ) @@ -49,11 +49,13 @@ def to_system( if frame_idx == -1: strs = [] for idx in range(data["coords"].shape[0]): - gro_str = dpdata.gromacs.gro.from_system_data(data, f_idx=idx, **kwargs) + gro_str = dpdata.formats.gromacs.gro.from_system_data( + data, f_idx=idx, **kwargs + ) strs.append(gro_str) gro_str = "\n".join(strs) else: - gro_str = dpdata.gromacs.gro.from_system_data( + gro_str = dpdata.formats.gromacs.gro.from_system_data( data, f_idx=frame_idx, **kwargs ) diff 
--git a/dpdata/plugins/lammps.py b/dpdata/plugins/lammps.py index b00d4ff0c..9a4622659 100644 --- a/dpdata/plugins/lammps.py +++ b/dpdata/plugins/lammps.py @@ -4,8 +4,8 @@ import numpy as np -import dpdata.lammps.dump -import dpdata.lammps.lmp +import dpdata.formats.lammps.dump +import dpdata.formats.lammps.lmp from dpdata.data_type import Axis, DataType from dpdata.format import Format from dpdata.utils import open_file @@ -103,7 +103,9 @@ def from_system( """ with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - data = dpdata.lammps.lmp.to_system_data(lines, type_map, atom_style=atom_style) + data = dpdata.formats.lammps.lmp.to_system_data( + lines, type_map, atom_style=atom_style + ) register_spin(data) register_charge(data) return data @@ -123,7 +125,7 @@ def to_system(self, data, file_name: FileType, frame_idx=0, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.lammps.lmp.from_system_data(data, frame_idx) + w_str = dpdata.formats.lammps.lmp.from_system_data(data, frame_idx) with open_file(file_name, "w") as fp: fp.write(w_str) @@ -164,8 +166,8 @@ def from_system( dict The system data """ - lines = dpdata.lammps.dump.load_file(file_name, begin=begin, step=step) - data = dpdata.lammps.dump.system_data( + lines = dpdata.formats.lammps.dump.load_file(file_name, begin=begin, step=step) + data = dpdata.formats.lammps.dump.system_data( lines, type_map, unwrap=unwrap, input_file=input_file ) register_spin(data) @@ -188,6 +190,6 @@ def to_system(self, data, file_name: FileType, frame_idx=0, timestep=0, **kwargs other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.lammps.dump.from_system_data(data, frame_idx, timestep) + w_str = dpdata.formats.lammps.dump.from_system_data(data, frame_idx, timestep) with open_file(file_name, "w") as fp: fp.write(w_str) diff --git a/dpdata/plugins/lmdb.py b/dpdata/plugins/lmdb.py index 8391c1fae..5c83dbe21 100644 --- a/dpdata/plugins/lmdb.py +++ 
b/dpdata/plugins/lmdb.py @@ -1,6 +1,6 @@ from __future__ import annotations from dpdata.format import Format -from dpdata.lmdb.format import LMDBFormat +from dpdata.formats.lmdb.format import LMDBFormat Format.register("lmdb")(LMDBFormat) diff --git a/dpdata/plugins/openmx.py b/dpdata/plugins/openmx.py index 4e16566dc..dc0af9c2e 100644 --- a/dpdata/plugins/openmx.py +++ b/dpdata/plugins/openmx.py @@ -1,7 +1,7 @@ from __future__ import annotations +import dpdata.formats.openmx.omx import dpdata.md.pbc -import dpdata.openmx.omx from dpdata.format import Format @@ -35,7 +35,7 @@ def from_system(self, file_name: str, **kwargs) -> dict: fname = f"{file_name}.dat" mdname = f"{file_name}.md" - data, _ = dpdata.openmx.omx.to_system_data(fname, mdname) + data, _ = dpdata.formats.openmx.omx.to_system_data(fname, mdname) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], @@ -61,12 +61,12 @@ def from_labeled_system(self, file_name: str, **kwargs) -> dict: fname = f"{file_name}.dat" mdname = f"{file_name}.md" - data, cs = dpdata.openmx.omx.to_system_data(fname, mdname) + data, cs = dpdata.formats.openmx.omx.to_system_data(fname, mdname) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], ) - data["energies"], data["forces"] = dpdata.openmx.omx.to_system_label( + data["energies"], data["forces"] = dpdata.formats.openmx.omx.to_system_label( fname, mdname ) return data diff --git a/dpdata/plugins/orca.py b/dpdata/plugins/orca.py index 7a0b806c9..2c436be1e 100644 --- a/dpdata/plugins/orca.py +++ b/dpdata/plugins/orca.py @@ -5,7 +5,7 @@ import numpy as np from dpdata.format import Format -from dpdata.orca.output import read_orca_sp_output +from dpdata.formats.orca.output import read_orca_sp_output from dpdata.unit import EnergyConversion, ForceConversion if TYPE_CHECKING: diff --git a/dpdata/plugins/psi4.py b/dpdata/plugins/psi4.py index 2bbfc2321..9aca1b80e 100644 --- a/dpdata/plugins/psi4.py +++ b/dpdata/plugins/psi4.py @@ -5,8 +5,8 @@ 
import numpy as np from dpdata.format import Format -from dpdata.psi4.input import write_psi4_input -from dpdata.psi4.output import read_psi4_output +from dpdata.formats.psi4.input import write_psi4_input +from dpdata.formats.psi4.output import read_psi4_output from dpdata.unit import EnergyConversion, ForceConversion from dpdata.utils import open_file diff --git a/dpdata/plugins/pwmat.py b/dpdata/plugins/pwmat.py index 38a5bb297..5ee8483c2 100644 --- a/dpdata/plugins/pwmat.py +++ b/dpdata/plugins/pwmat.py @@ -4,8 +4,8 @@ import numpy as np -import dpdata.pwmat.atomconfig -import dpdata.pwmat.movement +import dpdata.formats.pwmat.atomconfig +import dpdata.formats.pwmat.movement from dpdata.format import Format from dpdata.utils import open_file @@ -33,7 +33,7 @@ def from_labeled_system( data["energies"], tmp_force, tmp_virial, - ) = dpdata.pwmat.movement.get_frames( + ) = dpdata.formats.pwmat.movement.get_frames( file_name, begin=begin, step=step, convergence_check=convergence_check ) if tmp_force is not None: @@ -58,7 +58,7 @@ class PwmatAtomconfigFormat(Format): def from_system(self, file_name: FileType, **kwargs): with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - return dpdata.pwmat.atomconfig.to_system_data(lines) + return dpdata.formats.pwmat.atomconfig.to_system_data(lines) def to_system(self, data, file_name: FileType, frame_idx=0, *args, **kwargs): """Dump the system in pwmat atom.config format. 
@@ -77,6 +77,6 @@ def to_system(self, data, file_name: FileType, frame_idx=0, *args, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.pwmat.atomconfig.from_system_data(data, frame_idx) + w_str = dpdata.formats.pwmat.atomconfig.from_system_data(data, frame_idx) with open_file(file_name, "w") as fp: fp.write(w_str) diff --git a/dpdata/plugins/pymatgen.py b/dpdata/plugins/pymatgen.py index b8099a3ab..e9c91fc8a 100644 --- a/dpdata/plugins/pymatgen.py +++ b/dpdata/plugins/pymatgen.py @@ -2,8 +2,8 @@ import numpy as np -import dpdata.pymatgen.molecule -import dpdata.pymatgen.structure +import dpdata.formats.pymatgen.molecule +import dpdata.formats.pymatgen.structure from dpdata.format import Format @@ -24,7 +24,7 @@ def from_system(self, structure, **kwargs) -> dict: dict data dict """ - return dpdata.pymatgen.structure.from_system_data(structure) + return dpdata.formats.pymatgen.structure.from_system_data(structure) def to_system(self, data, **kwargs): """Convert System to Pymatgen Structure obj.""" @@ -56,7 +56,7 @@ def from_system(self, file_name, **kwargs): except ModuleNotFoundError as e: raise ImportError("No module pymatgen.Molecule") from e - return dpdata.pymatgen.molecule.to_system_data(file_name) + return dpdata.formats.pymatgen.molecule.to_system_data(file_name) def to_system(self, data, **kwargs): """Convert System to Pymatgen Molecule obj.""" diff --git a/dpdata/plugins/qe.py b/dpdata/plugins/qe.py index 682bb202e..b9bd84c0c 100644 --- a/dpdata/plugins/qe.py +++ b/dpdata/plugins/qe.py @@ -1,8 +1,8 @@ from __future__ import annotations +import dpdata.formats.qe.scf +import dpdata.formats.qe.traj import dpdata.md.pbc -import dpdata.qe.scf -import dpdata.qe.traj from dpdata.format import Format @@ -10,7 +10,7 @@ class QECPTrajFormat(Format): @Format.post("rot_lower_triangular") def from_system(self, file_name, begin=0, step=1, **kwargs): - data, _ = dpdata.qe.traj.to_system_data( + data, _ = 
dpdata.formats.qe.traj.to_system_data( file_name + ".in", file_name, begin=begin, step=step ) data["coords"] = dpdata.md.pbc.apply_pbc( @@ -21,14 +21,14 @@ def from_system(self, file_name, begin=0, step=1, **kwargs): @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, begin=0, step=1, **kwargs): - data, cs = dpdata.qe.traj.to_system_data( + data, cs = dpdata.formats.qe.traj.to_system_data( file_name + ".in", file_name, begin=begin, step=step ) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], ) - data["energies"], data["forces"], es = dpdata.qe.traj.to_system_label( + data["energies"], data["forces"], es = dpdata.formats.qe.traj.to_system_label( file_name + ".in", file_name, begin=begin, step=step ) assert cs == es, "the step key between files are not consistent" @@ -49,7 +49,7 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.qe.scf.get_frame(file_name) + ) = dpdata.formats.qe.scf.get_frame(file_name) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/rdkit.py b/dpdata/plugins/rdkit.py index f01b277d6..2b336a165 100644 --- a/dpdata/plugins/rdkit.py +++ b/dpdata/plugins/rdkit.py @@ -1,6 +1,6 @@ from __future__ import annotations -import dpdata.rdkit.utils +import dpdata.formats.rdkit.utils from dpdata.format import Format @@ -31,7 +31,7 @@ def from_bond_order_system(self, file_name, **kwargs): for m in rdkit.Chem.SDMolSupplier(file_name, sanitize=False, removeHs=False) ] if len(mols) > 1: - mol = dpdata.rdkit.utils.combine_molecules(mols) + mol = dpdata.formats.rdkit.utils.combine_molecules(mols) else: mol = mols[0] return mol diff --git a/dpdata/plugins/siesta.py b/dpdata/plugins/siesta.py index 906eeb51f..3404d37c9 100644 --- a/dpdata/plugins/siesta.py +++ b/dpdata/plugins/siesta.py @@ -1,7 +1,7 @@ from __future__ import annotations -import dpdata.siesta.aiMD_output -import dpdata.siesta.output +import 
dpdata.formats.siesta.aiMD_output +import dpdata.formats.siesta.output from dpdata.format import Format @@ -18,7 +18,7 @@ def from_system(self, file_name, **kwargs): _e, _f, _v, - ) = dpdata.siesta.output.obtain_frame(file_name) + ) = dpdata.formats.siesta.output.obtain_frame(file_name) return data def from_labeled_system(self, file_name, **kwargs): @@ -32,7 +32,7 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], data["virials"], - ) = dpdata.siesta.output.obtain_frame(file_name) + ) = dpdata.formats.siesta.output.obtain_frame(file_name) return data @@ -50,7 +50,7 @@ def from_system(self, file_name, **kwargs): _e, _f, _v, - ) = dpdata.siesta.aiMD_output.get_aiMD_frame(file_name) + ) = dpdata.formats.siesta.aiMD_output.get_aiMD_frame(file_name) return data def from_labeled_system(self, file_name, **kwargs): @@ -64,5 +64,5 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], data["virials"], - ) = dpdata.siesta.aiMD_output.get_aiMD_frame(file_name) + ) = dpdata.formats.siesta.aiMD_output.get_aiMD_frame(file_name) return data diff --git a/dpdata/plugins/vasp.py b/dpdata/plugins/vasp.py index 49d25ecba..84d3142b6 100644 --- a/dpdata/plugins/vasp.py +++ b/dpdata/plugins/vasp.py @@ -4,9 +4,9 @@ import numpy as np -import dpdata.vasp.outcar -import dpdata.vasp.poscar -import dpdata.vasp.xml +import dpdata.formats.vasp.outcar +import dpdata.formats.vasp.poscar +import dpdata.formats.vasp.xml from dpdata.data_type import Axis, DataType from dpdata.format import Format from dpdata.utils import open_file, uniq_atom_names @@ -36,7 +36,7 @@ class VASPPoscarFormat(Format): def from_system(self, file_name: FileType, **kwargs): with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - data = dpdata.vasp.poscar.to_system_data(lines) + data = dpdata.formats.vasp.poscar.to_system_data(lines) data = uniq_atom_names(data) register_move_data(data) return data @@ -75,7 +75,7 @@ def 
to_system(self, data, frame_idx=0, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - return dpdata.vasp.poscar.from_system_data(data, frame_idx) + return dpdata.formats.vasp.poscar.from_system_data(data, frame_idx) # rotate the system to lammps convention @@ -97,7 +97,7 @@ def from_labeled_system( data["energies"], tmp_force, tmp_virial, - ) = dpdata.vasp.outcar.get_frames( + ) = dpdata.formats.vasp.outcar.get_frames( file_name, begin=begin, step=step, @@ -136,7 +136,7 @@ def from_labeled_system( data["energies"], data["forces"], tmp_virial, - ) = dpdata.vasp.xml.analyze( + ) = dpdata.formats.vasp.xml.analyze( file_name, type_idx_zero=True, begin=begin, diff --git a/dpdata/plugins/xyz.py b/dpdata/plugins/xyz.py index d005f114f..63aaeabe3 100644 --- a/dpdata/plugins/xyz.py +++ b/dpdata/plugins/xyz.py @@ -10,8 +10,8 @@ if TYPE_CHECKING: from dpdata.utils import FileType -from dpdata.xyz.quip_gap_xyz import QuipGapxyzSystems, format_single_frame -from dpdata.xyz.xyz import coord_to_xyz, xyz_to_coord +from dpdata.formats.xyz.quip_gap_xyz import QuipGapxyzSystems, format_single_frame +from dpdata.formats.xyz.xyz import coord_to_xyz, xyz_to_coord @Format.register("xyz") diff --git a/dpdata/psi4/__init__.py b/dpdata/psi4/__init__.py index e69de29bb..06ef58244 100644 --- a/dpdata/psi4/__init__.py +++ b/dpdata/psi4/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.psi4 import * # noqa: F403 diff --git a/dpdata/psi4/input.py b/dpdata/psi4/input.py index 3959cb753..f151f0a48 100644 --- a/dpdata/psi4/input.py +++ b/dpdata/psi4/input.py @@ -1,62 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import numpy as np - -# Angston is used in Psi4 by default -template = """molecule {{ -{atoms:s} -{charge:d} {multiplicity:d} -}} -set basis {basis:s} -set gradient_write on -G, wfn = gradient("WB97M-D3BJ", return_wfn=True) -wfn.energy() -wfn.gradient().print_out() -""" - - -def 
write_psi4_input( - types: np.ndarray, - coords: np.ndarray, - method: str, - basis: str, - charge: int = 0, - multiplicity: int = 1, -) -> str: - """Write Psi4 input file. - - Parameters - ---------- - types : np.ndarray - atomic symbols - coords : np.ndarray - atomic coordinates - method : str - computational method - basis : str - basis set; see https://psicode.org/psi4manual/master/basissets_tables.html - charge : int, default=0 - charge of system - multiplicity : int, default=1 - multiplicity of system - - Returns - ------- - str - content of Psi4 input file - """ - return template.format( - atoms="\n".join( - [ - "{:s} {:16.9f} {:16.9f} {:16.9f}".format(*ii) - for ii in zip(types, *coords.T) - ] - ), - charge=charge, - multiplicity=multiplicity, - method=method, - basis=basis, - ) +from dpdata.formats.psi4.input import * # noqa: F403 diff --git a/dpdata/psi4/output.py b/dpdata/psi4/output.py index c3594ffb4..66f1e33cf 100644 --- a/dpdata/psi4/output.py +++ b/dpdata/psi4/output.py @@ -1,80 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.unit import LengthConversion -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_psi4_output(fn: FileType) -> tuple[str, np.ndarray, float, np.ndarray]: - """Read from Psi4 output. - - Note that both the energy and the gradient should be printed. 
- - Parameters - ---------- - fn : str - file name - - Returns - ------- - str - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - """ - coord = None - symbols = None - forces = None - energy = None - length_unit = None - with open_file(fn) as f: - flag = 0 - for line in f: - if flag in (1, 3, 4, 5, 6): - flag += 1 - elif flag == 2: - s = line.split() - if not len(s): - flag = 0 - else: - symbols.append(s[0].capitalize()) - coord.append([float(s[1]), float(s[2]), float(s[3])]) - elif flag == 7: - s = line.split() - if not len(s): - flag = 0 - else: - forces.append([float(s[1]), float(s[2]), float(s[3])]) - elif line.startswith( - " Center X Y Z Mass" - ): - # coord - flag = 1 - coord = [] - symbols = [] - elif line.startswith(" Geometry (in "): - # remove ), - length_unit = line.split()[2][:-2].lower() - elif line.startswith(" ## Total Gradient"): - flag = 3 - forces = [] - elif line.startswith(" Total Energy ="): - energy = float(line.split()[-1]) - assert length_unit is not None - length_convert = LengthConversion(length_unit, "angstrom").value() - symbols = np.array(symbols) - forces = -np.array(forces) - coord = np.array(coord) * length_convert - assert coord.shape == forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.psi4.output import * # noqa: F403 diff --git a/dpdata/pwmat/__init__.py b/dpdata/pwmat/__init__.py index e69de29bb..3a8bde615 100644 --- a/dpdata/pwmat/__init__.py +++ b/dpdata/pwmat/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.pwmat import * # noqa: F403 diff --git a/dpdata/pwmat/atomconfig.py b/dpdata/pwmat/atomconfig.py index 5f01c8409..c6b5928d5 100644 --- a/dpdata/pwmat/atomconfig.py +++ b/dpdata/pwmat/atomconfig.py @@ -1,95 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - -from ..periodic_table import ELEMENTS - - -def _to_system_data_lower(lines): - system = {} - natoms = 
int(lines[0].split()[0]) - cell = [] - for idx, ii in enumerate(lines): - if "lattice" in ii or "Lattice" in ii or "LATTICE" in ii: - for kk in range(idx + 1, idx + 1 + 3): - vector = [float(jj) for jj in lines[kk].split()[0:3]] - cell.append(vector) - system["cells"] = np.array([cell]) - coord = [] - atomic_number = [] - atom_numbs = [] - for idx, ii in enumerate(lines): - if "Position" in ii or "POSITION" in ii or "position" in ii: - for kk in range(idx + 1, idx + 1 + natoms): - min = kk - for jj in range(kk + 1, idx + 1 + natoms): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + natoms): - tmpv = [float(jj) for jj in lines[gg].split()[1:4]] - tmpv = np.matmul(np.array(tmpv), system["cells"][0]) - coord.append(tmpv) - tmpn = int(lines[gg].split()[0]) - atomic_number.append(tmpn) - for ii in np.unique(sorted(atomic_number)): - atom_numbs.append(atomic_number.count(ii)) - system["atom_numbs"] = [int(ii) for ii in atom_numbs] - system["coords"] = np.array([coord]) - system["orig"] = np.zeros(3) - atom_types = [] - for idx, ii in enumerate(system["atom_numbs"]): - for jj in range(ii): - atom_types.append(idx) - system["atom_types"] = np.array(atom_types, dtype=int) - system["atom_names"] = [ELEMENTS[ii - 1] for ii in np.unique(sorted(atomic_number))] - return system - - -def to_system_data(lines): - return _to_system_data_lower(lines) - - -def from_system_data(system, f_idx=0, skip_zeros=True): - ret = "" - natoms = sum(system["atom_numbs"]) - ret += "%d" % natoms # noqa: UP031 - ret += "\n" - ret += "LATTICE" - ret += "\n" - for ii in system["cells"][f_idx]: - for jj in ii: - ret += f"{jj:.16e} " - ret += "\n" - ret += "POSITION" - ret += "\n" - atom_numbs = system["atom_numbs"] - atom_names = system["atom_names"] - atype = system["atom_types"] - posis = system["coords"][f_idx] - # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] - # sort_idx = 
np.argsort(atype, kind = 'mergesort') - sort_idx = np.lexsort((np.arange(len(atype)), atype)) - atype = atype[sort_idx] - posis = posis[sort_idx] - symbal = [] - for ii, jj in zip(atom_numbs, atom_names): - for kk in range(ii): - symbal.append(jj) - atomic_numbers = [] - for ii in symbal: - atomic_numbers.append(ELEMENTS.index(ii) + 1) - posi_list = [] - for jj, ii in zip(atomic_numbers, posis): - ii = np.matmul(ii, np.linalg.inv(system["cells"][0])) - posi_list.append("%d %15.10f %15.10f %15.10f 1 1 1" % (jj, ii[0], ii[1], ii[2])) # noqa: UP031 - for kk in range(len(posi_list)): - min = kk - for jj in range(kk, len(posi_list)): - if int(posi_list[jj].split()[0]) < int(posi_list[min].split()[0]): - min = jj - posi_list[min], posi_list[kk] = posi_list[kk], posi_list[min] - posi_list.append("") - ret += "\n".join(posi_list) - return ret +from dpdata.formats.pwmat.atomconfig import * # noqa: F403 diff --git a/dpdata/pwmat/movement.py b/dpdata/pwmat/movement.py index ccfd819db..d20575b48 100644 --- a/dpdata/pwmat/movement.py +++ b/dpdata/pwmat/movement.py @@ -1,208 +1,3 @@ from __future__ import annotations -import warnings - -import numpy as np - -from ..periodic_table import ELEMENTS - - -def system_info(lines, type_idx_zero=False): - atom_names = [] - atom_numbs = [] - nelm = 0 - natoms = int(lines[0].split()[0]) - iteration = float(lines[0].split("Etot")[0].split("=")[1].split(",")[0]) - # print(iteration) - if iteration > 0: - nelm = 40 - else: - nelm = 100 - atomic_number = [] - for idx, ii in enumerate(lines): - if ("Position" in ii) and ("nonperiodic_Position" not in ii): - for kk in range(idx + 1, idx + 1 + natoms): - min = kk - for jj in range(kk + 1, idx + 1 + natoms): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + natoms): - tmpn = int(lines[gg].split()[0]) - atomic_number.append(tmpn) - for ii in np.unique(sorted(atomic_number)): - 
atom_numbs.append(atomic_number.count(ii)) - atom_types = [] - for idx, ii in enumerate(atom_numbs): - for jj in range(ii): - if type_idx_zero: - atom_types.append(idx) - else: - atom_types.append(idx + 1) - for ii in np.unique(sorted(atomic_number)): - atom_names.append(ELEMENTS[ii - 1]) - return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm - - -def get_movement_block(fp): - blk = [] - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if "------------" in ii: - return blk - return blk - - -# we assume that the force is printed ... -def get_frames(fname, begin=0, step=1, convergence_check=True): - fp = open(fname) - blk = get_movement_block(fp) - - atom_names, atom_numbs, atom_types, nelm = system_info(blk, type_idx_zero=True) - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_atomic_energy = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if cc >= begin and (cc - begin) % step == 0: - coord, cell, energy, force, virial, is_converge = analyze_block( - blk, ntot, nelm - ) - if len(coord) == 0: - break - if is_converge or not convergence_check: - all_coords.append(coord) - all_cells.append(cell) - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_movement_block(fp) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - fp.close() - return ( - atom_names, - atom_numbs, - atom_types, - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, ntot, nelm): - coord = [] - cell = [] - energy = None - # atomic_energy = [] - force = [] - virial = None - is_converge = True - sc_index = 0 - for idx, ii in enumerate(lines): - if "Iteration" in ii: - sc_index = int(ii.split("SCF =")[1]) - if sc_index >= nelm: - is_converge = False - energy = float( - ii.split("Etot,Ep,Ek (eV)")[1].split()[2] - ) # use Ep, not Etot=Ep+Ek - elif "----------" in ii: - assert (force is not None) and len(coord) > 0 and len(cell) > 0 - # all_coords.append(coord) - # all_cells.append(cell) - # all_energies.append(energy) - # all_forces.append(force) - # if virial is not None : - # all_virials.append(virial) - return coord, cell, energy, force, virial, is_converge - # elif 'NPT' in ii: - # tmp_v = [] - elif "Lattice vector" in ii: - if "stress" in lines[idx + 1]: - tmp_v = [] - for dd in range(3): - tmp_l = lines[idx + 1 + dd] - cell.append([float(ss) for ss in tmp_l.split()[0:3]]) - tmp_v.append([float(stress) for stress in tmp_l.split()[5:8]]) - virial = np.zeros([3, 3]) - virial[0][0] = tmp_v[0][0] - virial[0][1] = tmp_v[0][1] - virial[0][2] = tmp_v[0][2] - virial[1][0] = tmp_v[1][0] - virial[1][1] = tmp_v[1][1] - virial[1][2] = tmp_v[1][2] - virial[2][0] = tmp_v[2][0] - virial[2][1] = tmp_v[2][1] - virial[2][2] = tmp_v[2][2] - volume = np.linalg.det(np.array(cell)) - virial = virial * 160.2 * 10.0 / volume - else: - for dd in range(3): - tmp_l = lines[idx + 1 + dd] - cell.append([float(ss) for ss in tmp_l.split()[0:3]]) - - # else : - # for dd in range(3) : - # tmp_l = lines[idx+1+dd] - # cell.append([float(ss) - # for ss in 
tmp_l.split()[0:3]]) - # virial = np.zeros([3,3]) - elif ("Position" in ii) and ("nonperiodic_Position" not in ii): - for kk in range(idx + 1, idx + 1 + ntot): - min = kk - for jj in range(kk + 1, idx + 1 + ntot): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + ntot): - info = [float(jj) for jj in lines[gg].split()[1:4]] - info = np.matmul(np.array(info), np.array(cell)) - coord.append(info) - elif "Force" in ii: - for kk in range(idx + 1, idx + 1 + ntot): - min = kk - for jj in range(kk + 1, idx + 1 + ntot): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + ntot): - info = [ - -float(ss) for ss in lines[gg].split() - ] # forces in MOVEMENT file are dE/dR, lacking a minus sign - force.append(info[1:4]) - # elif 'Atomic-Energy' in ii: - # for jj in range(idx+1, idx+1+ntot) : - # tmp_l = lines[jj] - # info = [float(ss) for ss in tmp_l.split()] - # atomic_energy.append(info[1]) - return coord, cell, energy, force, virial, is_converge +from dpdata.formats.pwmat.movement import * # noqa: F403 diff --git a/dpdata/pymatgen/__init__.py b/dpdata/pymatgen/__init__.py index e69de29bb..eefbaad82 100644 --- a/dpdata/pymatgen/__init__.py +++ b/dpdata/pymatgen/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.pymatgen import * # noqa: F403 diff --git a/dpdata/pymatgen/molecule.py b/dpdata/pymatgen/molecule.py index 8d397984a..05467c12d 100644 --- a/dpdata/pymatgen/molecule.py +++ b/dpdata/pymatgen/molecule.py @@ -1,30 +1,3 @@ from __future__ import annotations -from collections import Counter - -import numpy as np - - -def to_system_data(file_name, protect_layer=9): - from pymatgen.core import Molecule - - mol = Molecule.from_file(file_name) - elem_mol = list(str(site.species.elements[0]) for site in mol.sites) - elem_counter = 
Counter(elem_mol) - atom_names = list(elem_counter.keys()) - atom_numbs = list(elem_counter.values()) - atom_types = [list(atom_names).index(e) for e in elem_mol] - natoms = np.sum(atom_numbs) - - tmpcoord = np.copy(mol.cart_coords) - - system = {} - system["atom_names"] = atom_names - system["atom_numbs"] = atom_numbs - system["atom_types"] = np.array(atom_types, dtype=int) - # center = [c - h_cell_size for c in mol.center_of_mass] - system["orig"] = np.array([0, 0, 0]) - - system["coords"] = np.array([tmpcoord]) - system["cells"] = np.array([10.0 * np.eye(3)]) - return system +from dpdata.formats.pymatgen.molecule import * # noqa: F403 diff --git a/dpdata/pymatgen/structure.py b/dpdata/pymatgen/structure.py index 1f74dbdd0..08e1cf0e9 100644 --- a/dpdata/pymatgen/structure.py +++ b/dpdata/pymatgen/structure.py @@ -1,30 +1,3 @@ from __future__ import annotations -import numpy as np - - -def from_system_data(structure) -> dict: - """Convert one pymatgen structure to dpdata's datadict.""" - symbols = [ii.specie.symbol for ii in structure] - atom_names = list(structure.symbol_set) - atom_numbs = [symbols.count(symbol) for symbol in atom_names] - atom_types = np.array([atom_names.index(symbol) for symbol in symbols]).astype(int) - coords = structure.cart_coords - cells = structure.lattice.matrix - if all(structure.pbc): - pbc = True - elif not any(structure.pbc): - pbc = False - else: - raise ValueError(f"Partial pbc condition {structure.pbc} is not supported") - - info_dict = { - "atom_names": atom_names, - "atom_numbs": atom_numbs, - "atom_types": atom_types, - "coords": np.array([coords]), - "cells": np.array([cells]), - "orig": np.zeros(3), - "nopbc": not pbc, - } - return info_dict +from dpdata.formats.pymatgen.structure import * # noqa: F403 diff --git a/dpdata/qe/__init__.py b/dpdata/qe/__init__.py index e69de29bb..dc582be29 100644 --- a/dpdata/qe/__init__.py +++ b/dpdata/qe/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from 
dpdata.formats.qe import * # noqa: F403 diff --git a/dpdata/qe/scf.py b/dpdata/qe/scf.py old mode 100755 new mode 100644 index 341261d22..4f461792e --- a/dpdata/qe/scf.py +++ b/dpdata/qe/scf.py @@ -1,188 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import os - -import numpy as np - -from dpdata.utils import open_file - -from .traj import ( - kbar2evperang3, - ry2ev, -) -from .traj import ( - length_convert as bohr2ang, -) - -_QE_BLOCK_KEYWORDS = [ - "ATOMIC_SPECIES", - "ATOMIC_POSITIONS", - "K_POINTS", - "ADDITIONAL_K_POINTS", - "CELL_PARAMETERS", - "CONSTRAINTS", - "OCCUPATIONS", - "ATOMIC_VELOCITIES", - "ATOMIC_FORCES", - "SOLVENTS", - "HUBBARD", -] - - -def get_block(lines, keyword, skip=0): - ret = [] - for idx, ii in enumerate(lines): - if keyword in ii: - blk_idx = idx + 1 + skip - while len(lines[blk_idx].split()) == 0: - blk_idx += 1 - while ( - len(lines[blk_idx].split()) != 0 - and (lines[blk_idx].split()[0] not in _QE_BLOCK_KEYWORDS) - ) and blk_idx != len(lines): - ret.append(lines[blk_idx]) - blk_idx += 1 - break - return ret - - -def get_cell(lines): - ret = [] - for idx, ii in enumerate(lines): - if "ibrav" in ii: - break - blk = lines[idx : idx + 2] - ibrav = int(blk[0].replace(",", "").split("=")[-1]) - if ibrav == 0: - for iline in lines: - if "CELL_PARAMETERS" in iline and "angstrom" not in iline.lower(): - raise RuntimeError( - "CELL_PARAMETERS must be written in Angstrom. Other units are not supported yet." 
- ) - blk = get_block(lines, "CELL_PARAMETERS") - for ii in blk: - ret.append([float(jj) for jj in ii.split()[0:3]]) - ret = np.array(ret) - elif ibrav == 1: - a = None - for iline in lines: - line = iline.replace("=", " ").replace(",", "").split() - if len(line) >= 2 and "a" == line[0]: - # print("line = ", line) - a = float(line[1]) - if len(line) >= 2 and "celldm(1)" == line[0]: - a = float(line[1]) * bohr2ang - # print("a = ", a) - if not a: - raise RuntimeError("parameter 'a' or 'celldm(1)' cannot be found.") - ret = np.array([[a, 0.0, 0.0], [0.0, a, 0.0], [0.0, 0.0, a]]) - else: - raise RuntimeError("ibrav > 1 not supported yet.") - return ret - - -def get_coords(lines, cell): - coord = [] - atom_symbol_list = [] - for iline in lines: - if "ATOMIC_POSITIONS" in iline and ( - "angstrom" not in iline.lower() and "crystal" not in iline.lower() - ): - raise RuntimeError( - "ATOMIC_POSITIONS must be written in Angstrom or crystal. Other units are not supported yet." - ) - if "ATOMIC_POSITIONS" in iline and "angstrom" in iline.lower(): - blk = get_block(lines, "ATOMIC_POSITIONS") - for ii in blk: - coord.append([float(jj) for jj in ii.split()[1:4]]) - atom_symbol_list.append(ii.split()[0]) - coord = np.array(coord) - elif "ATOMIC_POSITIONS" in iline and "crystal" in iline.lower(): - blk = get_block(lines, "ATOMIC_POSITIONS") - for ii in blk: - coord.append([float(jj) for jj in ii.split()[1:4]]) - atom_symbol_list.append(ii.split()[0]) - coord = np.array(coord) - coord = np.matmul(coord, cell) - atom_symbol_list = np.array(atom_symbol_list) - tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True) - atom_types = [] - atom_numbs = [] - # preserve the atom_name order - atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")] - for jj in atom_symbol_list: - for idx, ii in enumerate(atom_names): - if jj == ii: - atom_types.append(idx) - for idx in range(len(atom_names)): - atom_numbs.append(atom_types.count(idx)) - atom_types = 
np.array(atom_types) - - return list(atom_names), atom_numbs, atom_types, coord - - -def get_energy(lines): - energy = None - for ii in lines: - if "! total energy" in ii: - energy = ry2ev * float(ii.split("=")[1].split()[0]) - return energy - - -def get_force(lines, natoms): - blk = get_block(lines, "Forces acting on atoms", skip=1) - ret = [] - blk = blk[0 : sum(natoms)] - for ii in blk: - ret.append([float(jj) for jj in ii.split("=")[1].split()]) - ret = np.array(ret) - ret *= ry2ev / bohr2ang - return ret - - -def get_stress(lines): - blk = get_block(lines, "total stress") - if len(blk) == 0: - return None - ret = [] - for ii in blk: - ret.append([float(jj) for jj in ii.split()[3:6]]) - ret = np.array(ret) - ret *= kbar2evperang3 - return ret - - -def get_frame(fname): - if isinstance(fname, str): - path_out = fname - outname = os.path.basename(path_out) - # the name of the input file is assumed to be different from the output by 'in' and 'out' - inname = outname.replace("out", "in") - path_in = os.path.join(os.path.dirname(path_out), inname) - elif isinstance(fname, list) and len(fname) == 2: - path_in = fname[0] - path_out = fname[1] - else: - raise RuntimeError("invalid input") - with open_file(path_out) as fp: - outlines = fp.read().split("\n") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - cell = get_cell(inlines) - atom_names, natoms, types, coords = get_coords(inlines, cell) - energy = get_energy(outlines) - force = get_force(outlines, natoms) - stress = get_stress(outlines) - if stress is not None: - stress = (stress * np.linalg.det(cell))[np.newaxis, :, :] - return ( - atom_names, - natoms, - types, - cell[np.newaxis, :, :], - coords[np.newaxis, :, :], - np.array(energy)[np.newaxis], - force[np.newaxis, :, :], - stress, - ) +from dpdata.formats.qe.scf import * # noqa: F403 diff --git a/dpdata/qe/traj.py b/dpdata/qe/traj.py index aa12ebb8d..1b4f67754 100644 --- a/dpdata/qe/traj.py +++ b/dpdata/qe/traj.py @@ -1,284 +1,3 @@ 
-#!/usr/bin/python3 from __future__ import annotations -import warnings -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -import os - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) - -ry2ev = EnergyConversion("rydberg", "eV").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() -gpa2evperbohr = PressureConversion("GPa", "eV/bohr^3").value() - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - - -def load_key(lines, key): - for ii in lines: - if key in ii: - words = ii.split(",") - for jj in words: - if key in jj: - return jj.split("=")[1] - return None - - -def load_block(lines, key, nlines): - for idx, ii in enumerate(lines): - if key in ii: - break - return lines[idx + 1 : idx + 1 + nlines] - - -def convert_celldm(ibrav, celldm): - if ibrav == 1: - return celldm[0] * np.eye(3) - elif ibrav == 2: - return celldm[0] * 0.5 * np.array([[-1, 0, 1], [0, 1, 1], [-1, 1, 0]]) - elif ibrav == 3: - return celldm[0] * 0.5 * np.array([[1, 1, 1], [-1, 1, 1], [-1, -1, 1]]) - elif ibrav == -3: - return celldm[0] * 0.5 * np.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]]) - else: - warnings.warn( - "unsupported ibrav " - + str(ibrav) - + " if no .cel file, the cell convertion may be wrong. 
" - ) - return np.eye(3) - # raise RuntimeError('unsupported ibrav ' + str(ibrav)) - - -def load_cell_parameters(lines): - blk = load_block(lines, "CELL_PARAMETERS", 3) - ret = [] - for ii in blk: - ret.append([float(jj) for jj in ii.split()[0:3]]) - return np.array(ret) - - -def load_atom_names(lines, ntypes): - blk = load_block(lines, "ATOMIC_SPECIES", ntypes) - return [ii.split()[0] for ii in blk] - - -def load_celldm(lines): - celldm = np.zeros(6) - for ii in range(6): - key = "celldm(%d)" % (ii + 1) # noqa: UP031 - val = load_key(lines, key) - if val is not None: - celldm[ii] = float(val) - return celldm - - -def load_atom_types(lines, natoms, atom_names): - blk = load_block(lines, "ATOMIC_POSITIONS", natoms) - ret = [] - for ii in blk: - ret.append(atom_names.index(ii.split()[0])) - return np.array(ret, dtype=int) - - -def load_param_file(fname: FileType): - with open_file(fname) as fp: - lines = fp.read().split("\n") - natoms = int(load_key(lines, "nat")) - ntypes = int(load_key(lines, "ntyp")) - atom_names = load_atom_names(lines, ntypes) - atom_types = load_atom_types(lines, natoms, atom_names) - atom_numbs = [] - for ii in range(ntypes): - atom_numbs.append(np.sum(atom_types == ii)) - ibrav = int(load_key(lines, "ibrav")) - celldm = load_celldm(lines) - if ibrav == 0: - cell = load_cell_parameters(lines) - else: - cell = convert_celldm(ibrav, celldm) - cell = cell * length_convert - # print(atom_names) - # print(atom_numbs) - # print(atom_types) - # print(cell) - return atom_names, atom_numbs, atom_types, cell - - -def _load_pos_block(fp, natoms): - head = fp.readline() - if not head: - # print('get None') - return None, None - else: - ss = head.split()[0] - blk = [] - for ii in range(natoms): - newline = fp.readline() - if not newline: - return None, None - blk.append([float(jj) for jj in newline.split()]) - return blk, ss - - -def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0): - coords = [] - steps = [] - cc = 0 - with 
open_file(fname) as fp: - while True: - blk, ss = _load_pos_block(fp, natoms) - if blk is None: - break - else: - if cc >= begin and (cc - begin) % step == 0: - coords.append(blk) - steps.append(ss) - cc += 1 - coords = convert * np.array(coords) - return coords, steps - - -# def load_pos(fname, natoms) : -# coords = [] -# with open_file(fname) as fp: -# while True: -# blk = _load_pos_block(fp, natoms) -# # print(blk) -# if blk == None : -# break -# else : -# coords.append(blk) -# coords= length_convert * np.array(coords) -# return coords - - -def load_energy(fname, begin=0, step=1): - data = np.loadtxt(fname, ndmin=2) - steps = [] - for ii in data[begin::step, 0]: - steps.append("%d" % ii) # noqa: UP031 - with open_file(fname) as fp: - while True: - line = fp.readline() - if not line: - return None - if line.split()[0][0] != "#": - nw = len(line.split()) - break - data = np.reshape(data, [-1, nw]) - return energy_convert * data[begin::step, 5], steps - - -# def load_force(fname, natoms) : -# coords = [] -# with open_file(fname) as fp: -# while True: -# blk = _load_pos_block(fp, natoms) -# # print(blk) -# if blk == None : -# break -# else : -# coords.append(blk) -# coords= force_convert * np.array(coords) -# return coords - - -def to_system_data(input_name, prefix, begin=0, step=1): - data = {} - data["atom_names"], data["atom_numbs"], data["atom_types"], cell = load_param_file( - input_name - ) - data["coords"], csteps = load_data( - prefix + ".pos", - np.sum(data["atom_numbs"]), - begin=begin, - step=step, - convert=length_convert, - ) - data["orig"] = np.zeros(3) - try: - data["cells"], tmp_steps = load_data( - prefix + ".cel", 3, begin=begin, step=step, convert=length_convert - ) - data["cells"] = np.transpose(data["cells"], (0, 2, 1)) - if csteps != tmp_steps: - csteps.append(None) - tmp_steps.append(None) - for int_id in range(len(csteps)): - if csteps[int_id] != tmp_steps[int_id]: - break - step_id = begin + int_id * step - raise RuntimeError( - f"the step 
key between files are not consistent. " - f"The difference locates at step: {step_id}, " - f".pos is {csteps[int_id]}, .cel is {tmp_steps[int_id]}" - ) - except FileNotFoundError: - data["cells"] = np.tile(cell, (data["coords"].shape[0], 1, 1)) - - # handle virial - stress_fname = prefix + ".str" - if os.path.exists(stress_fname): - # 1. Read stress tensor (in GPa) for each structure - stress, vsteps = load_data(stress_fname, 3, begin=begin, step=step, convert=1.0) - if csteps != vsteps: - csteps.append(None) - vsteps.append(None) - for int_id in range(len(csteps)): - if csteps[int_id] != vsteps[int_id]: - break - step_id = begin + int_id * step - raise RuntimeError( - f"the step key between files are not consistent. " - f"The difference locates at step: {step_id}, " - f".pos is {csteps[int_id]}, .str is {vsteps[int_id]}" - ) - # 2. Calculate volume from cell. revert unit to bohr before taking det - volumes = np.linalg.det(data["cells"] / length_convert).reshape(-1) - # 3. Calculate virials for each structure, shape [nf x 3 x 3] - data["virials"] = gpa2evperbohr * volumes[:, None, None] * stress - - return data, csteps - - -def to_system_label(input_name, prefix, begin=0, step=1): - atom_names, atom_numbs, atom_types, cell = load_param_file(input_name) - energy, esteps = load_energy(prefix + ".evp", begin=begin, step=step) - force, fsteps = load_data( - prefix + ".for", - np.sum(atom_numbs), - begin=begin, - step=step, - convert=force_convert, - ) - assert esteps == fsteps, "the step key between files are not consistent " - return energy, force, esteps - - -if __name__ == "__main__": - prefix = "nacl" - atom_names, atom_numbs, atom_types, cell = load_param_file(prefix + ".in") - coords = load_data(prefix + ".pos", np.sum(atom_numbs)) - cells = load_data(prefix + ".cel", 3) - print(atom_names) - print(atom_numbs) - print(atom_types) - print(cells) - print(coords.shape) - print(cells.shape) +from dpdata.formats.qe.traj import * # noqa: F403 diff --git 
a/dpdata/rdkit/__init__.py b/dpdata/rdkit/__init__.py index e69de29bb..b86820923 100644 --- a/dpdata/rdkit/__init__.py +++ b/dpdata/rdkit/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.rdkit import * # noqa: F403 diff --git a/dpdata/rdkit/sanitize.py b/dpdata/rdkit/sanitize.py index 9afc52c9a..781a8742a 100644 --- a/dpdata/rdkit/sanitize.py +++ b/dpdata/rdkit/sanitize.py @@ -1,728 +1,3 @@ from __future__ import annotations -import os -import time -from copy import deepcopy - - -def get_explicit_valence(atom, verbose=False): - exp_val_calculated_from_bonds = int( - sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()]) - ) - try: - try: - from rdkit import Chem - - exp_val = atom.GetValence(Chem.ValenceType.EXPLICIT) - valence_method = "GetValence(Chem.ValenceType.EXPLICIT)" - except (ImportError, AttributeError, TypeError): - exp_val = atom.GetExplicitValence() - valence_method = "GetExplicitValence()" - if exp_val != exp_val_calculated_from_bonds: - if verbose: - print( - f"Explicit valence given by {valence_method} and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx() + 1}, using sum of bond order." 
- ) - return exp_val_calculated_from_bonds - except Exception: - return exp_val_calculated_from_bonds - - -def regularize_formal_charges(mol, sanitize=True, verbose=False): - """Regularize formal charges of atoms.""" - from rdkit import Chem - - assert isinstance(mol, Chem.rdchem.Mol) - for atom in mol.GetAtoms(): - assign_formal_charge_for_atom(atom, verbose) - if sanitize: - try: - Chem.SanitizeMol(mol) - return mol - except Exception: - return None - else: - return mol - - -def assign_formal_charge_for_atom(atom, verbose=False): - """Assigen formal charge according to 8-electron rule for element B,C,N,O,S,P,As.""" - from rdkit import Chem - - assert isinstance(atom, Chem.rdchem.Atom) - valence = get_explicit_valence(atom, verbose) - if atom.GetSymbol() == "B": - atom.SetFormalCharge(3 - valence) - elif atom.GetSymbol() == "C": - atom.SetFormalCharge(valence - 4) - if valence == 3: - print( - f"Detect a valence of 3 on #C{atom.GetIdx() + 1}, the formal charge of this atom will be assigned to -1" - ) - elif valence > 4: - raise ValueError(f"#C{atom.GetIdx() + 1} has a valence larger than 4") - elif atom.GetSymbol() == "N": - if valence > 4: - raise ValueError(f"#N{atom.GetIdx() + 1} has a valence larger than 4") - else: - atom.SetFormalCharge(valence - 3) - elif atom.GetSymbol() == "O": - atom.SetFormalCharge(valence - 2) - elif atom.GetSymbol() == "S": - if valence == 1: - atom.SetFormalCharge(-1) - elif valence == 3: - atom.SetFormalCharge(1) - elif valence > 6: - raise ValueError(f"#S{atom.GetIdx() + 1} has a valence larger than 6") - else: - atom.SetFormalCharge(0) - elif atom.GetSymbol() == "P" or atom.GetSymbol() == "As": - if valence == 5: - atom.SetFormalCharge(0) - elif valence > 5: - raise ValueError( - f"#{atom.GetSymbol()}{atom.GetIdx() + 1} has a valence larger than 5" - ) - else: - atom.SetFormalCharge(valence - 3) - - -# print bond and atom information (for debugger) -def print_bonds(mol): - for bond in mol.GetBonds(): - begin_atom = 
bond.GetBeginAtom() - end_atom = bond.GetEndAtom() - print( - f"{begin_atom.GetSymbol()}{begin_atom.GetIdx() + 1} {end_atom.GetSymbol()}{end_atom.GetIdx() + 1} {bond.GetBondType()}" - ) - - -def print_atoms(mol): - for atom in mol.GetAtoms(): - print( - f"{atom.GetSymbol()}{atom.GetIdx() + 1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}" - ) - - -def is_terminal_oxygen(O_atom): - return len(O_atom.GetNeighbors()) == 1 - - -def get_terminal_oxygens(atom): - terminal_oxygens = [] - for nei in atom.GetNeighbors(): - if nei.GetSymbol() == "O" or nei.GetSymbol() == "S": - if is_terminal_oxygen(nei): - terminal_oxygens.append(nei) - return terminal_oxygens - - -def is_terminal_NR2(N_atom): - return len(N_atom.GetNeighbors()) == 3 - - -def get_terminal_NR2s(atom): - terminal_NR2s = [] - for nei in atom.GetNeighbors(): - if nei.GetSymbol() == "N": - if is_terminal_NR2(nei): - terminal_NR2s.append(nei) - terminal_NR2s.sort( - key=lambda N_atom: len( - [atom for atom in N_atom.GetNeighbors() if atom.GetSymbol() == "H"] - ) - ) - return terminal_NR2s - - -def sanitize_phosphate_Patom(P_atom, verbose=True): - from rdkit import Chem - - if P_atom.GetSymbol() == "P": - terminal_oxygens = get_terminal_oxygens(P_atom) - mol = P_atom.GetOwningMol() - if len(terminal_oxygens) > 1: - if verbose: - print("Phospate group detected, sanitizing it...") - # set one P=O and two P-O - bond1 = mol.GetBondBetweenAtoms( - P_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.DOUBLE) - for ii in range(1, len(terminal_oxygens)): - bond = mol.GetBondBetweenAtoms( - P_atom.GetIdx(), terminal_oxygens[ii].GetIdx() - ) - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[ii].SetFormalCharge(-1) - - -def sanitize_phosphate(mol): - for atom in mol.GetAtoms(): - sanitize_phosphate_Patom(atom) - return mol - - -def sanitize_sulfate_Satom(S_atom, verbose=True): - from rdkit import Chem - - if S_atom.GetSymbol() == "S": - terminal_oxygens = 
get_terminal_oxygens(S_atom) - mol = S_atom.GetOwningMol() - if len(terminal_oxygens) == 3: - if verbose: - print("Sulfate group detected, sanitizing it...") - # set one S-O and two S=O - bond1 = mol.GetBondBetweenAtoms( - S_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - for ii in range(1, len(terminal_oxygens)): - bond = mol.GetBondBetweenAtoms( - S_atom.GetIdx(), terminal_oxygens[ii].GetIdx() - ) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - - -def sanitize_sulfate(mol): - for atom in mol.GetAtoms(): - sanitize_sulfate_Satom(atom) - return mol - - -def sanitize_carboxyl_Catom(C_atom, verbose=True): - from rdkit import Chem - - if C_atom.GetSymbol() == "C": - terminal_oxygens = get_terminal_oxygens(C_atom) - mol = C_atom.GetOwningMol() - if len(terminal_oxygens) == 2: - if verbose: - print("Carbonxyl group detected, sanitizing it...") - # set one C-O and one C=O - bond1 = mol.GetBondBetweenAtoms( - C_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms( - C_atom.GetIdx(), terminal_oxygens[1].GetIdx() - ) - bond2.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_oxygens[1].SetFormalCharge(0) - - -def sanitize_carboxyl(mol): - for atom in mol.GetAtoms(): - sanitize_carboxyl_Catom(atom) - return mol - - -def sanitize_guanidine_Catom(C_atom, verbose=True): - from rdkit import Chem - - if C_atom.GetSymbol() == "C": - terminal_NR2s = get_terminal_NR2s(C_atom) - mol = C_atom.GetOwningMol() - if len(terminal_NR2s) == 3: - if verbose: - print("Guanidyl group detected, sanitizing it...") - # set two C-N and one C=N+ - bond1 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[0].GetIdx()) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_NR2s[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), 
terminal_NR2s[1].GetIdx()) - bond2.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_NR2s[1].SetFormalCharge(0) - - bond3 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[2].GetIdx()) - bond3.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_NR2s[2].SetFormalCharge(1) - - -def sanitize_guanidine(mol): - for atom in mol.GetAtoms(): - sanitize_guanidine_Catom(atom) - return mol - - -def sanitize_nitro_Natom(N_atom, verbose=True): - from rdkit import Chem - - if N_atom.GetSymbol() == "N": - terminal_oxygens = get_terminal_oxygens(N_atom) - mol = N_atom.GetOwningMol() - if len(terminal_oxygens) == 2: - if verbose: - print("Nitro group detected, sanitizing it...") - # set one N-O and one N=O - bond1 = mol.GetBondBetweenAtoms( - N_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms( - N_atom.GetIdx(), terminal_oxygens[1].GetIdx() - ) - bond2.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_oxygens[1].SetFormalCharge(0) - - -def sanitize_nitro(mol): - for atom in mol.GetAtoms(): - sanitize_nitro_Natom(atom) - return mol - - -def is_terminal_nitrogen(N_atom): - if N_atom.GetSymbol() == "N" and len(N_atom.GetNeighbors()) == 1: - return True - else: - return False - - -def sanitize_nitrine_Natom(atom, verbose=True): - from rdkit import Chem - - if atom.GetSymbol() == "N" and len(atom.GetNeighbors()) == 2: - mol = atom.GetOwningMol() - nei1, nei2 = atom.GetNeighbors()[0], atom.GetNeighbors()[1] - if nei1.GetSymbol() == "N" and nei2.GetSymbol() == "N": - if is_terminal_nitrogen(nei1): - N_terminal = nei1 - N_non_terminal = nei2 - elif is_terminal_nitrogen(nei2): - N_terminal = nei2 - N_non_terminal = nei1 - else: - N_terminal = None - N_non_terminal = None - if (N_terminal is not None) and (N_non_terminal is not None): - # set X-N=[N+]=[N-] - if verbose: - print("Detecting nitrine group, fixing it...") - bond = 
mol.GetBondBetweenAtoms(atom.GetIdx(), N_terminal.GetIdx()) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - N_terminal.SetFormalCharge(-1) - - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_non_terminal.GetIdx()) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - atom.SetFormalCharge(1) - - -def contain_hetero_aromatic(mol): - flag = False - for atom in mol.GetAtoms(): - if atom.GetSymbol() != "C" and atom.GetIsAromatic(): - flag = True - break - return flag - - -# for carbon with explicit valence > 4 -def regularize_carbon_bond_order(atom, verbose=True): - from rdkit import Chem - - if atom.GetSymbol() == "C" and get_explicit_valence(atom) > 4: - if verbose: - print("Detecting carbon with explicit valence > 4, fixing it...") - mol = atom.GetOwningMol() - double_bond_idx = -1 - for nei in atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), nei.GetIdx()) - if bond.GetBondTypeAsDouble() == 2: - double_bond_idx = bond.GetIdx() - break - if double_bond_idx != -1: - for bond in atom.GetBonds(): - if bond.GetIdx() != double_bond_idx: - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - - -# for nitrogen with explicit valence > 4 -def regularize_nitrogen_bond_order(atom, verbose=True): - from rdkit import Chem - - mol = atom.GetOwningMol() - if atom.GetSymbol() == "N" and get_explicit_valence(atom) > 4: - O_atoms = get_terminal_oxygens(atom) - for O_atom in O_atoms: - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), O_atom.GetIdx()) - if bond.GetBondTypeAsDouble() == 2: - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - O_atom.SetFormalCharge(-1) - - -def sanitize_mol(mol, verbose=False): - for atom in mol.GetAtoms(): - sanitize_carboxyl_Catom(atom, verbose) - sanitize_guanidine_Catom(atom, verbose) - sanitize_phosphate_Patom(atom, verbose) - sanitize_sulfate_Satom(atom, verbose) - sanitize_nitro_Natom(atom, verbose) - sanitize_nitrine_Natom(atom, verbose) - regularize_carbon_bond_order(atom, verbose) - regularize_nitrogen_bond_order(atom, verbose) - return 
mol - - -# copy from FEprep -def mol_edit_log(mol, i, j): - if not mol.HasProp("edit"): - mol.SetProp("edit", "%d_%d" % (i, j)) # noqa: UP031 - else: - edited = mol.GetProp("edit") - mol.SetProp("edit", edited + ",%d_%d" % (i, j)) # noqa: UP031 - - -def kekulize_aromatic_heterocycles(mol_in, assign_formal_charge=True, sanitize=True): - from rdkit import Chem - from rdkit.Chem.rdchem import BondType - - mol = Chem.RWMol(mol_in) - rings = Chem.rdmolops.GetSymmSSSR(mol) - rings = [list(i) for i in list(rings)] - rings.sort(key=lambda r: len(r)) - - def search_and_assign_ring( - mol, ring, hetero, start, forward=True, start_switch=True - ): - j = start - switch = start_switch - lring = len(ring) - delta = 1 if forward else -1 - n_edit = 0 - n_double = 0 - while not ((j in hetero) & (not switch)): - btype = BondType.SINGLE if switch else BondType.DOUBLE - bond = mol.GetBondBetweenAtoms(ring[j], ring[(j + delta) % lring]) - if bond.GetBondType() == BondType.AROMATIC: - bond.SetBondType(btype) - mol_edit_log(mol, ring[j], ring[(j + delta) % lring]) - # print(ring[j], ring[(j + delta) % lring], bond.GetBondType()) - if btype == BondType.DOUBLE: - n_double += 1 - n_edit += 1 - else: - break - j = (j + delta) % lring - switch = not switch - return n_edit, n_double - - def print_bondtypes(mol, rings): - for ring in rings: - lring = len(ring) - btype = [] - for i in range(lring): - btype.append( - mol.GetBondBetweenAtoms( - ring[i], ring[(i + 1) % lring] - ).GetBondType() - ) - atoms = [mol.GetAtomWithIdx(i).GetSymbol() for i in ring] - print(ring) - print(atoms) - print(btype) - - def hetero_priority(idx, mol): - atom = mol.GetAtomWithIdx(idx) - sym = atom.GetSymbol() - valence = len(atom.GetBonds()) - - if (sym in ["O", "S"]) & (valence == 2): - return 0 - elif sym in ["N", "P", "As", "B"]: - if valence == 3: - return 1 - elif valence == 2: - return 2 - - # save carbon/hetero aromatic rings - CAr = [] - HAr = [] - for ring in rings: - lring = len(ring) - bAllAr = True - 
bAllC = True - for i in range(lring): - atom = mol.GetAtomWithIdx(ring[i]) - if atom.GetSymbol() != "C": - bAllC = False - - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - if bond.GetBondType() != BondType.AROMATIC: - bAllAr = False - if bAllAr and bAllC: - CAr.append(ring) - elif bAllAr and not bAllC: - HAr.append(ring) - - if len(HAr) == 0: - # no hetrerocycles - return mol_in - else: - # edit heterocycles - for ring in HAr: - lring = len(ring) - cring = len(CAr) - hetero = [] - hasDouble = [] - fuseCAr = [] - fuseDouble = [] - for i in range(lring): - fuseCAr.append(-1) - for j in range(cring): - if ring[i] in CAr[j]: - fuseCAr[i] = j - break - if i > 1: - if (fuseCAr[i] == fuseCAr[i - 1]) & (fuseCAr[i] >= 0): - fuseDouble.append(i) - atom = mol.GetAtomWithIdx(ring[i]) - if atom.GetSymbol() != "C": - hetero.append(i) - atom_bonds = atom.GetBonds() - btype = [bond.GetBondType() for bond in atom_bonds] - # print(btype) - if BondType.DOUBLE in btype: - hasDouble.append(i) - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - - if (fuseCAr[0] == fuseCAr[lring - 1]) & (fuseCAr[0] >= 0): - fuseDouble.append(0) - - if (len(hetero) > 0) | (len(hasDouble) > 0): - n_targetDouble = lring // 2 - n_targetEdit = lring - hetero_prior = {i: hetero_priority(ring[i], mol) for i in hetero} - hetero.sort(key=lambda i: hetero_prior[i]) - for i in hasDouble: - d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True) - d2, e2 = search_and_assign_ring(mol, ring, hetero, i, forward=False) - n_targetDouble -= d1 + d2 + 1 - n_targetEdit -= e1 + e2 - for i in fuseDouble: - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i - 1) % lring]) - if bond.GetBondType() == BondType.AROMATIC: - bond.SetBondType(BondType.DOUBLE) - mol_edit_log(mol, ring[i], ring[(i - 1) % lring]) - d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, (i - 1) % lring, forward=False - ) - n_targetDouble -= d1 + 
d2 + 1 - n_targetEdit -= e1 + e2 + 1 - for i in hetero: - atom = mol.GetAtomWithIdx(ring[i]) - if (hetero_prior[i] == 2) | (n_targetDouble * 2 >= n_targetEdit): - forward_btype = mol.GetBondBetweenAtoms( - ring[i], ring[(i + 1) % lring] - ).GetBondType() - backward_btype = mol.GetBondBetweenAtoms( - ring[i], ring[(i - 1) % lring] - ).GetBondType() - if forward_btype != BondType.AROMATIC: - switch = forward_btype == BondType.DOUBLE - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=switch - ) - d2 = e2 = 0 - elif backward_btype != BondType.AROMATIC: - switch = backward_btype == BondType.DOUBLE - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=switch - ) - d2 = e2 = 0 - else: - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=True - ) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=False - ) - n_targetDouble -= d1 + d2 - n_targetEdit -= e1 + e2 - else: - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=True - ) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=True - ) - n_targetDouble -= d1 + d2 - n_targetEdit -= e1 + e2 - - for ring in CAr: - lring = len(ring) - for i in range(lring): - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - bond.SetBondType(BondType.AROMATIC) - print("Manual kekulization for aromatic heterocycles:") - print_bondtypes(mol, rings) - - atoms = mol.GetAtoms() - for i in range(len(atoms)): - mol.ReplaceAtom(i, Chem.Atom(atoms[i].GetSymbol())) - mol_edited = mol.GetMol() - # charge assignment - if assign_formal_charge: - mol_edited = regularize_formal_charges(mol_edited, sanitize=False) - if not sanitize: - return mol_edited - else: - try: - Chem.SanitizeMol(mol_edited) - return mol_edited - except Exception as e: - raise RuntimeError( - f"Manual kekulization for aromatic heterocycles failed, below are errors:\n\t 
{e}" - ) - - -def convert_by_obabel( - mol, cache_dir=os.path.join(os.getcwd(), ".cache"), obabel_path="obabel" -): - from openbabel import openbabel - from rdkit import Chem - - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - if mol.HasProp("_Name"): - name = mol.GetProp("_Name") - else: - name = f"mol{int(time.time())}" - mol_file_in = os.path.join(cache_dir, f"{name}.mol") - mol_file_out = os.path.join(cache_dir, f"{name}_obabel.mol") - Chem.MolToMolFile(mol, mol_file_in, kekulize=False) - obConversion = openbabel.OBConversion() - obConversion.SetInAndOutFormats("mol", "mol") - mol = openbabel.OBMol() - obConversion.ReadFile(mol, mol_file_in) - obConversion.WriteFile(mol, mol_file_out) - mol_obabel = Chem.MolFromMolFile(mol_file_out, removeHs=False, sanitize=False) - return mol_obabel - - -def super_sanitize_mol(mol, name=None, verbose=True): - from rdkit import Chem - - if name is None: - if mol.HasProp("_Name"): - name = mol.GetProp("_Name") - else: - name = "mol" - try: - if verbose: - print("=====Stage 1: use Hermite procedure=====") - # use our procedure - mol = sanitize_mol(mol, verbose) - mol = regularize_formal_charges(mol, sanitize=False) - mol_copy = deepcopy(mol) - Chem.SanitizeMol(mol_copy) - if verbose: - print(name, "Success.") - return mol_copy - except Exception as e: - try: - if verbose: - print( - "Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel" - ) - print("=====Stage 2: re-try with obabel=====") - mol = convert_by_obabel(mol) - mol = sanitize_mol(mol, verbose) - mol = kekulize_aromatic_heterocycles( - mol, assign_formal_charge=False, sanitize=False - ) # aromatic heterocycles - mol = regularize_formal_charges(mol, sanitize=False) - mol_copy = deepcopy(mol) - Chem.SanitizeMol(mol_copy) - if verbose: - print(name, "Success.") - return mol_copy - except Exception as e: - if verbose: - print(e) - print(name, "Failed!") - return None - - -class Sanitizer: - def 
__init__(self, level="medium", raise_errors=True, verbose=False): - """Set up sanitizer. - --------. - - Parameters - ---------- - level : 'low', 'medium' or 'high'. - `low` - use rdkit.Chem.SanitizeMol() to sanitize - `medium` - before using rdkit, assign formal charges of each atom first, which requires - the rightness of bond order information - `high` - try to regularize bond order of nitro, phosphate, sulfate, nitrine, guanidine, - pyridine-oxide function groups and aromatic heterocycles. If failed, the program - will call obabel to pre-process the mol object and re-try the procedure. - raise_errors : bool, default=True - If True, raise SanitizeError when failed. - verbose : bool, default=False - If True, print error information when failed. - """ - self._check_level(level) - self.level = level - self.raise_errors = raise_errors - self.verbose = verbose - - def _check_level(self, level): - if level not in ["low", "medium", "high"]: - raise ValueError( - f"Invalid level '{level}', please set to 'low', 'medium' or 'high'" - ) - - def _handle_exception(self, error_info): - if self.raise_errors: - raise SanitizeError(error_info) - elif self.verbose: - print(error_info) - - def sanitize(self, mol): - """Sanitize mol according to `self.level`. If failed, return None.""" - from rdkit import Chem - - if self.level == "low": - try: - Chem.SanitizeMol(mol) - return mol - except Exception as e: - error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'medium' or 'high'. The error occurs:\n\t{e}" - self._handle_exception(error_info) - return None - elif self.level == "medium": - try: - mol = regularize_formal_charges(mol, sanitize=False) - Chem.SanitizeMol(mol) - return mol - except Exception as e: - error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'high'. 
The error occurs:\n\t{e}" - self._handle_exception(error_info) - return None - elif self.level == "high": - mol = super_sanitize_mol(mol, verbose=self.verbose) - error_info = "Sanitization Failed. Please check your molecule file." - if mol is None: - self._handle_exception(error_info) - return mol - - -class SanitizeError(Exception): - def __init__(self, content="Sanitization Failed."): - self.content = content - - def __str__(self): - return self.content - - def __repr__(self): - return self.__str__() +from dpdata.formats.rdkit.sanitize import * # noqa: F403 diff --git a/dpdata/rdkit/utils.py b/dpdata/rdkit/utils.py index efeef6070..124910271 100644 --- a/dpdata/rdkit/utils.py +++ b/dpdata/rdkit/utils.py @@ -1,131 +1,3 @@ from __future__ import annotations -import numpy as np - - -def mol_to_system_data(mol): - from rdkit import Chem - - if not isinstance(mol, Chem.rdchem.Mol): - raise TypeError(f"rdkit.Chem.Mol required, not {type(mol)}") - - num_confs = mol.GetNumConformers() - if num_confs: - atom_symbols = [at.GetSymbol() for at in mol.GetAtoms()] - atom_names, atom_types, atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - coords = np.array([conf.GetPositions() for conf in mol.GetConformers()]) - bonds = np.array( - [ - [ - bond.GetBeginAtomIdx(), - bond.GetEndAtomIdx(), - bond.GetBondTypeAsDouble(), - ] - for bond in mol.GetBonds() - ] - ) - formal_charges = np.array( - [at.GetFormalCharge() for at in mol.GetAtoms()], dtype=np.int32 - ) - data = {} - data["atom_numbs"] = list(atom_numbs) - data["atom_names"] = list(atom_names) - data["atom_types"] = atom_types - data["cells"] = np.array( - [ - [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] - for _ in range(num_confs) - ] - ) - data["coords"] = coords - data["bonds"] = bonds - data["formal_charges"] = formal_charges - data["orig"] = np.array([0.0, 0.0, 0.0]) - # other properties - if mol.HasProp("_Name"): - data["_name"] = mol.GetProp("_Name") - return data - else: 
- raise ValueError("The moleclue does not contain 3-D conformers") - - -def system_data_to_mol(data): - from rdkit import Chem - - mol_ed = Chem.RWMol() - atom_symbols = [data["atom_names"][i] for i in data["atom_types"]] - # add atoms - for atom_type in data["atom_types"]: - symbol = data["atom_names"][atom_type] - atom = Chem.Atom(symbol) - mol_ed.AddAtom(atom) - # add bonds - for bond_info in data["bonds"]: - if bond_info[2] == 1: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.SINGLE) - elif bond_info[2] == 2: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.DOUBLE) - elif bond_info[2] == 3: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.TRIPLE) - elif bond_info[2] == 1.5: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.AROMATIC) - # set conformers - for frame_idx in range(data["coords"].shape[0]): - conf = Chem.rdchem.Conformer(len(data["atom_types"])) - for atom_idx in range(len(data["atom_types"])): - conf.SetAtomPosition(atom_idx, data["coords"][frame_idx][atom_idx]) - mol_ed.AddConformer(conf, assignId=True) - mol = mol_ed.GetMol() - # set formal charges - for idx, atom in enumerate(mol.GetAtoms()): - atom.SetFormalCharge(int(data["formal_charges"][idx])) - # set mol name - if "_name" in list(data.keys()): - mol.SetProp("_Name", data["_name"]) - # sanitize - Chem.SanitizeMol(mol_ed) - return mol - - -def check_same_atom(atom_1, atom_2): - if atom_1.GetIdx() != atom_2.GetIdx(): - return False - elif atom_1.GetSymbol() != atom_2.GetSymbol(): - return False - else: - return True - - -def check_same_molecule(mol_1, mol_2): - flag = True - for bond_1, bond_2 in zip(mol_1.GetBonds(), mol_2.GetBonds()): - begin_atom_1, end_atom_1 = bond_1.GetBeginAtom(), bond_1.GetEndAtom() - begin_atom_2, end_atom_2 = bond_2.GetBeginAtom(), bond_2.GetEndAtom() - if not check_same_atom(begin_atom_1, begin_atom_2): - flag = False - break - elif not check_same_atom(end_atom_1, end_atom_2): - flag = 
False - break - return flag - - -def check_molecule_list(mols): - flag = True - for mol in mols[1:]: - if not check_same_molecule(mol, mols[0]): - flag = False - break - return flag - - -def combine_molecules(mols): - if check_molecule_list(mols): - for mol in mols[1:]: - for conf in mol.GetConformers(): - mols[0].AddConformer(conf, assignId=True) - return mols[0] - else: - raise ValueError("molecules are not of the same topology.") +from dpdata.formats.rdkit.utils import * # noqa: F403 diff --git a/dpdata/siesta/__init__.py b/dpdata/siesta/__init__.py index e69de29bb..0210250ed 100644 --- a/dpdata/siesta/__init__.py +++ b/dpdata/siesta/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.siesta import * # noqa: F403 diff --git a/dpdata/siesta/aiMD_output.py b/dpdata/siesta/aiMD_output.py index daa4f6a25..04a154aea 100644 --- a/dpdata/siesta/aiMD_output.py +++ b/dpdata/siesta/aiMD_output.py @@ -1,187 +1,3 @@ -# !/usr/bin/python3 from __future__ import annotations -import numpy as np - -ev2ev = 1 -ang2ang = 1 - - -#############################read output##################################### -def get_single_line_tail(fin, keyword, num=1): - file = open(fin) - part_res = [] - for value in file: - if keyword in value: - temp = len(value.split()) - num - part_res.append(float(value.split()[temp])) - - file.close() - return part_res - - -## atomnum: number of atoms, row numbers -## begin_column: begin column num -## read_column_num: read column num -## column_num: the column number in nxet reading line -def extract_keyword( - fout, - keyword, - down_line_num, - begin_column, - read_column_num, - is_repeated_read, - column_num, -): - file = open(fout) - ret = [] - part_ret = [] - flag = 0 - idx = 0 - extr_frame = 0 - length = obtain_nframe(fout) - # for (num,value) in enumerate(file): - for value in file: - if keyword in value: - flag = 1 - continue - if flag == 1: - if idx < down_line_num: - idx += 1 - else: - flag = 0 - 
part_ret.append(np.array(ret)) - ret = [] - extr_frame += 1 - if extr_frame == length: - file.close() - return part_ret - ## is_repeated_read = 0: only read 1 time for SCF - ## is_repeated_read = 1: read all for aiMD --> get all frames - if is_repeated_read: - idx = 0 - continue - - for i in range(begin_column, read_column_num): - if len(value.split()) == column_num: - if not value.split()[i].isalpha(): - ret.append(float(value.strip().split()[i])) - else: - ret.append(value.strip().split()[i]) - continue - file.close() - return part_ret - - -def obtain_nframe(fname): - fp = open(fname) - flag = False - idx = 0 - temp = 0 - for ii in fp: - if "siesta: Stress tensor (static) (eV/Ang**3):" in ii: - flag = True - continue - if flag: - if "siesta: Pressure (static):" not in ii: - if len(ii.split()) == 3: - temp += 1 - if temp == 3: - idx += 1 - # print(idx) - flag = False - temp = 0 - fp.close() - return idx - - -def get_atom_types(fout, atomnums): - covert_type = extract_keyword( - fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4, 0, 6 - )[0] - atomtype = [] - # print(covert_type) - for i in range(0, len(covert_type)): - atomtype.append(int(covert_type[i]) - 1) - return atomtype - - -def get_atom_name(fout): - file = open(fout) - ret = [] - for value in file: - if "Species number:" in value: - for j in range(len(value.split())): - if value.split()[j] == "Label:": - ret.append(value.split()[j + 1]) - break - file.close() - return ret - - -def get_atom_numbs(atomtypes): - atom_numbs = [] - for i in set(atomtypes): - atom_numbs.append(atomtypes.count(i)) - return atom_numbs - - -def get_virial(fout, cell): - viri = extract_keyword( - fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 0, 3, 1, 3 - ) - vols = [] - length = obtain_nframe(fout) - for ii in range(length): - vols.append(np.linalg.det(cell[ii].reshape([3, 3]))) - for jj in range(len(viri[ii])): - ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa - # ii *= vols[idx] * 1e3 / 1.602176621e6 
* (1.602176621e3) - viri[ii][jj] *= vols[ii] - return viri - - -def covert_dimension(arr, num): - arr = np.array(arr) - frames = len(arr) - ret = np.zeros((frames, num, 3)) - for i in range(frames): - ret[i] = arr[i].reshape(num, 3) - return ret - - -def get_aiMD_frame(fname): - NumberOfSpecies = int( - get_single_line_tail(fname, "redata: Number of Atomic Species")[0] - ) - atom_names = get_atom_name(fname) - tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) - - atom_types = get_atom_types(fname, tot_natoms) - atom_numbs = get_atom_numbs(atom_types) - assert max(atom_types) + 1 == NumberOfSpecies - - cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3, 1, 3) - coord = extract_keyword( - fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3, 1, 6 - ) - energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") - force = extract_keyword( - fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4, 1, 4 - ) - virial = get_virial(fname, cell) - - cells = covert_dimension(np.array(cell), 3) - coords = covert_dimension(np.array(coord), tot_natoms) - forces = covert_dimension(np.array(force), tot_natoms) - virials = covert_dimension(np.array(virial), 3) - return ( - atom_names, - atom_numbs, - np.array(atom_types), - cells, - coords, - np.array(energy), - forces, - virials, - ) +from dpdata.formats.siesta.aiMD_output import * # noqa: F403 diff --git a/dpdata/siesta/output.py b/dpdata/siesta/output.py index 0c944d5b5..eb7e383b8 100644 --- a/dpdata/siesta/output.py +++ b/dpdata/siesta/output.py @@ -1,142 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - -ev2ev = 1 -ang2ang = 1 - - -#############################read output##################################### -def get_single_line_tail(fin, keyword, num=1): - file = open(fin) - res = [] - for value in file: - if keyword in value: - temp = len(value.split()) - num - res.append(float(value.split()[temp])) - file.close() - return res - 
return res - - -## atomnum: number of atoms, row numbers -## begin_column: begin column num -## column_num: read column num -def extract_keyword(fout, keyword, down_line_num, begin_column, column_num): - file = open(fout) - ret = [] - flag = 0 - idx = 0 - # for (num,value) in enumerate(file): - for value in file: - if keyword in value: - flag = 1 - continue - if flag == 1: - if idx < down_line_num: - idx += 1 - else: - flag = 0 - continue - if len(value.split()) >= column_num: - for i in range(begin_column, column_num): - if not value.split()[i].isalpha(): - ret.append(float(value.strip().split()[i])) - else: - ret.append(value.strip().split()[i]) - ## compatible siesta-4.0.2 and siesta-4.1-b4 - else: - flag = 0 - idx = 0 - file.close() - return ret - - -def get_atom_types(fout, atomnums): - covert_type = extract_keyword( - fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4 - ) - atomtype = [] - for i in range(0, len(covert_type)): - atomtype.append(int(covert_type[i]) - 1) - return atomtype - - -def get_atom_name(fout): - file = open(fout) - ret = [] - for value in file: - if "Species number:" in value: - for j in range(len(value.split())): - if value.split()[j] == "Label:": - ret.append(value.split()[j + 1]) - break - file.close() - return ret - - -def get_atom_numbs(atomtypes): - atom_numbs = [] - for i in set(atomtypes): - atom_numbs.append(atomtypes.count(i)) - return atom_numbs - - -def get_virial(fout, cells): - vols = [] - for ii in cells: - ### calucate vol - vols.append(np.linalg.det(ii.reshape([3, 3]))) - ret = extract_keyword(fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 1, 4) - ret = np.array([ret]) - for idx, ii in enumerate(ret): - ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa - # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) - ii *= vols[idx] - return ret - - -def obtain_frame(fname): - NumberOfSpecies = int( - get_single_line_tail(fname, "redata: Number of Atomic Species")[0] - ) - atom_names = 
get_atom_name(fname) - tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) - atom_types = get_atom_types(fname, tot_natoms) - atom_numbs = get_atom_numbs(atom_types) - assert max(atom_types) + 1 == NumberOfSpecies - cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3) - coord = extract_keyword( - fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3 - ) - energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") - force = extract_keyword(fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4) - virial = get_virial(fname, np.array([cell])) - - cell = np.array(cell).reshape(3, 3) - coord = np.array(coord).reshape(tot_natoms, 3) - force = np.array(force).reshape(tot_natoms, 3) - virial = np.array(virial).reshape(3, 3) - - # data = {} - # data['orig'] = np.array([0, 0, 0]) - # data['atom_names'] = atom_names - # data['atom_numbs'] = atom_numbs - # data['atom_types'] = np.array(atom_types) - # data['cells'] = np.array([cell]) - # data['coords'] = np.array([coord]) - # data['energies'] = np.array([energy]) - # data['forces'] = np.array([force]) - # data['virials'] = virial - # return data - return ( - atom_names, - atom_numbs, - np.array(atom_types), - np.array([cell]), - np.array([coord]), - np.array(energy), - np.array([force]), - np.array([virial]), - ) +from dpdata.formats.siesta.output import * # noqa: F403 diff --git a/dpdata/system.py b/dpdata/system.py index 6023891ff..4150abc89 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -22,10 +22,10 @@ # ensure all plugins are loaded! 
import dpdata.plugins import dpdata.plugins.deepmd -from dpdata.amber.mask import load_param_file, pick_by_amber_mask from dpdata.data_type import Axis, DataError, DataType, get_data_types from dpdata.driver import Driver, Minimizer from dpdata.format import Format +from dpdata.formats.amber.mask import load_param_file, pick_by_amber_mask from dpdata.plugin import Plugin from dpdata.utils import ( add_atom_names, diff --git a/dpdata/vasp/__init__.py b/dpdata/vasp/__init__.py index e69de29bb..82488a465 100644 --- a/dpdata/vasp/__init__.py +++ b/dpdata/vasp/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.vasp import * # noqa: F403 diff --git a/dpdata/vasp/outcar.py b/dpdata/vasp/outcar.py index a16fd6f9f..e5f0b7c71 100644 --- a/dpdata/vasp/outcar.py +++ b/dpdata/vasp/outcar.py @@ -1,275 +1,3 @@ from __future__ import annotations -import re -import warnings - -import numpy as np - - -def atom_name_from_potcar_string(instr: str) -> str: - """Get atom name from a potcar element name. - - e.g. Sn_d -> Sn - - Parameters - ---------- - instr : str - input potcar elemenet name - - Returns - ------- - name: str - name of atoms - """ - if "_" in instr: - # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 - return instr.split("_")[0] - else: - return instr - - -def system_info( - lines: list[str], - type_idx_zero: bool = False, -) -> tuple[list[str], list[int], np.ndarray, int | None, int | None]: - """Get system information from lines of an OUTCAR file. - - Parameters - ---------- - lines : list[str] - the lines of the OUTCAR file - type_idx_zero : bool - if true atom types starts from 0 otherwise from 1. - - Returns - ------- - atom_names: list[str] - name of atoms - atom_numbs: list[int] - number of atoms that have a certain name. 
same length as atom_names - atom_types: np.ndarray - type of each atom, the array has same lenght as number of atoms - nelm: optional[int] - the value of NELM parameter - nwrite: optional[int] - the value of NWRITE parameter - """ - atom_names = [] - atom_names_potcar = [] - atom_numbs = None - nelm = None - nwrite = None - for ii in lines: - if "TITEL" in ii: - # get atom names from POTCAR info, tested only for PAW_PBE ... - # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 - _ii = ii.split()[3] - atom_names.append(atom_name_from_potcar_string(_ii)) - elif "POTCAR:" in ii: - # get atom names from POTCAR info, tested only for PAW_PBE ... - # for case like : POTCAR: PAW_PBE Ti 08Apr2002 - _ii = ii.split()[2] - atom_names_potcar.append(atom_name_from_potcar_string(_ii)) - # a stricker check for "NELM"; compatible with distingct formats in different versions(6 and older, newers_expect-to-work) of vasp - elif nelm is None: - m = re.search(r"NELM\s*=\s*(\d+)", ii) - if m: - nelm = int(m.group(1)) - elif nwrite is None: - m = re.search(r"NWRITE\s*=\s*(\d+)", ii) - if m: - nwrite = int(m.group(1)) - if "ions per type" in ii: - atom_numbs_ = [int(s) for s in ii.split()[4:]] - if atom_numbs is None: - atom_numbs = atom_numbs_ - else: - assert atom_numbs == atom_numbs_, "in consistent numb atoms in OUTCAR" - if len(atom_names) == 0: - # try to use atom_names_potcar - if len(atom_names_potcar) == 0: - raise ValueError("cannot get atom names from potcar") - nnames = len(atom_names_potcar) - # the names are repeated. 
check if it is the case - assert atom_names_potcar[: nnames // 2] == atom_names_potcar[nnames // 2 :] - atom_names = atom_names_potcar[: nnames // 2] - assert nelm is not None, "cannot find maximum steps for each SC iteration" - assert atom_numbs is not None, "cannot find ion type info in OUTCAR" - if len(atom_numbs) != len(atom_names): - raise RuntimeError( - f"The number of the atom numbers per each type ({len(atom_numbs)}) " - f"does not match that of the atom types ({len(atom_names)}) detected " - f"from the OUTCAR. This issue may be cause by a bug in vasp <= 6.3. " - f"Please try to convert data from vasprun.xml instead." - ) - atom_names = atom_names[: len(atom_numbs)] - atom_types = [] - for idx, ii in enumerate(atom_numbs): - for jj in range(ii): - if type_idx_zero: - atom_types.append(idx) - else: - atom_types.append(idx + 1) - return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm, nwrite - - -def get_outcar_block(fp, ml=False): - blk = [] - energy_token = ["free energy TOTEN", "free energy ML TOTEN"] - ml_index = int(ml) - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if energy_token[ml_index] in ii: - return blk - return blk - - -def check_outputs(coord, cell, force): - if len(force) == 0: - raise ValueError("cannot find forces in OUTCAR block") - if len(coord) == 0: - raise ValueError("cannot find coordinates in OUTCAR block") - if len(cell) == 0: - raise ValueError("cannot find cell in OUTCAR block") - return True - - -# we assume that the force is printed ... 
-def get_frames(fname, begin=0, step=1, ml=False, convergence_check=True): - with open(fname) as fp: - return _get_frames_lower( - fp, - fname, - begin=begin, - step=step, - ml=ml, - convergence_check=convergence_check, - ) - - -def _get_frames_lower(fp, fname, begin=0, step=1, ml=False, convergence_check=True): - blk = get_outcar_block(fp) - - atom_names, atom_numbs, atom_types, nelm, nwrite = system_info( - blk, type_idx_zero=True - ) - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if cc >= begin and (cc - begin) % step == 0: - coord, cell, energy, force, virial, is_converge = analyze_block( - blk, ntot, nelm, ml - ) - if energy is None: - break - if nwrite == 0: - has_label = len(force) > 0 and len(coord) > 0 and len(cell) > 0 - if not has_label: - warnings.warn("cannot find labels in the frame, ingore") - else: - has_label = check_outputs(coord, cell, force) - if (is_converge or not convergence_check) and has_label: - all_coords.append(coord) - all_cells.append(cell) - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_outcar_block(fp, ml) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - return ( - atom_names, - atom_numbs, - atom_types, - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, ntot, nelm, ml=False): - coord = [] - cell = [] - energy = None - force = [] - virial = None - is_converge = True - sc_index = 0 - # select different searching tokens based on the ml label - energy_token = ["free energy TOTEN", "free energy ML TOTEN"] - energy_index = [4, 5] - virial_token = ["FORCE on cell =-STRESS in cart. coord. units", "ML FORCE"] - virial_index = [14, 4] - cell_token = ["VOLUME and BASIS", "ML FORCE"] - cell_index = [5, 12] - ml_index = int(ml) - for idx, ii in enumerate(lines): - # if set ml == True, is_converged will always be True - if ("Iteration" in ii) and (not ml): - sc_index = int(ii.split()[3][:-1]) - if sc_index >= nelm: - is_converge = False - elif energy_token[ml_index] in ii: - energy = float(ii.split()[energy_index[ml_index]]) - return coord, cell, energy, force, virial, is_converge - elif cell_token[ml_index] in ii: - for dd in range(3): - tmp_l = lines[idx + cell_index[ml_index] + dd] - cell.append([float(ss) for ss in tmp_l.replace("-", " -").split()[0:3]]) - elif virial_token[ml_index] in ii: - in_kB_index = virial_index[ml_index] - while idx + in_kB_index < len(lines) and ( - not lines[idx + in_kB_index].split()[0:2] == ["in", "kB"] - ): - in_kB_index += 1 - assert idx + in_kB_index < len(lines), ( - 'ERROR: "in kB" is not found in OUTCAR. Unable to extract virial.' 
- ) - tmp_v = [float(ss) for ss in lines[idx + in_kB_index].split()[2:8]] - virial = np.zeros([3, 3]) - virial[0][0] = tmp_v[0] - virial[1][1] = tmp_v[1] - virial[2][2] = tmp_v[2] - virial[0][1] = tmp_v[3] - virial[1][0] = tmp_v[3] - virial[1][2] = tmp_v[4] - virial[2][1] = tmp_v[4] - virial[0][2] = tmp_v[5] - virial[2][0] = tmp_v[5] - elif "TOTAL-FORCE" in ii and (("ML" in ii) == ml): - for jj in range(idx + 2, idx + 2 + ntot): - tmp_l = lines[jj] - info = [float(ss) for ss in tmp_l.split()] - coord.append(info[:3]) - force.append(info[3:6]) - return coord, cell, energy, force, virial, is_converge +from dpdata.formats.vasp.outcar import * # noqa: F403 diff --git a/dpdata/vasp/poscar.py b/dpdata/vasp/poscar.py index 78b8dbbeb..c207dd2ae 100644 --- a/dpdata/vasp/poscar.py +++ b/dpdata/vasp/poscar.py @@ -1,134 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - - -def _to_system_data_lower(lines, cartesian=True, selective_dynamics=False): - def move_flag_mapper(flag): - if flag == "T": - return True - elif flag == "F": - return False - else: - raise RuntimeError(f"Invalid move flag: {flag}") - - """Treat as cartesian poscar.""" - system = {} - system["atom_names"] = [str(ii) for ii in lines[5].split()] - system["atom_numbs"] = [int(ii) for ii in lines[6].split()] - scale = float(lines[1]) - cell = [] - move_flags = [] - for ii in range(2, 5): - boxv = [float(jj) for jj in lines[ii].split()] - boxv = np.array(boxv) * scale - cell.append(boxv) - system["cells"] = [np.array(cell)] - natoms = sum(system["atom_numbs"]) - coord = [] - for ii in range(8, 8 + natoms): - tmp = lines[ii].split() - tmpv = [float(jj) for jj in tmp[:3]] - if cartesian: - tmpv = np.array(tmpv) * scale - else: - tmpv = np.matmul(np.array(tmpv), system["cells"][0]) - coord.append(tmpv) - if selective_dynamics: - if len(tmp) == 6: - move_flags.append(list(map(move_flag_mapper, tmp[3:]))) - else: - raise RuntimeError( - f"Invalid move flags, should be 6 columns, got 
{tmp}" - ) - - system["coords"] = [np.array(coord)] - system["orig"] = np.zeros(3) - atom_types = [] - for idx, ii in enumerate(system["atom_numbs"]): - for jj in range(ii): - atom_types.append(idx) - system["atom_types"] = np.array(atom_types, dtype=int) - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - if move_flags: - move_flags = np.array(move_flags, dtype=bool) - move_flags = move_flags.reshape((1, natoms, 3)) - system["move"] = np.array(move_flags, dtype=bool) - return system - - -def to_system_data(lines): - # remove the line that has 'selective dynamics' - selective_dynamics = False - if lines[7][0] == "S" or lines[7][0] == "s": - selective_dynamics = True - lines.pop(7) - is_cartesian = lines[7][0] in ["C", "c", "K", "k"] - if not is_cartesian: - if lines[7][0] not in ["d", "D"]: - raise RuntimeError( - "seem not to be a valid POSCAR of vasp 5.x, may be a POSCAR of vasp 4.x?" - ) - return _to_system_data_lower(lines, is_cartesian, selective_dynamics) - - -def from_system_data(system, f_idx=0, skip_zeros=True): - ret = "" - for ii, name in zip(system["atom_numbs"], system["atom_names"]): - if ii == 0: - continue - ret += "%s%d " % (name, ii) # noqa: UP031 - ret += "\n" - ret += "1.0\n" - for ii in system["cells"][f_idx]: - for jj in ii: - ret += f"{jj:.16e} " - ret += "\n" - for idx, ii in enumerate(system["atom_names"]): - if system["atom_numbs"][idx] == 0: - continue - ret += f"{ii} " - ret += "\n" - for ii in system["atom_numbs"]: - if ii == 0: - continue - ret += "%d " % ii # noqa: UP031 - ret += "\n" - move = system.get("move", None) - if move is not None and len(move) > 0: - ret += "Selective Dynamics\n" - - # should use Cartesian for VESTA software - ret += "Cartesian\n" - atype = system["atom_types"] - posis = system["coords"][f_idx] - # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] - # sort_idx = np.argsort(atype, kind = 'mergesort') - sort_idx = np.lexsort((np.arange(len(atype)), atype)) - 
atype = atype[sort_idx] - posis = posis[sort_idx] - if move is not None and len(move) > 0: - move = move[f_idx][sort_idx] - - if isinstance(move, np.ndarray): - move = move.tolist() - - posi_list = [] - for idx in range(len(posis)): - ii_posi = posis[idx] - line = f"{ii_posi[0]:15.10f} {ii_posi[1]:15.10f} {ii_posi[2]:15.10f}" - if move is not None and len(move) > 0: - move_flags = move[idx] - if not isinstance(move_flags, list) or len(move_flags) != 3: - raise RuntimeError( - f"Invalid move flags: {move_flags}, should be a list of 3 bools" - ) - line += " " + " ".join("T" if flag else "F" for flag in move_flags) - - posi_list.append(line) - - posi_list.append("") - ret += "\n".join(posi_list) - return ret +from dpdata.formats.vasp.poscar import * # noqa: F403 diff --git a/dpdata/vasp/xml.py b/dpdata/vasp/xml.py old mode 100755 new mode 100644 index 1b407c254..808ea7adb --- a/dpdata/vasp/xml.py +++ b/dpdata/vasp/xml.py @@ -1,176 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import xml.etree.ElementTree as ET -from typing import Any - -import numpy as np - - -def check_name(item, name): - assert item.attrib["name"] == name, ( - "item attrib '{}' dose not math required '{}'".format(item.attrib["name"], name) - ) - - -def get_varray(varray): - array = [] - for vv in varray.findall("v"): - array.append([float(ii) for ii in vv.text.split()]) - return np.array(array) - - -def analyze_atominfo(atominfo_xml): - check_name(atominfo_xml.find("array"), "atoms") - eles = [] - types = [] - visited = set() - for ii in atominfo_xml.find("array").find("set"): - atom_type = int(ii.findall("c")[1].text) - if atom_type not in visited: - eles.append(ii.findall("c")[0].text.strip()) - visited.add(atom_type) - types.append(atom_type) - return eles, types - - -def analyze_calculation( - cc: Any, - nelm: int | None, -) -> tuple[np.ndarray, np.ndarray, float, np.ndarray, np.ndarray | None, bool | None]: - """Analyze a calculation block. 
- - Parameters - ---------- - cc : xml.etree.ElementTree.Element - The xml element for a ion step calculation - nelm : Optional[int] - The number nelm, if it is not None, convergence check is performed. - - Returns - ------- - posi : np.ndarray - The positions - cell : np.ndarray - The cell - ener : float - The energy - force : np.ndarray - The forces - str : Optional[np.ndarray] - The stress - is_converged: Optional[bool] - If the scf calculation is converged. Only return boolean when - nelm is not None. Otherwise return None. - - """ - structure_xml = cc.find("structure") - check_name(structure_xml.find("crystal").find("varray"), "basis") - check_name(structure_xml.find("varray"), "positions") - cell = get_varray(structure_xml.find("crystal").find("varray")) - posi = get_varray(structure_xml.find("varray")) - strs = None - is_converged = None - if nelm is not None: - niter = len(cc.findall(".//scstep")) - is_converged = niter < nelm - for vv in cc.findall("varray"): - if vv.attrib["name"] == "forces": - forc = get_varray(vv) - elif vv.attrib["name"] == "stress": - strs = get_varray(vv) - for ii in cc.find("energy").findall("i"): - if ii.attrib["name"] == "e_fr_energy": - ener = float(ii.text) - return posi, cell, ener, forc, strs, is_converged - - -def formulate_config(eles, types, posi, cell, ener, forc, strs_): - strs = strs_ / 1602 - natoms = len(types) - ntypes = len(eles) - ret = "" - ret += "#N %d %d\n" % (natoms, ntypes - 1) # noqa: UP031 - ret += "#C " - for ii in eles: - ret += " " + ii - ret += "\n" - ret += "##\n" - ret += f"#X {cell[0][0]:13.8f} {cell[0][1]:13.8f} {cell[0][2]:13.8f}\n" - ret += f"#Y {cell[1][0]:13.8f} {cell[1][1]:13.8f} {cell[1][2]:13.8f}\n" - ret += f"#Z {cell[2][0]:13.8f} {cell[2][1]:13.8f} {cell[2][2]:13.8f}\n" - ret += "#W 1.0\n" - ret += "#E %.10f\n" % (ener / natoms) - ret += f"#S {strs[0][0]:.9e} {strs[1][1]:.9e} {strs[2][2]:.9e} {strs[0][1]:.9e} {strs[1][2]:.9e} {strs[0][2]:.9e}\n" - ret += "#F\n" - for ii in range(natoms): - 
sp = np.matmul(cell.T, posi[ii]) - ret += "%d" % (types[ii] - 1) # noqa: UP031 - ret += f" {sp[0]:12.6f} {sp[1]:12.6f} {sp[2]:12.6f}" - ret += f" {forc[ii][0]:12.6f} {forc[ii][1]:12.6f} {forc[ii][2]:12.6f}" - ret += "\n" - return ret - - -def analyze(fname, type_idx_zero=False, begin=0, step=1, convergence_check=True): - """Deal with broken xml file.""" - all_posi = [] - all_cell = [] - all_ener = [] - all_forc = [] - all_strs = [] - cc = 0 - if convergence_check: - tree = ET.parse(fname) - root = tree.getroot() - parameters = root.find(".//parameters") - nelm = parameters.find(".//i[@name='NELM']") - # will check convergence - nelm = int(nelm.text) - else: - # not checking convergence - nelm = None - try: - for event, elem in ET.iterparse(fname): - if elem.tag == "atominfo": - eles, types = analyze_atominfo(elem) - types = np.array(types, dtype=int) - if type_idx_zero: - types = types - 1 - if elem.tag == "calculation": - posi, cell, ener, forc, strs, is_converged = analyze_calculation( - elem, nelm - ) - # record when not checking convergence or is_converged - # and the step criteria is satisfied - if ( - (nelm is None or is_converged) - and cc >= begin - and (cc - begin) % step == 0 - ): - all_posi.append(posi) - all_cell.append(cell) - all_ener.append(ener) - all_forc.append(forc) - if strs is not None: - all_strs.append(strs) - cc += 1 - except ET.ParseError: - return ( - eles, - types, - np.array(all_cell), - np.array(all_posi), - np.array(all_ener), - np.array(all_forc), - np.array(all_strs), - ) - return ( - eles, - types, - np.array(all_cell), - np.array(all_posi), - np.array(all_ener), - np.array(all_forc), - np.array(all_strs), - ) +from dpdata.formats.vasp.xml import * # noqa: F403 diff --git a/dpdata/xyz/__init__.py b/dpdata/xyz/__init__.py index e69de29bb..c43228e18 100644 --- a/dpdata/xyz/__init__.py +++ b/dpdata/xyz/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.xyz import * # noqa: F403 diff --git 
a/dpdata/xyz/quip_gap_xyz.py b/dpdata/xyz/quip_gap_xyz.py index 71e976de6..4eca6dc75 100644 --- a/dpdata/xyz/quip_gap_xyz.py +++ b/dpdata/xyz/quip_gap_xyz.py @@ -1,250 +1,3 @@ -#!/usr/bin/env python3 -# %% from __future__ import annotations -import re -from collections import OrderedDict - -import numpy as np - -from dpdata.periodic_table import Element - - -class QuipGapxyzSystems: - """deal with QuipGapxyzFile.""" - - def __init__(self, file_name): - self.file_object = open(file_name) - self.block_generator = self.get_block_generator() - - def __iter__(self): - return self - - def __next__(self): - return self.handle_single_xyz_frame(next(self.block_generator)) - - def __del__(self): - self.file_object.close() - - def get_block_generator(self): - p3 = re.compile(r"^\s*(\d+)\s*") - while True: - line = self.file_object.readline() - if not line: - break - if p3.match(line): - atom_num = int(p3.match(line).group(1)) - lines = [] - lines.append(line) - for ii in range(atom_num + 1): - lines.append(self.file_object.readline()) - if not lines[-1]: - raise RuntimeError( - f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" - ) - yield lines - - @staticmethod - def handle_single_xyz_frame(lines): - atom_num = int(lines[0].strip("\n").strip()) - if len(lines) != atom_num + 2: - raise RuntimeError( - f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" - ) - data_format_line = lines[1].strip("\n").strip() + " " - field_value_pattern = re.compile( - r"(?P\S+)=(?P[\'\"]?)(?P.*?)(?P=quote)\s+" - ) - prop_pattern = re.compile( - r"(?P\w+?):(?P[a-zA-Z]):(?P\d+)" - ) - - data_format_list = [ - kv_dict.groupdict() - for kv_dict in field_value_pattern.finditer(data_format_line) - ] - field_dict = {} - for item in data_format_list: - field_dict[item["key"]] = item["value"] - - Properties = field_dict["Properties"] - prop_list = [ - kv_dict.groupdict() for kv_dict in prop_pattern.finditer(Properties) - ] - - data_lines = [] - for line in lines[2:]: 
- data_lines.append(list(filter(bool, line.strip().split()))) - data_array = np.array(data_lines) - used_colomn = 0 - - type_array = None - coords_array = None - Z_array = None - force_array = None - virials = None - for kv_dict in prop_list: - if kv_dict["key"] == "species": - if kv_dict["datatype"] != "S": - raise RuntimeError( - "datatype for species must be 'S' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - type_array = data_array[ - :, used_colomn : used_colomn + field_length - ].flatten() - used_colomn += field_length - continue - elif kv_dict["key"] == "pos": - if kv_dict["datatype"] != "R": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - coords_array = data_array[:, used_colomn : used_colomn + field_length] - used_colomn += field_length - continue - elif kv_dict["key"] == "Z": - if kv_dict["datatype"] != "I": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - Z_array = data_array[ - :, used_colomn : used_colomn + field_length - ].flatten() - used_colomn += field_length - continue - elif kv_dict["key"] == "force": - if kv_dict["datatype"] != "R": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - force_array = data_array[:, used_colomn : used_colomn + field_length] - used_colomn += field_length - continue - else: - raise RuntimeError("unknown field {}".format(kv_dict["key"])) - - type_num_dict = OrderedDict() - atom_type_list = [] - type_map = {} - temp_atom_max_index = 0 - if type_array is None: - raise RuntimeError("type_array can't be None type, check .xyz file") - for ii in type_array: - if ii not in type_map: - type_map[ii] = temp_atom_max_index - temp_atom_max_index += 1 - temp_atom_index = type_map[ii] - 
atom_type_list.append(temp_atom_index) - type_num_dict[ii] = 1 - else: - temp_atom_index = type_map[ii] - atom_type_list.append(temp_atom_index) - type_num_dict[ii] += 1 - type_num_list = [] - for atom_type, atom_num in type_num_dict.items(): - type_num_list.append((atom_type, atom_num)) - type_num_array = np.array(type_num_list) - if field_dict.get("virial", None): - virials = np.array( - [ - np.array( - list(filter(bool, field_dict["virial"].split(" "))) - ).reshape(3, 3) - ] - ).astype(np.float64) - else: - virials = None - - info_dict = {} - info_dict["atom_names"] = list(type_num_array[:, 0]) - info_dict["atom_numbs"] = list(type_num_array[:, 1].astype(int)) - info_dict["atom_types"] = np.array(atom_type_list).astype(int) - info_dict["cells"] = np.array( - [ - np.array(list(filter(bool, field_dict["Lattice"].split(" ")))).reshape( - 3, 3 - ) - ] - ).astype(np.float64) - info_dict["coords"] = np.array([coords_array]).astype(np.float64) - info_dict["energies"] = np.array([field_dict["energy"]]).astype(np.float64) - info_dict["forces"] = np.array([force_array]).astype(np.float64) - if virials is not None: - info_dict["virials"] = virials - info_dict["orig"] = np.zeros(3) - return info_dict - - -def format_single_frame(data, frame_idx): - """Format a single frame of system data into QUIP/GAP XYZ format lines. 
- - Parameters - ---------- - data : dict - system data - frame_idx : int - frame index - - Returns - ------- - list[str] - lines for the frame - """ - # Number of atoms - natoms = len(data["atom_types"]) - - # Build header line with metadata - header_parts = [] - - # Energy - energy = data["energies"][frame_idx] - header_parts.append(f"energy={energy:.12e}") - - # Virial (if present) - if "virials" in data: - virial = data["virials"][frame_idx] - virial_str = " ".join(f"{v:.12e}" for v in virial.flatten()) - header_parts.append(f'virial="{virial_str}"') - - # Lattice - cell = data["cells"][frame_idx] - lattice_str = " ".join(f"{c:.12e}" for c in cell.flatten()) - header_parts.append(f'Lattice="{lattice_str}"') - - # Properties - header_parts.append("Properties=species:S:1:pos:R:3:Z:I:1:force:R:3") - - header_line = " ".join(header_parts) - - # Format atom lines - atom_lines = [] - coords = data["coords"][frame_idx] - forces = data["forces"][frame_idx] - atom_names = np.array(data["atom_names"]) - atom_types = data["atom_types"] - - for i in range(natoms): - atom_type_idx = atom_types[i] - species = atom_names[atom_type_idx] - x, y, z = coords[i] - fx, fy, fz = forces[i] - atomic_number = Element(species).Z - - atom_line = f"{species} {x:.11e} {y:.11e} {z:.11e} {atomic_number} {fx:.11e} {fy:.11e} {fz:.11e}" - atom_lines.append(atom_line) - - # Combine all lines for this frame - frame_lines = [str(natoms), header_line] + atom_lines - return frame_lines +from dpdata.formats.xyz.quip_gap_xyz import * # noqa: F403 diff --git a/dpdata/xyz/xyz.py b/dpdata/xyz/xyz.py index 0c36ac32b..9db695c2a 100644 --- a/dpdata/xyz/xyz.py +++ b/dpdata/xyz/xyz.py @@ -1,59 +1,3 @@ from __future__ import annotations -import numpy as np - - -def coord_to_xyz(coord: np.ndarray, types: list) -> str: - """Convert coordinates and types to xyz format. 
- - Parameters - ---------- - coord : np.ndarray - coordinates, Nx3 array - types : list - list of types - - Returns - ------- - str - xyz format string - - Examples - -------- - >>> coord_to_xyz(np.ones((1,3)), ["C"]) - 1 - - C 1.000000 1.000000 1.000000 - """ - buff = [str(len(types)), ""] - for at, cc in zip(types, coord): - buff.append("{} {:.6f} {:.6f} {:.6f}".format(at, *cc)) - return "\n".join(buff) - - -def xyz_to_coord(xyz: str) -> tuple[np.ndarray, list]: - """Convert xyz format to coordinates and types. - - Parameters - ---------- - xyz : str - xyz format string - - Returns - ------- - coords : np.ndarray - coordinates, Nx3 array - types : list - list of types - """ - symbols = [] - coords = [] - for ii, line in enumerate(xyz.split("\n")): - if ii == 0: - natoms = int(line.strip()) - elif 2 <= ii <= 1 + natoms: - # symbol x y z - symbol, x, y, z = line.split() - coords.append((float(x), float(y), float(z))) - symbols.append(symbol) - return np.array(coords), symbols +from dpdata.formats.xyz.xyz import * # noqa: F403 diff --git a/tests/context.py b/tests/context.py index 3214e28ea..85d7c33cf 100644 --- a/tests/context.py +++ b/tests/context.py @@ -5,7 +5,7 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import dpdata -import dpdata.gaussian.gjf # noqa: F401 +import dpdata.formats.gaussian.gjf # noqa: F401 import dpdata.md.msd # noqa: F401 import dpdata.md.water # noqa: F401 import dpdata.stat # noqa: F401 diff --git a/tests/test_abacus_stru_dump.py b/tests/test_abacus_stru_dump.py index cf071920d..7b4317c37 100644 --- a/tests/test_abacus_stru_dump.py +++ b/tests/test_abacus_stru_dump.py @@ -7,7 +7,7 @@ from context import dpdata from test_vasp_poscar_dump import myfilecmp -from dpdata.abacus.stru import parse_pos_oneline +from dpdata.formats.abacus.stru import parse_pos_oneline class TestStruDump(unittest.TestCase): diff --git a/tests/test_cell_to_low_triangle.py b/tests/test_cell_to_low_triangle.py index 
34d0a90ae..93a128263 100644 --- a/tests/test_cell_to_low_triangle.py +++ b/tests/test_cell_to_low_triangle.py @@ -3,12 +3,13 @@ import unittest import numpy as np -from context import dpdata + +from dpdata.formats.cp2k.cell import cell_to_low_triangle class TestCellToLowTriangle(unittest.TestCase): def test_func1(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2 ) cell_2 = np.asarray([[6, 0, 0], [0, 6, 0], [0, 0, 6]]) @@ -17,7 +18,7 @@ def test_func1(self): self.assertAlmostEqual(cell_1[ii, jj], cell_2[ii, jj], places=6) def test_func2(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 6, 6, np.pi * 1 / 3, np.pi * 1 / 3, np.pi * 1 / 3 ) cell_2 = np.asarray( @@ -28,7 +29,7 @@ def test_func2(self): self.assertAlmostEqual(cell_1[ii, jj], cell_2[ii, jj], places=6) def test_func3(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 7, 8, np.pi * 133 / 180, np.pi * 84 / 180, np.pi * 69 / 180 ) cell_2 = np.asarray( @@ -45,21 +46,17 @@ def test_func3(self): def test_func4(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( - 0.1, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2 - ) + cell_to_low_triangle(0.1, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2) self.assertTrue("A==0.1" in str(c.exception)) def test_func5(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( - 6, 6, 6, np.pi * 3 / 180, np.pi * 1 / 2, np.pi * 1 / 2 - ) + cell_to_low_triangle(6, 6, 6, np.pi * 3 / 180, np.pi * 1 / 2, np.pi * 1 / 2) self.assertTrue("alpha" in str(c.exception)) def test_func6(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( + cell_to_low_triangle( 6, 7, 8, np.pi * 153 / 180, np.pi * 84 / 180, np.pi * 69 / 180 ) self.assertTrue("lz^2" in str(c.exception)) diff --git a/tests/test_gaussian_driver.py 
b/tests/test_gaussian_driver.py index ff1638488..3c28738c1 100644 --- a/tests/test_gaussian_driver.py +++ b/tests/test_gaussian_driver.py @@ -9,6 +9,8 @@ from comp_sys import CompSys, IsNoPBC from context import dpdata +from dpdata.formats.gaussian.gjf import detect_multiplicity + @unittest.skipIf(shutil.which("g16") is None, "g16 is not installed") @unittest.skipIf( @@ -83,9 +85,7 @@ def test_detect_multiplicity(self): self._check_multiplicity(["C", "H"], 2) def _check_multiplicity(self, symbols, multiplicity): - self.assertEqual( - dpdata.gaussian.gjf.detect_multiplicity(np.array(symbols)), multiplicity - ) + self.assertEqual(detect_multiplicity(np.array(symbols)), multiplicity) def tearDown(self): if os.path.exists("gaussian/tmp.gjf"): diff --git a/tests/test_lammps_lmp_dump.py b/tests/test_lammps_lmp_dump.py index a717c6cfc..86e4b3cdd 100644 --- a/tests/test_lammps_lmp_dump.py +++ b/tests/test_lammps_lmp_dump.py @@ -8,7 +8,7 @@ from context import dpdata from poscars.poscar_ref_oh import TestPOSCARoh -from dpdata.lammps.lmp import rotate_to_lower_triangle +from dpdata.formats.lammps.lmp import rotate_to_lower_triangle TEST_DIR = os.path.dirname(__file__) POSCAR_CONF_LMP = os.path.join(TEST_DIR, "poscars", "conf.lmp") diff --git a/tests/test_lammps_spin.py b/tests/test_lammps_spin.py index d3d58920e..bcb3442b7 100644 --- a/tests/test_lammps_spin.py +++ b/tests/test_lammps_spin.py @@ -7,7 +7,7 @@ import numpy as np from context import dpdata -from dpdata.lammps.dump import get_spin +from dpdata.formats.lammps.dump import get_spin TRAJ_NO_ID = """ITEM: TIMESTEP 0 diff --git a/tests/test_lmdb.py b/tests/test_lmdb.py index ee651edce..bc0fdeecc 100644 --- a/tests/test_lmdb.py +++ b/tests/test_lmdb.py @@ -17,7 +17,7 @@ ) from context import dpdata -from dpdata.lmdb.format import LMDBFrameError, LMDBMetadataError +from dpdata.formats.lmdb.format import LMDBFrameError, LMDBMetadataError class TestLMDBLabeledSystem(unittest.TestCase, CompLabeledSys, IsPBC): diff --git 
a/tests/test_msd.py b/tests/test_msd.py index 5d26db645..9d53ba0fc 100644 --- a/tests/test_msd.py +++ b/tests/test_msd.py @@ -5,6 +5,8 @@ import numpy as np from context import dpdata +from dpdata.md.msd import msd as calc_msd + class TestMSD(unittest.TestCase): def setUp(self): @@ -22,9 +24,9 @@ def setUp(self): def test_msd(self): # print(self.system['atom_types'] == 0) - msd = dpdata.md.msd.msd(self.system) - msd0 = dpdata.md.msd.msd(self.system, self.system["atom_types"] == 0) - msd1 = dpdata.md.msd.msd(self.system, self.system["atom_types"] == 1) + msd = calc_msd(self.system) + msd0 = calc_msd(self.system, self.system["atom_types"] == 0) + msd1 = calc_msd(self.system, self.system["atom_types"] == 1) # print(msd) ncomp = msd.shape[0] for ii in range(ncomp): diff --git a/tests/test_qe_cp_traj.py b/tests/test_qe_cp_traj.py index d6403ff67..a670bd4d0 100644 --- a/tests/test_qe_cp_traj.py +++ b/tests/test_qe_cp_traj.py @@ -5,6 +5,8 @@ import numpy as np from context import dpdata +from dpdata.formats.qe.traj import convert_celldm + bohr2ang = dpdata.unit.LengthConversion("bohr", "angstrom").value() @@ -61,7 +63,7 @@ def setUp(self): class TestConverCellDim(unittest.TestCase): def test_case_null(self): - cell = dpdata.qe.traj.convert_celldm(8, [1, 1, 1]) + cell = convert_celldm(8, [1, 1, 1]) ref = np.eye(3) for ii in range(3): for jj in range(3): diff --git a/tests/test_water_ions.py b/tests/test_water_ions.py index 40c1c143c..34ab21279 100644 --- a/tests/test_water_ions.py +++ b/tests/test_water_ions.py @@ -5,6 +5,13 @@ from context import dpdata +from dpdata.md.water import ( + compute_bonds, + compute_bonds_ase, + compute_bonds_naive, + find_ions, +) + try: import ase # noqa: F401 import ase.neighborlist # noqa: F401 @@ -20,16 +27,14 @@ def setUp(self): self.system.from_lammps_lmp( os.path.join("poscars", "conf.waterion.lmp"), type_map=["O", "H"] ) - self.bonds = dpdata.md.water.compute_bonds( + self.bonds = compute_bonds( self.system.data["cells"][0], 
self.system.data["coords"][0], self.system.data["atom_types"], ) def test_ions_count(self): - no, noh, noh2, noh3, nh = dpdata.md.water.find_ions( - self.system.data["atom_types"], self.bonds - ) + no, noh, noh2, noh3, nh = find_ions(self.system.data["atom_types"], self.bonds) self.assertEqual(len(no), 0) self.assertEqual(len(noh), 1) self.assertEqual(len(noh2), 125) @@ -46,12 +51,12 @@ def setUp(self): self.system.from_lammps_lmp( os.path.join("poscars", "conf.waterion.lmp"), type_map=["O", "H"] ) - self.bonds = dpdata.md.water.compute_bonds_naive( + self.bonds = compute_bonds_naive( self.system.data["cells"][0], self.system.data["coords"][0], self.system.data["atom_types"], ) - self.bonds_ase = dpdata.md.water.compute_bonds_ase( + self.bonds_ase = compute_bonds_ase( self.system.data["cells"][0], self.system.data["coords"][0], self.system.data["atom_types"],