diff --git a/dpdata/__init__.py b/dpdata/__init__.py index f2cd233ff..69442385e 100644 --- a/dpdata/__init__.py +++ b/dpdata/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from . import lammps, md, vasp +from . import md from .bond_order_system import BondOrderSystem from .system import LabeledSystem, MultiSystems, System @@ -9,11 +9,10 @@ except ImportError: from .__about__ import __version__ + __all__ = [ "__version__", - "lammps", "md", - "vasp", "System", "LabeledSystem", "MultiSystems", diff --git a/dpdata/abacus/__init__.py b/dpdata/abacus/__init__.py index e69de29bb..c917d8d4c 100644 --- a/dpdata/abacus/__init__.py +++ b/dpdata/abacus/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.abacus import * # noqa: F403 diff --git a/dpdata/abacus/md.py b/dpdata/abacus/md.py index 8df156c94..2474d87db 100644 --- a/dpdata/abacus/md.py +++ b/dpdata/abacus/md.py @@ -1,224 +1,3 @@ from __future__ import annotations -import os -import warnings - -import numpy as np - -from dpdata.utils import open_file - -from .scf import ( - bohr2ang, - get_geometry_in, - get_mag_force, - kbar2evperang3, -) -from .stru import get_frame_from_stru - -# Read in geometries from an ABACUS MD trajectory. -# The atomic coordinates are read in from generated files in OUT.XXXX. -# Energies, forces -# IMPORTANT: the program defaultly takes STRU input file as standard cell information, -# therefore the direct and cartesan coordinates read could be different from the ones in -# the output cif files!!! -# It is highly recommanded to use ORTHOGANAL coordinates in STRU file if you wish to get -# same coordinates in both dpdata and output cif files. - - -def get_path_out(fname, inlines): - # This function is different from the same-name function in scf.py. - # This function returns OUT.XXXX's base directory. 
- path_out = os.path.join(fname, "OUT.ABACUS/") - for line in inlines: - if len(line) > 0 and "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - path_out = os.path.join(fname, f"OUT.{suffix}/") - break - return path_out - - -def get_coord_dump_freq(inlines): - for line in inlines: - if len(line) > 0 and "md_dumpfreq" in line and "md_dumpfreq" == line.split()[0]: - return int(line.split()[1]) - return 1 - - -def get_coords_from_dump(dumplines, natoms): - nlines = len(dumplines) - total_natoms = sum(natoms) - # The output of VIRIAL, FORCE, and VELOCITY are controlled by INPUT parameters dump_virial, dump_force, and dump_vel, respectively. - # So the search of keywords can determine whether these datas are printed into MD_dump. - calc_stress = False - calc_force = False - check_line = 6 - if "VIRIAL" in dumplines[6]: - calc_stress = True - check_line = 10 - assert "POSITION" in dumplines[check_line], ( - "keywords 'POSITION' cannot be found in the 6th line. Please check." - ) - if "FORCE" in dumplines[check_line]: - calc_force = True - - nframes_dump = -1 - if calc_stress: - nframes_dump = int(nlines / (total_natoms + 13)) - else: - nframes_dump = int(nlines / (total_natoms + 9)) - assert nframes_dump > 0, ( - "Number of lines in MD_dump file = %d. Number of atoms = %d. The MD_dump file is incomplete." 
# noqa: UP031 - % (nlines, total_natoms) - ) - cells = np.zeros([nframes_dump, 3, 3]) - stresses = np.zeros([nframes_dump, 3, 3]) - forces = np.zeros([nframes_dump, total_natoms, 3]) - coords = np.zeros([nframes_dump, total_natoms, 3]) - iframe = 0 - for iline in range(nlines): - if "MDSTEP" in dumplines[iline]: - # read in LATTICE_CONSTANT - # for abacus version >= v3.1.4, the unit is angstrom, and "ANGSTROM" is added at the end - # for abacus version < v3.1.4, the unit is bohr - celldm = float(dumplines[iline + 1].split()[1]) - newversion = True - if "Angstrom" not in dumplines[iline + 1]: - celldm *= bohr2ang # transfer unit to ANGSTROM - newversion = False - - # read in LATTICE_VECTORS - for ix in range(3): - cells[iframe, ix] = ( - np.array([float(i) for i in dumplines[iline + 3 + ix].split()[0:3]]) - * celldm - ) - if calc_stress: - stresses[iframe, ix] = np.array( - [float(i) for i in dumplines[iline + 7 + ix].split()[0:3]] - ) - - if calc_stress: - skipline = 11 - else: - skipline = 7 - - for iat in range(total_natoms): - # INDEX LABEL POSITION (Angstrom) FORCE (eV/Angstrom) VELOCITY (Angstrom/fs) - # 0 Sn 0.000000000000 0.000000000000 0.000000000000 -0.000000000000 -0.000000000001 -0.000000000001 0.001244557166 -0.000346684288 0.000768457739 - # 1 Sn 0.000000000000 3.102800034079 3.102800034079 -0.000186795145 -0.000453823768 -0.000453823768 0.000550996187 -0.000886442775 0.001579501983 - # for abacus version >= v3.1.4, the value of POSITION is the real cartessian position, and unit is angstrom, and if cal_force the VELOCITY is added at the end. 
- # for abacus version < v3.1.4, the real position = POSITION * celldm - coords[iframe, iat] = np.array( - [float(i) for i in dumplines[iline + skipline + iat].split()[2:5]] - ) - - if not newversion: - coords[iframe, iat] *= celldm - - if calc_force: - forces[iframe, iat] = np.array( - [ - float(i) - for i in dumplines[iline + skipline + iat].split()[5:8] - ] - ) - iframe += 1 - assert iframe == nframes_dump, ( - "iframe=%d, nframe_dump=%d. Number of frames does not match number of lines in MD_dump." # noqa: UP031 - % (iframe, nframes_dump) - ) - stresses *= kbar2evperang3 - return coords, cells, forces, stresses - - -def get_energy(outlines, ndump, dump_freq): - energy = [] - nenergy = 0 - for line_idx, line in enumerate(outlines): - if "final etot is" in line or "#TOTAL ENERGY#" in line: - if nenergy % dump_freq == 0: - energy.append(float(line.split()[-2])) - nenergy += 1 - elif "!! convergence has not been achieved" in line: - if nenergy % dump_freq == 0: - energy.append(np.nan) - nenergy += 1 - assert ndump == len(energy), ( - "Number of total energies in running_md.log = %d. Number of frames in MD_dump = %d. Please check." # noqa: UP031 - % (len(energy), ndump) - ) - energy = np.array(energy) - return energy - - -def get_frame(fname): - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU - path_out = get_path_out(fname, inlines) - - data = get_frame_from_stru(geometry_path_in) - natoms = data["atom_numbs"] - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - - # This coords is not to be used. 
- dump_freq = get_coord_dump_freq(inlines=inlines) - # ndump = int(os.popen("ls -l %s | grep 'md_pos_' | wc -l" %path_out).readlines()[0]) - # number of dumped geometry files - # coords = get_coords_from_cif(ndump, dump_freq, atom_names, natoms, types, path_out, cell) - with open_file(os.path.join(path_out, "MD_dump")) as fp: - dumplines = fp.read().split("\n") - coords, cells, force, stress = get_coords_from_dump(dumplines, natoms) - ndump = np.shape(coords)[0] - with open_file(os.path.join(path_out, "running_md.log")) as fp: - outlines = fp.read().split("\n") - energy = get_energy(outlines, ndump, dump_freq) - - unconv_stru = "" - for i, iene in enumerate(energy): - if np.isnan(iene): - coords = np.delete(coords, i - ndump, axis=0) - cells = np.delete(cells, i - ndump, axis=0) - force = np.delete(force, i - ndump, axis=0) - stress = np.delete(stress, i - ndump, axis=0) - energy = np.delete(energy, i - ndump, axis=0) - unconv_stru += "%d " % i # noqa: UP031 - ndump = len(energy) - if unconv_stru != "": - warnings.warn(f"Structure {unconv_stru} are unconverged and not collected!") - - for iframe in range(ndump): - stress[iframe] *= np.linalg.det(cells[iframe, :, :].reshape([3, 3])) - if np.sum(np.abs(stress[0])) < 1e-10: - stress = None - - magmom, magforce = get_mag_force(outlines) - - data["cells"] = cells - # for idx in range(ndump): - # data['cells'][:, :, :] = cell - data["coords"] = coords - data["energies"] = energy - data["forces"] = force - data["virials"] = stress - if not isinstance(data["virials"], np.ndarray): - del data["virials"] - data["orig"] = np.zeros(3) - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce - - # need to expand the move. 
- if "move" in data: - data["move"] = [data["move"][0] for i in range(ndump)] - - return data +from dpdata.formats.abacus.md import * # noqa: F403 diff --git a/dpdata/abacus/relax.py b/dpdata/abacus/relax.py index db60412b8..3d7a40315 100644 --- a/dpdata/abacus/relax.py +++ b/dpdata/abacus/relax.py @@ -1,265 +1,3 @@ from __future__ import annotations -import glob -import os - -import numpy as np - -from dpdata.utils import open_file - -from .scf import ( - bohr2ang, - collect_force, - collect_stress, - get_geometry_in, - get_mag_force, - kbar2evperang3, -) -from .stru import get_frame_from_stru - -# Read in geometries from an ABACUS RELAX(CELL-RELAX) trajectory in OUT.XXXX/runnning_relax/cell-relax.log. - - -def get_log_file(fname, inlines): - suffix = "ABACUS" - calculation = "scf" - for line in inlines: - if "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - elif "calculation" in line and "calculation" == line.split()[0]: - calculation = line.split()[1] - logf = os.path.join(fname, f"OUT.{suffix}/running_{calculation}.log") - return logf - - -def get_relax_stru_files(output_dir): - """Find the STRU files in the output directory. - - Args: - output_dir (str): output directory - - Returns - ------- - strus: list of STRU files - example: - ["STRU_ION1_D", "STRU_ION2_D"] - """ - return glob.glob(os.path.join(output_dir, "STRU_ION*_D")) - - -def get_coords_from_log(loglines, natoms, stru_files=None): - """NOTICE: unit of coords and cells is Angstrom - order: - coordinate - cell (no output if cell is not changed) - energy (no output, if SCF is not converged) - force (no output, if cal_force is not setted or abnormal ending) - stress (no output, if set cal_stress is not setted or abnormal ending). 
- """ - natoms_log = 0 - for line in loglines: - if line[13:41] == "number of atom for this type": - natoms_log += int(line.split()[-1]) - - assert natoms_log > 0 and natoms_log == natoms, ( - f"ERROR: detected atom number in log file is {natoms_log}, while the atom number in STRU file is {natoms}" - ) - - energy = [] - cells = [] - coords = [] - coord_direct = [] # if the coordinate is direct type or not - - for i in range(len(loglines)): - line = loglines[i] - if line[18:41] == "lattice constant (Bohr)": - a0 = float(line.split()[-1]) - elif len(loglines[i].split()) >= 2 and loglines[i].split()[1] == "COORDINATES": - # read coordinate information - coords.append([]) - direct_coord = False - if loglines[i].split()[0] == "DIRECT": - coord_direct.append(True) - for k in range(2, 2 + natoms): - coords[-1].append( - list(map(lambda x: float(x), loglines[i + k].split()[1:4])) - ) - elif loglines[i].split()[0] == "CARTESIAN": - coord_direct.append(False) - for k in range(2, 2 + natoms): - coords[-1].append( - list( - map( - lambda x: float(x) * a0 * bohr2ang, - loglines[i + k].split()[1:4], - ) - ) - ) - else: - assert False, "Unrecongnized coordinate type, %s, line:%d" % ( # noqa: UP031 - loglines[i].split()[0], - i, - ) - - elif ( - loglines[i][1:56] - == "Lattice vectors: (Cartesian coordinate: in unit of a_0)" - ): - # add the cell information for previous structures - while len(cells) < len(coords) - 1: - cells.append(cells[-1]) - # get current cell information - cells.append([]) - for k in range(1, 4): - cells[-1].append( - list( - map( - lambda x: float(x) * a0 * bohr2ang, - loglines[i + k].split()[0:3], - ) - ) - ) - - elif line[1:14] == "final etot is" or "#TOTAL ENERGY#" in line: - # add the energy for previous structures whose SCF is not converged - while len(energy) < len(coords) - 1: - energy.append(np.nan) - # get the energy of current structure - energy.append(float(line.split()[-2])) - - # in some relax method (like: bfgs_trad), the coordinate is not 
outputed in running_relax.log - # but if out_stru is true, then STRU_ION*_D will be outputed in OUT.ABACUS - # we should read cell and coord from STRU_ION*_D files - if len(energy) > 1 and len(coords) == 1: - # the energies of all structrues are collected, but coords have only the first structure - if ( - stru_files is not None and len(stru_files) > 1 - ): # if stru_files are not only STRU_ION_D - stru_file_name = [os.path.basename(i) for i in stru_files] - coords = coords[:1] + [np.nan for i in range(len(energy) - 1)] - coord_direct = coord_direct[:1] + [False for i in range(len(energy) - 1)] - cells = cells[:1] + [np.nan for i in range(len(energy) - 1)] - for iframe in range(1, len(energy)): - if f"STRU_ION{iframe}_D" in stru_file_name: - # read the structure from STRU_ION*_D - stru_data = get_frame_from_stru( - stru_files[stru_file_name.index(f"STRU_ION{iframe}_D")] - ) - coords[iframe] = stru_data["coords"][0] - cells[iframe] = stru_data["cells"][0] - - force = collect_force(loglines) - stress = collect_stress(loglines) - - # delete last structures which has no energy - while len(energy) < len(coords): - del coords[-1] - del coord_direct[-1] - - # add cells for last structures whose cell is not changed - while len(cells) < len(coords): - cells.append(cells[-1]) - - # only keep structures that have all of coord, force and stress - if len(stress) == 0 and len(force) == 0: - minl = len(coords) - elif len(stress) == 0: - minl = min(len(coords), len(force)) - force = force[:minl] - elif len(force) == 0: - minl = min(len(coords), len(stress)) - stress = stress[:minl] - else: - minl = min(len(coords), len(force), len(stress)) - force = force[:minl] - stress = stress[:minl] - - coords = coords[:minl] - energy = energy[:minl] - cells = cells[:minl] - - # delete structures whose energy is np.nan - for i in range(minl): - if ( - np.isnan(energy[i - minl]) - or np.any(np.isnan(coords[i - minl])) - or np.any(np.isnan(cells[i - minl])) - ): - del energy[i - minl] - del 
coords[i - minl] - del cells[i - minl] - del coord_direct[i - minl] - if len(force) > 0: - del force[i - minl] - if len(stress) > 0: - del stress[i - minl] - - energy = np.array(energy) - cells = np.array(cells) - coords = np.array(coords) - stress = np.array(stress) - force = np.array(force) - - # transfer direct coordinate to cartessian type - for i in range(len(coords)): - if coord_direct[i]: - coords[i] = coords[i].dot(cells[i]) - - if len(stress) > 0: - virial = np.zeros([len(cells), 3, 3]) - for i in range(len(cells)): - volume = np.linalg.det(cells[i, :, :].reshape([3, 3])) - virial[i] = stress[i] * kbar2evperang3 * volume - else: - virial = None - - return energy, cells, coords, force, stress, virial - - -def get_frame(fname): - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU - - data = get_frame_from_stru(geometry_path_in) - natoms = sum(data["atom_numbs"]) - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - - logf = get_log_file(fname, inlines) - assert os.path.isfile(logf), f"Error: can not find {logf}" - with open_file(logf) as f1: - lines = f1.readlines() - - relax_stru_files = get_relax_stru_files(os.path.dirname(logf)) - - energy, cells, coords, force, stress, virial = get_coords_from_log( - lines, natoms, stru_files=relax_stru_files - ) - - magmom, magforce = get_mag_force(lines) - - data["cells"] = cells - data["coords"] = coords - data["energies"] = energy - data["forces"] = force - if isinstance(virial, np.ndarray): - data["virials"] = virial - data["stress"] = stress - data["orig"] = np.zeros(3) - - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce 
- if "move" in data: - data["move"] = [data["move"][0] for i in range(len(data["energies"]))] - - return data +from dpdata.formats.abacus.relax import * # noqa: F403 diff --git a/dpdata/abacus/scf.py b/dpdata/abacus/scf.py index 167d3067f..2a2479998 100644 --- a/dpdata/abacus/scf.py +++ b/dpdata/abacus/scf.py @@ -1,255 +1,3 @@ from __future__ import annotations -import os -import re -import warnings - -import numpy as np - -from dpdata.utils import open_file - -from ..unit import LengthConversion, PressureConversion -from .stru import get_frame_from_stru - -bohr2ang = LengthConversion("bohr", "angstrom").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() - - -def CheckFile(ifile): - if not os.path.isfile(ifile): - print(f"Can not find file {ifile}") - return False - return True - - -def get_geometry_in(fname, inlines): - geometry_path_in = os.path.join(fname, "STRU") - for line in inlines: - if "stru_file" in line and "stru_file" == line.split()[0]: - atom_file = line.split()[1] - geometry_path_in = os.path.join(fname, atom_file) - break - return geometry_path_in - - -def get_path_out(fname, inlines): - path_out = os.path.join(fname, "OUT.ABACUS/running_scf.log") - for line in inlines: - if "suffix" in line and "suffix" == line.split()[0]: - suffix = line.split()[1] - path_out = os.path.join(fname, f"OUT.{suffix}/running_scf.log") - break - return path_out - - -def get_energy(outlines): - Etot = None - for line in reversed(outlines): - if "final etot is" in line: # for LTS - Etot = float(line.split()[-2]) # in eV - return Etot, True - elif "TOTAL ENERGY" in line: # for develop - Etot = float(line.split()[-2]) # in eV - return Etot, True - elif "convergence has NOT been achieved!" 
in line: - return Etot, False - elif "convergence has not been achieved" in line: - return Etot, False - - return Etot, False - - -def collect_force(outlines): - force = [] - for i, line in enumerate(outlines): - # if "TOTAL-FORCE (eV/Angstrom)" in line: - if "TOTAL-FORCE" in line: - value_pattern = re.compile( - r"^\s*[A-Z][a-z]?[1-9][0-9]*\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" - ) - j = i - # find the first line of force - noforce = False - while not value_pattern.match(outlines[j]): - j += 1 - if ( - j >= i + 10 - ): # if can not find the first line of force in 10 lines, then stop - warnings.warn("Warning: can not find the first line of force") - noforce = True - break - if noforce: - break - - force.append([]) - while value_pattern.match(outlines[j]): - force[-1].append([float(ii) for ii in outlines[j].split()[1:4]]) - j += 1 - return force # only return the last force - - -def get_force(outlines, natoms): - force = collect_force(outlines) - if len(force) == 0: - return None - else: - return np.array(force[-1]) # only return the last force - - -def collect_stress(outlines): - stress = [] - for i, line in enumerate(outlines): - # if "TOTAL-STRESS (KBAR)" in line: - if "TOTAL-STRESS" in line: - value_pattern = re.compile( - r"^\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" - ) - j = i - nostress = False - while not value_pattern.match(outlines[j]): - j += 1 - if ( - j >= i + 10 - ): # if can not find the first line of stress in 10 lines, then stop - warnings.warn("Warning: can not find the first line of stress") - nostress = True - break - if nostress: - break - - stress.append([]) - while value_pattern.match(outlines[j]): - stress[-1].append( - list(map(lambda x: float(x), outlines[j].split()[0:3])) - ) - j += 1 - return stress - - -def get_stress(outlines): - stress = collect_stress(outlines) 
- if len(stress) == 0: - return None - else: - return np.array(stress[-1]) * kbar2evperang3 # only return the last stress - - -def get_mag_force(outlines): - """Read atomic magmom and magnetic force from OUT.ABACUS/running_scf.log. - - Returns - ------- - magmom: list of list of atomic magnetic moments (three dimensions: ION_STEP * NATOMS * 1/3) - magforce: list of list of atomic magnetic forces (three dimensions: ION_STEP * NATOMS * 1/3) - e.g.: - ------------------------------------------------------------------------------------------- - Total Magnetism (uB) - ------------------------------------------------------------------------------------------- - Fe 0.0000000001 0.0000000000 3.0000000307 - Fe -0.0000000000 -0.0000000000 3.0000001151 - ------------------------------------------------------------------------------------------- - ------------------------------------------------------------------------------------------- - Magnetic force (eV/uB) - ------------------------------------------------------------------------------------------- - Fe 0.0000000000 0.0000000000 -1.2117698671 - Fe 0.0000000000 0.0000000000 -1.2117928796 - ------------------------------------------------------------------------------------------- - - """ - mags = [] - magforces = [] - for i, line in enumerate(outlines): - if "Total Magnetism (uB)" in line: - j = i + 2 - mag = [] - while "-------------------------" not in outlines[j]: - imag = [float(ii) for ii in outlines[j].split()[1:]] - if len(imag) == 1: - imag = [0, 0, imag[0]] - mag.append(imag) - j += 1 - mags.append(mag) - if "Magnetic force (eV/uB)" in line: - j = i + 2 - magforce = [] - while "-------------------------" not in outlines[j]: - imagforce = [float(ii) for ii in outlines[j].split()[1:]] - if len(imagforce) == 1: - imagforce = [0, 0, imagforce[0]] - magforce.append(imagforce) - j += 1 - magforces.append(magforce) - return np.array(mags), np.array(magforces) - - -def get_frame(fname): - data = { - "atom_names": [], - 
"atom_numbs": [], - "atom_types": [], - "cells": np.array([]), - "coords": np.array([]), - "energies": np.array([]), - "forces": np.array([]), - } - - if isinstance(fname, str): - # if the input parameter is only one string, it is assumed that it is the - # base directory containing INPUT file; - path_in = os.path.join(fname, "INPUT") - else: - raise RuntimeError("invalid input") - - if not CheckFile(path_in): - return data - - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - - geometry_path_in = get_geometry_in(fname, inlines) - - # get OUT.ABACUS/running_scf.log - path_out = get_path_out(fname, inlines) - if not (CheckFile(geometry_path_in) and CheckFile(path_out)): - return data - with open_file(path_out) as fp: - outlines = fp.read().split("\n") - - # get energy - energy, converge = get_energy(outlines) - if not converge: - return data - - # read STRU file - data = get_frame_from_stru(geometry_path_in) - natoms = sum(data["atom_numbs"]) - # should remove spins from STRU file - if "spins" in data: - data.pop("spins") - move = data.pop("move", None) - - # get magmom and magforce, force and stress - magmom, magforce = get_mag_force(outlines) - if len(magmom) > 0: - magmom = magmom[-1:] - if len(magforce) > 0: - magforce = magforce[-1:] - - force = get_force(outlines, natoms) - stress = get_stress(outlines) - - data["energies"] = np.array(energy)[np.newaxis] - data["forces"] = np.empty((0,)) if force is None else force[np.newaxis, :, :] - data["orig"] = np.zeros(3) - if stress is not None: - cell = data["cells"][0] - stress *= np.abs(np.linalg.det(cell)) - data["virials"] = stress[np.newaxis, :, :] - - if len(magmom) > 0: - data["spins"] = magmom - if len(magforce) > 0: - data["force_mags"] = magforce - if move is not None: - data["move"] = move - return data +from dpdata.formats.abacus.scf import * # noqa: F403 diff --git a/dpdata/abacus/stru.py b/dpdata/abacus/stru.py index 50ec2cb72..3ea68177d 100644 --- a/dpdata/abacus/stru.py +++ 
b/dpdata/abacus/stru.py @@ -1,820 +1,3 @@ from __future__ import annotations -import os -import re -import warnings - -import numpy as np - -from ..unit import LengthConversion - -bohr2ang = LengthConversion("bohr", "angstrom").value() - - -def split_stru_block(lines): - """Split the ABACUS STRU file into blocks by keyword. - - Args: - lines (list): list of lines in the ABACUS STRU file. - - Returns - ------- - dict: dictionary of blocks. - """ - - def clean_comment(line): - return re.split("[#]", line)[0] - - ABACUS_STRU_KEYS = [ - "ATOMIC_SPECIES", - "NUMERICAL_ORBITAL", - "LATTICE_CONSTANT", - "LATTICE_VECTORS", - "ATOMIC_POSITIONS", - "NUMERICAL_DESCRIPTOR", - "PAW_FILES", - ] - blocks = {i: [] for i in ABACUS_STRU_KEYS} - i = 0 - while i < len(lines): - line = clean_comment(lines[i]).strip() - if line in ABACUS_STRU_KEYS: - key = line - for j in range(i + 1, len(lines)): - if clean_comment(lines[j]).strip() == "": - continue - elif clean_comment(lines[j]).strip() in ABACUS_STRU_KEYS: - break - else: - blocks[key].append(clean_comment(lines[j])) - i = j - else: - i += 1 - - return blocks - - -def parse_atomic_species_block(lines): - """Parse the ATOMIC_SPECIES block. - - Args: - lines (list): list of lines in the ATOMIC_SPECIES block. - - Returns - ------- - tuple: tuple of atom_names, masses, and pp_files. - - """ - atom_names, masses, pp_files = [], [], [] - for line in lines: - line = line.split() - atom_names.append(line[0]) - masses.append(float(line[1])) - - # for standard STRU, the pseudo potential file is required, - # but it is not required for dpdata. - if len(line) > 2: - pp_files.append(line[2]) - else: - pp_files.append(None) - - return atom_names, masses, pp_files - - -def parse_numerical_orbital_block(lines): - """Parse the NUMERICAL_ORBITAL block. - - Args: - lines (list): list of lines in the NUMERICAL_ORBITAL block. - - Returns - ------- - list: list of orbital files. 
- """ - return [line.strip() for line in lines] - - -def parse_lattice_constant_block(lines): - """Parse the LATTICE_CONSTANT block. - - Args: - lines (list): list of lines in the LATTICE_CONSTANT block. - - Returns - ------- - float: the lattice constant. - """ - return float(lines[0]) - - -def parse_lattice_vectors_block(lines): - """Parse the LATTICE_VECTORS block. - - Args: - lines (list): list of lines in the LATTICE_VECTORS block. - - Returns - ------- - np.ndarray: the cell vectors. - """ - cell = np.zeros((3, 3)) - for i, line in enumerate(lines): - cell[i] = [float(x) for x in line.split()] - return cell - - -def parse_pos_oneline(pos_line): - """Parses a line from the atom position block in a structure file. - - The content in atom position block can include: - - `m` or NO key word: Three numbers (0 or 1) controlling atom movement in geometry relaxation calculations. - - `v`, `vel`, or `velocity`: Three components of initial velocity of atoms in geometry relaxation calculations. - - `mag` or `magmom`: Start magnetization for each atom. Can be one number (colinear) or three numbers (non-colinear). - - `angle1`: In non-colinear case, angle between c-axis and real spin (in degrees). - - `angle2`: In non-colinear case, angle between a-axis and real spin projection in ab-plane (in degrees). - - `cs` or `constrain`: Three numbers (0 or 1) controlling the spin constraint of the atom. - - `lambda`: Three numbers controlling the lambda of the atom. - - Parameters - ---------- - pos_line : A line from the atom position block. - - Returns - ------- - tuple: A tuple containing: - - pos (list of float): The position coordinates. - - move (list of int or None): Movement control values. - - velocity (list of float or None): Initial velocity components. - - magmom (float, list of float, or None): Magnetization values. - - angle1 (float or None): Angle1 value. - - angle2 (float or None): Angle2 value. - - constrain (list of bool or None): Spin constraint values. 
- - lambda1 (float, list of float, or None): Lambda values. - - e.g.: - ``` - Fe - 1.0 - 2 - 0.0 0.0 0.0 m 0 0 0 mag 1.0 angle1 90 angle2 0 cs 0 0 0 - 0.5 0.5 0.5 m 1 1 1 mag 1.0 angle1 90 angle2 180 - ``` - """ - pos_line = pos_line.split("#")[0] # remove comments - sline = pos_line.split() - pos = [float(i) for i in sline[:3]] - move = None - velocity = None - magmom = None - angle1 = None - angle2 = None - constrain = None - lambda1 = None - if len(sline) > 3: - mag_list = None - velocity_list = None - move_list = [] - angle1_list = None - angle2_list = None - constrain_list = None - lambda_list = None - label = "move" - for i in range(3, len(sline)): - # firstly read the label - if sline[i] == "m": - label = "move" - elif sline[i] in ["v", "vel", "velocity"]: - label = "velocity" - velocity_list = [] - elif sline[i] in ["mag", "magmom"]: - label = "magmom" - mag_list = [] - elif sline[i] == "angle1": - label = "angle1" - angle1_list = [] - elif sline[i] == "angle2": - label = "angle2" - angle2_list = [] - elif sline[i] in ["constrain", "sc"]: - label = "constrain" - constrain_list = [] - elif sline[i] in ["lambda"]: - label = "lambda" - lambda_list = [] - - # the read the value to the list - elif label == "move": - move_list.append(int(sline[i])) - elif label == "velocity": - velocity_list.append(float(sline[i])) - elif label == "magmom": - mag_list.append(float(sline[i])) - elif label == "angle1": - angle1_list.append(float(sline[i])) - elif label == "angle2": - angle2_list.append(float(sline[i])) - elif label == "constrain": - constrain_list.append(bool(int(sline[i]))) - elif label == "lambda": - lambda_list.append(float(sline[i])) - - if move_list is not None and len(move_list) > 0: - if len(move_list) == 3: - move = move_list - else: - raise RuntimeError(f"Invalid setting of move: {pos_line}") - - if velocity_list is not None: - if len(velocity_list) == 3: - velocity = velocity_list - else: - raise RuntimeError(f"Invalid setting of velocity: {pos_line}") - 
- if mag_list is not None: - if len(mag_list) == 3: - magmom = mag_list - elif len(mag_list) == 1: - magmom = mag_list[0] - else: - raise RuntimeError(f"Invalid magnetic moment {pos_line}") - - if angle1_list is not None: - if len(angle1_list) == 1: - angle1 = angle1_list[0] - else: - raise RuntimeError(f"Invalid angle1 {pos_line}") - - if angle2_list is not None: - if len(angle2_list) == 1: - angle2 = angle2_list[0] - else: - raise RuntimeError(f"Invalid angle2 {pos_line}") - - if constrain_list is not None: - if len(constrain_list) == 3: - constrain = constrain_list - elif len(constrain_list) == 1: - constrain = constrain_list[0] - else: - raise RuntimeError(f"Invalid constrain {pos_line}") - - if lambda_list is not None: - if len(lambda_list) == 3: - lambda1 = lambda_list - elif len(lambda_list) == 1: - lambda1 = lambda_list[0] - else: - raise RuntimeError(f"Invalid lambda {pos_line}") - - return pos, move, velocity, magmom, angle1, angle2, constrain, lambda1 - - -def get_atom_mag_cartesian(atommag, angle1, angle2): - """Transform atommag, angle1, angle2 to magmom in cartesian coordinates. - - Parameters - ---------- - atommag : float/list of float/None - Atom magnetic moment. - angle1 : float/None - value of angle1. - angle2 : float/None - value of angle2. - ABACUS support defining mag, angle1, angle2 at the same time. - angle1 is the angle between z-axis and real spin (in degrees). - angle2 is the angle between x-axis and real spin projection in xy-plane (in degrees). - If only mag is defined, then transfer it to magmom directly. - And if mag, angle1, angle2 are defined, then mag is only the norm of magmom, and the direction is defined by angle1 and angle2. 
- """ - if atommag is None: - return None - if not (isinstance(atommag, list) or isinstance(atommag, float)): - raise RuntimeError(f"Invalid atommag: {atommag}") - - if angle1 is None and angle2 is None: - if isinstance(atommag, list): - return atommag - else: - return [0, 0, atommag] - else: - a1 = 0 - a2 = 0 - if angle1 is not None: - a1 = angle1 - if angle2 is not None: - a2 = angle2 - if isinstance(atommag, list): - mag_norm = np.linalg.norm(atommag) - else: - mag_norm = atommag - return [ - mag_norm * np.sin(np.radians(a1)) * np.cos(np.radians(a2)), - mag_norm * np.sin(np.radians(a1)) * np.sin(np.radians(a2)), - mag_norm * np.cos(np.radians(a1)), - ] - - -def get_cartesian_coords(coords, coord_type, celldm, cell): - """Transform the atomic coordinates to cartesian coordinates. - - Args: - coords (np.ndarray): atomic coordinates read from the STRU file. - coord_type (str): the coordination type, either "cartesian" or "direct". - celldm (float): the lattice constant. - cell (np.ndarray): the cell vectors in angstrom. - - Returns - ------- - np.ndarray: the cartesian coordinates in angstrom. - """ - if coord_type == "cartesian": - return coords * celldm * bohr2ang - elif coord_type == "direct": - return np.matmul(coords, cell) - else: - raise RuntimeError(f"Invalid coordination type: {coord_type}") - - -def parse_pos(coords_lines, atom_names, celldm, cell): - """Read the atomic positions block in the ABACUS STRU file. - - Args: - coords_lines (list): list of lines in the atomic positions block. - atom_names (list): list of atom names. - celldm (float): the lattice constant. - cell (np.ndarray): the cell vectors in angstrom, and has multipy celldm. - - Returns - ------- - tuple: tuple of atom_numbs, coords, move, mags, velocity, sc, lambda_ - Note: for atomic magnetic moment, we finnaly transform it to non-collinear magnetic moment in cartesian coordinates, - and do not return the angle1 and angle2, and the magnetic moment of each atom type. 
- - """ - coord_type = coords_lines[0].split()[0].lower() # cartisan or direct - atom_numbs = [] # the number of each atom type - coords = [] # coordinations of atoms - move = [] # move flag of each atom - velocity = [] # velocity of each atom - mags = [] # magnetic moment of each atom - sc = [] # spin constraint flag of each atom - lambda_ = [] # lambda of each atom - - ntype = len(atom_names) - line_idx = 1 # starting line of first element - define_atom_mag = False - for it in range(ntype): - atom_name = coords_lines[line_idx].split()[0] - if atom_name != atom_names[it]: - raise RuntimeError( - f"Read atom name '{atom_name}' is not equal to the expected atom name '{atom_names[it]}'" - ) - atom_type_mag = float(coords_lines[line_idx + 1].split()[0]) - line_idx += 2 - atom_numbs.append(int(coords_lines[line_idx].split()[0])) - line_idx += 1 - for iline in range(atom_numbs[it]): - pos, imove, ivelocity, imagmom, iangle1, iangle2, iconstrain, ilambda1 = ( - parse_pos_oneline(coords_lines[line_idx]) - ) - - coords.append(get_cartesian_coords(np.array(pos), coord_type, celldm, cell)) - - move.append(imove) - velocity.append(ivelocity) - sc.append(iconstrain) - lambda_.append(ilambda1) - - # calculate the magnetic moment in cartesian coordinates - mag = get_atom_mag_cartesian(imagmom, iangle1, iangle2) - if mag is None: - mag = [0, 0, atom_type_mag] - mags.append(mag) - - if imagmom is not None: - define_atom_mag = True - - line_idx += 1 - coords = np.array(coords) # need transformation!!! - - if all([i is None for i in move]): - move = [] - else: - move = np.array(move, dtype=bool) - - if all([i is None for i in velocity]): - velocity = [] - else: - velocity = np.array(velocity) - - if all([i is None for i in sc]): - sc = [] - - if all([i is None for i in lambda_]): - lambda_ = [] - - # here return the magnetic moment only when the atom magnetic moment is specified. 
- if not define_atom_mag: - mags = [] - else: - mags = np.array(mags) - - return atom_numbs, coords, move, mags, velocity, sc, lambda_ - - -def right_hand_rule( - cell: np.ndarray, coord: np.ndarray -) -> tuple[np.ndarray, np.ndarray]: - """Rotate the cell and coord to make the cell fit the right-hand rule. - - Args: - cell (np.ndarray): the cell vectors. - coord (np.ndarray): the atomic coordinates in cartesian. - - Returns - ------- - tuple: the rotated cell and coord. - """ - if np.linalg.det(cell) < 0: - cell = -cell - coord = -coord - return cell, coord - - -def get_frame_from_stru(stru): - """Read the ABACUS STRU file and return the dpdata frame. - - The description of ABACUS STRU can be found in https://abacus.deepmodeling.com/en/latest/advanced/input_files/stru.html - - Args: - stru (str): path to the ABACUS STRU file. - - Returns - ------- - data: the parsed stru information in dictionary. - { - "atom_names": list of atom names, - "atom_numbs": list of atom numbers, - "atom_types": list of atom types, - "masses": list of atomic masses, - "pp_files", list of pseudo potential files, - "orb_files", list of orbital files, - "dpks_descriptor": the deepks descriptor file, - - # below are the information in each frame - - "cells": list of cell vectors, - "coords": list of atomic coordinates, - "spins": list of magnetic moments, # return only when set "mag xxx" for each atom in STRU file - "moves": list of move flags, - } - For some keys, if the information is not provided in the STRU file, then it will not be included in the dictionary. - "spins" is designed for delta spin calculation, and when dpdata.System is write to lmp format, the spin will be written as magmom. - But we should note that this file format is valid only for a spin lammps job, not for a normal job. - If you want to use dpgen to run the non-spin job, then you should not define "mag x x x" in the STRU file. 
- """ - if not os.path.isfile(stru): - raise FileNotFoundError(f"ABACUS STRU file {stru} not found!!!") - - # 1. read the file and split the lines to blocks - with open(stru) as f: - lines = f.readlines() - blocks = split_stru_block(lines) - - # 2. parse the blocks - atom_names, masses, pp_files = parse_atomic_species_block(blocks["ATOMIC_SPECIES"]) - orb_files = parse_numerical_orbital_block(blocks.get("NUMERICAL_ORBITAL", [])) - dpks_descriptor = blocks.get("NUMERICAL_DESCRIPTOR", []) - celldm = parse_lattice_constant_block(blocks["LATTICE_CONSTANT"]) - cell = parse_lattice_vectors_block(blocks["LATTICE_VECTORS"]) - cell = np.array(cell) * celldm * bohr2ang - atom_numbs, coords, move, mags, velocity, sc, lambda_ = parse_pos( - blocks["ATOMIC_POSITIONS"], atom_names, celldm, cell - ) - - cell, coords = right_hand_rule(cell, coords) - data = { - "atom_names": atom_names, - "atom_numbs": atom_numbs, - "atom_types": np.array( - [i for i in range(len(atom_numbs)) for j in range(atom_numbs[i])] - ), - "masses": np.array(masses), - "pp_files": pp_files, - "cells": np.array([cell]), - "coords": np.array([coords]), - } - if len(mags) > 0: - data["spins"] = np.array([mags]) - if len(orb_files) > 0: - data["orb_files"] = orb_files - if len(dpks_descriptor) > 0: - data["dpks_descriptor"] = dpks_descriptor[0].strip() - if len(move) > 0: - data["move"] = np.array([move]) - - return data - - -def make_unlabeled_stru( - data, - frame_idx, - pp_file=None, - numerical_orbital=None, - numerical_descriptor=None, - mass=None, - move=None, - velocity=None, - mag=None, - angle1=None, - angle2=None, - sc=None, - lambda_=None, - link_file=False, - dest_dir=None, - **kwargs, -): - """Make an unlabeled STRU file from a dictionary. 
- - Parameters - ---------- - data : dict - System data - frame_idx : int - The index of the frame to dump - pp_file : list of string or dict - List of pseudo potential files, or a dictionary of pseudo potential files for each atomnames - numerical_orbital : list of string or dict, optional - List of orbital files, or a dictionary of orbital files for each atomnames - numerical_descriptor : str, optional - numerical descriptor file - mass : list of float, optional - List of atomic masses - move : list of (list of list of bool), optional - List of the move flag of each xyz direction of each atom for each frame - velocity : list of list of float, optional - List of the velocity of each xyz direction of each atom - mag : list of (list of float or float), optional - List of the magnetic moment of each atom, can be a list of three floats or one float - For noncollinear, three floats are the xyz component of the magnetic moment. - For collinear, one float is the norm of the magnetic moment. - angle1 : list of float, optional - List of the angle1 of each atom. For noncollinear calculation, it is the angle between the magnetic moment and the z-axis. - angle2 : list of float, optional - List of the angle2 of each atom. For noncollinear calculation, it is the angle between the projection of magnetic moment on xy plane and the x-axis. - sc : list of (bool or list of 3 bool), optional - List of the spin constraint flag of each atom. Each element can be a bool or a list of three bools or None. - lambda_ : list of (float or list of 3 float), optional - List of the lambda of each atom. Each element can be a float or a list of three floats. - link_file : bool, optional - Whether to link the pseudo potential files and orbital files in the STRU file. - If True, then only filename will be written in the STRU file, and make a soft link to the real file. - dest_dir : str, optional - The destination directory to make the soft link of the pseudo potential files and orbital files. 
- For velocity, mag, angle1, angle2, sc, and lambda_, if the value is None, then the corresponding information will not be written. - ABACUS support defining "mag" and "angle1"/"angle2" at the same time, and in this case, the "mag" only define the norm of the magnetic moment, and "angle1" and "angle2" define the direction of the magnetic moment. - If data has spins, then it will be written as mag to STRU file; while if mag is passed at the same time, then mag will be used. - """ - - def _link_file(dest_dir, src_file): - if not os.path.isfile(src_file): - print(f"ERROR: link_file: {src_file} is not a file.") - return False - src_file = os.path.abspath(src_file) - if not os.path.isdir(dest_dir): - os.makedirs(dest_dir) - dest_file = os.path.join(dest_dir, os.path.basename(src_file)) - if os.path.isfile(dest_file): - if os.path.samefile(src_file, dest_file): - return True - else: - os.remove(dest_file) - os.symlink(src_file, dest_file) - return True - - def ndarray2list(i): - if isinstance(i, np.ndarray): - return i.tolist() - else: - return i - - def process_file_input(file_input, atom_names, input_name): - # For pp_file and numerical_orbital, process the file input, and return a list of file names - # file_input can be a list of file names, or a dictionary of file names for each atom names - if isinstance(file_input, (list, tuple)): - if len(file_input) != len(atom_names): - raise ValueError( - f"{input_name} length is not equal to the number of atom types" - ) - return file_input - elif isinstance(file_input, dict): - for element in atom_names: - if element not in file_input: - raise KeyError(f"{input_name} does not contain {element}") - return [file_input[element] for element in atom_names] - else: - raise ValueError(f"Invalid {input_name}: {file_input}") - - if link_file and dest_dir is None: - print( - "WARNING: make_unlabeled_stru: link_file is True, but dest_dir is None. Will write the filename to STRU but not making soft link." 
- ) - if dest_dir is not None and dest_dir.strip() == "": - dest_dir = "." - - # check the input data - if mass is None and data.get("masses") is not None and len(data["masses"]) > 0: - mass = data["masses"] - - if ( - pp_file is None - and data.get("pp_files") is not None - and len(data["pp_files"]) > 0 - ): - pp_file = data["pp_files"] - - if ( - numerical_orbital is None - and data.get("orb_files") is not None - and len(data["orb_files"]) > 0 - ): - numerical_orbital = data["orb_files"] - - if numerical_descriptor is None and data.get("dpks_descriptor") is not None: - numerical_descriptor = data["dpks_descriptor"] - - if mag is None and data.get("spins") is not None and len(data["spins"]) > 0: - mag = data["spins"][frame_idx] - - if move is None and data.get("move", None) is not None and len(data["move"]) > 0: - move = data["move"][frame_idx] - - # check the length of the input data - atom_numbs = sum(data["atom_numbs"]) - for key in [move, velocity, mag, angle1, angle2, sc, lambda_]: - if key is not None: - if ( - not isinstance(ndarray2list(key), (list, tuple)) - and len(key) != atom_numbs - ): - key_name = [name for name, value in locals().items() if value is key][0] - print( - f"ERROR: make_unlabeled_stru: the length of '{key_name}' ({len(key)}) should be equal to the number of atom number ({atom_numbs})." - ) - return "" - - # ATOMIC_SPECIES block - out = "ATOMIC_SPECIES\n" - if pp_file is not None: - ppfiles = process_file_input( - ndarray2list(pp_file), data["atom_names"], "pp_file" - ) - else: - warnings.warn( - "pp_file is not provided, will use empty string for pseudo potential file." 
- ) - ppfiles = [""] * len(data["atom_names"]) - - for iele in range(len(data["atom_names"])): - if data["atom_numbs"][iele] == 0: - continue - out += data["atom_names"][iele] + " " - if mass is not None: - out += f"{mass[iele]:.3f} " - else: - out += "1 " - - ipp_file = ppfiles[iele] - if ipp_file != "": - if not link_file: - out += ipp_file - else: - out += os.path.basename(ipp_file.rstrip("/")) - if dest_dir is not None: - _link_file(dest_dir, ipp_file) - out += "\n" - out += "\n" - - # NUMERICAL_ORBITAL block - if numerical_orbital is not None: - numerical_orbital = ndarray2list(numerical_orbital) - orbfiles = process_file_input( - numerical_orbital, data["atom_names"], "numerical_orbital" - ) - orbfiles = [ - orbfiles[i] - for i in range(len(data["atom_names"])) - if data["atom_numbs"][i] != 0 - ] - out += "NUMERICAL_ORBITAL\n" - for iorb in orbfiles: - if not link_file: - out += iorb - else: - out += os.path.basename(iorb.rstrip("/")) - if dest_dir is not None: - _link_file(dest_dir, iorb) - out += "\n" - out += "\n" - - # deepks block - if numerical_descriptor is not None: - assert isinstance(numerical_descriptor, str) - if not link_file: - out += f"NUMERICAL_DESCRIPTOR\n{numerical_descriptor}\n" - else: - out += f"NUMERICAL_DESCRIPTOR\n{os.path.basename(numerical_descriptor)}\n" - if dest_dir is not None: - _link_file(dest_dir, numerical_descriptor) - out += "\n" - - # LATTICE_CONSTANT and LATTICE_VECTORS block - out += "LATTICE_CONSTANT\n" - out += str(1 / bohr2ang) + "\n\n" - - out += "LATTICE_VECTORS\n" - for ix in range(3): - for iy in range(3): - out += str(data["cells"][frame_idx][ix][iy]) + " " - out += "\n" - out += "\n" - - # ATOMIC_POSITIONS block - out += "ATOMIC_POSITIONS\n" - out += "Cartesian # Cartesian(Unit is LATTICE_CONSTANT)\n" - # ret += "\n" - natom_tot = 0 # in for loop, it is also the atom index - for iele in range(len(data["atom_names"])): - if data["atom_numbs"][iele] == 0: - continue - out += data["atom_names"][iele] + "\n" - out 
+= "0.0\n" - out += str(data["atom_numbs"][iele]) + "\n" - for iatom in range(data["atom_numbs"][iele]): - iatomtype = np.nonzero(data["atom_types"] == iele)[0][ - iatom - ] # it is the atom index - iout = f"{data['coords'][frame_idx][iatomtype, 0]:.12f} {data['coords'][frame_idx][iatomtype, 1]:.12f} {data['coords'][frame_idx][iatomtype, 2]:.12f}" - # add flags for move, velocity, mag, angle1, angle2, and sc - if move is not None: - if ( - isinstance(ndarray2list(move[iatomtype]), (list, tuple)) - and len(move[iatomtype]) == 3 - ): - iout += " " + " ".join( - ["1" if ii else "0" for ii in move[iatomtype]] - ) - elif isinstance(ndarray2list(move[iatomtype]), (int, float, bool)): - iout += " 1 1 1" if move[iatomtype] else " 0 0 0" - else: - iout += " 1 1 1" - - if ( - velocity is not None - and isinstance(ndarray2list(velocity[iatomtype]), (list, tuple)) - and len(velocity[iatomtype]) == 3 - ): - iout += " v " + " ".join([f"{ii:.12f}" for ii in velocity[iatomtype]]) - - if mag is not None: - if isinstance(ndarray2list(mag[iatomtype]), (list, tuple)) and len( - mag[iatomtype] - ) in [1, 3]: - iout += " mag " + " ".join([f"{ii:.12f}" for ii in mag[iatomtype]]) - elif isinstance(ndarray2list(mag[iatomtype]), (int, float)): - iout += " mag " + f"{mag[iatomtype]:.12f}" - - if angle1 is not None and isinstance( - ndarray2list(angle1[iatomtype]), (int, float) - ): - iout += " angle1 " + f"{angle1[iatomtype]:.12f}" - - if angle2 is not None and isinstance( - ndarray2list(angle2[iatomtype]), (int, float) - ): - iout += " angle2 " + f"{angle2[iatomtype]:.12f}" - - if sc is not None: - if isinstance(ndarray2list(sc[iatomtype]), (list, tuple)) and len( - sc[iatomtype] - ) in [1, 3]: - iout += " sc " + " ".join( - ["1" if ii else "0" for ii in sc[iatomtype]] - ) - elif isinstance(ndarray2list(sc[iatomtype]), (int, float, bool)): - iout += " sc " + "1" if sc[iatomtype] else "0" - - if lambda_ is not None: - if isinstance(ndarray2list(lambda_[iatomtype]), (list, tuple)) and len( - 
lambda_[iatomtype] - ) in [1, 3]: - iout += " lambda " + " ".join( - [f"{ii:.12f}" for ii in lambda_[iatomtype]] - ) - elif isinstance(ndarray2list(lambda_[iatomtype]), (int, float)): - iout += " lambda " + f"{lambda_[iatomtype]:.12f}" - - out += iout + "\n" - natom_tot += 1 - assert natom_tot == sum(data["atom_numbs"]) - return out +from dpdata.formats.abacus.stru import * # noqa: F403 diff --git a/dpdata/amber/__init__.py b/dpdata/amber/__init__.py index e69de29bb..6d20c397a 100644 --- a/dpdata/amber/__init__.py +++ b/dpdata/amber/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.amber import * # noqa: F403 diff --git a/dpdata/amber/mask.py b/dpdata/amber/mask.py index 155e2a7be..177b34788 100644 --- a/dpdata/amber/mask.py +++ b/dpdata/amber/mask.py @@ -1,42 +1,3 @@ -"""Amber mask.""" - from __future__ import annotations -try: - import parmed -except ImportError: - pass - - -def pick_by_amber_mask(param, maskstr, coords=None): - """Pick atoms by amber masks. 
- - Parameters - ---------- - param : str or parmed.Structure - filename of Amber param file or parmed.Structure - maskstr : str - Amber masks - coords : np.ndarray (optional) - frame coordinates, shape: N*3 - """ - parm = load_param_file(param) - if coords is not None: - parm.initialize_topology(xyz=coords) - sele = [] - if len(maskstr) > 0: - newmaskstr = maskstr.replace("@0", "!@*") - sele = [ - parm.atoms[i].idx - for i in parmed.amber.mask.AmberMask(parm, newmaskstr).Selected() - ] - return sele - - -def load_param_file(param_file): - if isinstance(param_file, str): - return parmed.load_file(param_file) - elif isinstance(param_file, parmed.Structure): - return param_file - else: - raise RuntimeError("Unsupported structure") +from dpdata.formats.amber.mask import * # noqa: F403 diff --git a/dpdata/amber/md.py b/dpdata/amber/md.py index 06d9e2032..37a1186af 100644 --- a/dpdata/amber/md.py +++ b/dpdata/amber/md.py @@ -1,190 +1,3 @@ from __future__ import annotations -import os -import re - -import numpy as np - -from dpdata.amber.mask import pick_by_amber_mask -from dpdata.unit import EnergyConversion -from dpdata.utils import open_file - -from ..periodic_table import ELEMENTS - -kcalmol2eV = EnergyConversion("kcal_mol", "eV").value() -symbols = ["X"] + ELEMENTS - -energy_convert = kcalmol2eV -force_convert = energy_convert - - -def cell_lengths_angles_to_cell( - cell_lengths: np.ndarray, cell_angles: np.ndarray -) -> np.ndarray: - """Convert cell lengths and angles to cell vectors. - - Parameters - ---------- - cell_lengths - Cell lengths with shape ``(..., 3)`` where the last dimension is - ``a, b, c``. - cell_angles - Cell angles in degrees with shape ``(..., 3)`` where the last dimension - is ``alpha, beta, gamma``. - - Returns - ------- - np.ndarray - Cell vectors with shape ``(..., 3, 3)``. 
- """ - alpha = np.deg2rad(cell_angles[..., 0]) - beta = np.deg2rad(cell_angles[..., 1]) - gamma = np.deg2rad(cell_angles[..., 2]) - - a = cell_lengths[..., 0] - b = cell_lengths[..., 1] - c = cell_lengths[..., 2] - - if np.any(cell_lengths <= 0.0): - raise RuntimeError("Invalid AMBER cell lengths") - if np.any((cell_angles <= 0.0) | (cell_angles >= 180.0)): - raise RuntimeError("Invalid AMBER cell angles") - - cos_alpha = np.cos(alpha) - cos_beta = np.cos(beta) - cos_gamma = np.cos(gamma) - sin_gamma = np.sin(gamma) - ly = b * sin_gamma - if np.any(ly <= 1e-8): - raise RuntimeError("Invalid AMBER cell angles") - - z_factor = ( - 1 - - cos_alpha**2 - - cos_beta**2 - - cos_gamma**2 - + 2 * cos_alpha * cos_beta * cos_gamma - ) - lz2 = c**2 * z_factor / sin_gamma**2 - if np.any(lz2 <= 1e-8): - raise RuntimeError("Invalid AMBER cell angles") - - z = np.sqrt(z_factor) / sin_gamma - - shape = (*cell_lengths.shape[:-1], 3, 3) - cells = np.zeros(shape) - cells[..., 0, 0] = a - cells[..., 1, 0] = b * cos_gamma - cells[..., 1, 1] = b * sin_gamma - cells[..., 2, 0] = c * cos_beta - cells[..., 2, 1] = c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma - cells[..., 2, 2] = c * z - return cells - - -def read_amber_traj( - parm7_file, - nc_file, - mdfrc_file=None, - mden_file=None, - mdout_file=None, - use_element_symbols=None, - labeled=True, -): - """The amber trajectory includes: - * nc, NetCDF format, stores coordinates - * mdfrc, NetCDF format, stores forces - * mden (optional), text format, stores energies - * mdout (optional), text format, may store energies if there is no mden_file - * parm7, text format, stores types. - - Parameters - ---------- - parm7_file, nc_file, mdfrc_file, mden_file, mdout_file: - filenames - use_element_symbols : None or list or str - If use_element_symbols is a list of atom indexes, these atoms will use element symbols - instead of amber types. For example, a ligand will use C, H, O, N, and so on - instead of h1, hc, o, os, and so on. 
- IF use_element_symbols is str, it will be considered as Amber mask. - labeled : bool - Whether to return labeled data - """ - from scipy.io import netcdf_file - - flag_atom_type = False - flag_atom_numb = False - amber_types = [] - atomic_number = [] - with open_file(parm7_file) as f: - for line in f: - if line.startswith("%FLAG"): - flag_atom_type = line.startswith("%FLAG AMBER_ATOM_TYPE") - flag_atom_numb = (use_element_symbols is not None) and line.startswith( - "%FLAG ATOMIC_NUMBER" - ) - elif flag_atom_type or flag_atom_numb: - if line.startswith("%FORMAT"): - fmt = re.findall(r"\d+", line) - fmt0 = int(fmt[0]) - fmt1 = int(fmt[1]) - else: - for ii in range(fmt0): - start_index = ii * fmt1 - end_index = (ii + 1) * fmt1 - if end_index >= len(line): - continue - content = line[start_index:end_index].strip() - if flag_atom_type: - amber_types.append(content) - elif flag_atom_numb: - atomic_number.append(int(content)) - if use_element_symbols is not None: - if isinstance(use_element_symbols, str): - use_element_symbols = pick_by_amber_mask(parm7_file, use_element_symbols) - for ii in use_element_symbols: - amber_types[ii] = symbols[atomic_number[ii]] - - with netcdf_file(nc_file, "r") as f: - coords = np.array(f.variables["coordinates"][:]) - cell_lengths = np.array(f.variables["cell_lengths"][:]) - cell_angles = np.array(f.variables["cell_angles"][:]) - cells = cell_lengths_angles_to_cell(cell_lengths, cell_angles) - - if labeled: - with netcdf_file(mdfrc_file, "r") as f: - forces = np.array(f.variables["forces"][:]) - - # load energy from mden_file or mdout_file - energies = [] - if mden_file is not None and os.path.isfile(mden_file): - with open_file(mden_file) as f: - for line in f: - if line.startswith("L6"): - s = line.split() - if s[2] != "E_pot": - energies.append(float(s[2])) - elif mdout_file is not None and os.path.isfile(mdout_file): - with open_file(mdout_file) as f: - for line in f: - if "EPtot" in line: - s = line.split() - 
energies.append(float(s[-1])) - else: - raise RuntimeError("Please provide one of mden_file and mdout_file") - - atom_names, atom_types, atom_numbs = np.unique( - amber_types, return_inverse=True, return_counts=True - ) - - data = {} - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["atom_types"] = atom_types - if labeled: - data["forces"] = forces * force_convert - data["energies"] = np.array(energies) * energy_convert - data["coords"] = coords - data["cells"] = cells - data["orig"] = np.array([0, 0, 0]) - return data +from dpdata.formats.amber.md import * # noqa: F403 diff --git a/dpdata/amber/sqm.py b/dpdata/amber/sqm.py index 93e41f9aa..81db3ee9a 100644 --- a/dpdata/amber/sqm.py +++ b/dpdata/amber/sqm.py @@ -1,120 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.periodic_table import ELEMENTS -from dpdata.unit import EnergyConversion -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -kcal2ev = EnergyConversion("kcal_mol", "eV").value() - -START = 0 -READ_CHARGE = 2 -READ_COORDS_START = 3 -READ_COORDS = 6 -READ_FORCES = 7 - - -def parse_sqm_out(fname: FileType): - """Read atom symbols, charges and coordinates from ambertools sqm.out file.""" - atom_symbols = [] - coords = [] - charges = [] - forces = [] - energies = [] - - with open_file(fname) as f: - flag = START - for line in f: - if line.startswith(" Total SCF energy"): - energy = float(line.strip().split()[-2]) - energies = [energy] - elif line.startswith(" Atom Element Mulliken Charge"): - flag = READ_CHARGE - charges = [] - elif line.startswith(" Total Mulliken Charge"): - flag = START - elif line.startswith(" Final Structure"): - flag = READ_COORDS_START - coords = [] - elif line.startswith("QMMM: Forces on QM atoms"): - flag = READ_FORCES - forces = [] - elif flag == READ_CHARGE: - ls = line.strip().split() - atom_symbols.append(ls[-2]) - 
charges.append(float(ls[-1])) - elif READ_COORDS_START <= flag < READ_COORDS: - flag += 1 - elif flag == READ_COORDS: - coords.append([float(x) for x in line.strip().split()[-3:]]) - if len(coords) == len(charges): - flag = START - elif flag == READ_FORCES: - ll = line.strip() - if not ll.startswith("QMMM: Atm "): - flag = START - continue - forces.append([float(ll[-60:-40]), float(ll[-40:-20]), float(ll[-20:])]) - if len(forces) == len(charges): - flag = START - - data = {} - atom_names, data["atom_types"], atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["charges"] = np.array(charges) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["orig"] = np.array([0, 0, 0]) - data["cells"] = np.array( - [[[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]]] - ) - data["nopbc"] = True - data["coords"] = np.array([coords]) - - energies = np.array(energies) - forces = -np.array([forces], dtype=np.float64) * kcal2ev - if len(forces) > 0: - data["energies"] = energies - data["forces"] = forces - - return data - - -def make_sqm_in(data, fname: FileType | None = None, frame_idx=0, **kwargs): - symbols = [data["atom_names"][ii] for ii in data["atom_types"]] - atomic_numbers = [ELEMENTS.index(ss) + 1 for ss in symbols] - charge = kwargs.get("charge", 0) - - # multiplicity - mult = kwargs.get("mult", 1) - if mult != 1: - raise RuntimeError("Multiplicity is not 1, which is not supported by sqm") - - maxcyc = kwargs.get("maxcyc", 0) # 0 represents a single-point calculation - theory = kwargs.get("qm_theory", "DFTB3") - ret = "Run semi-emperical minimization\n" - ret += " &qmmm\n" - ret += f" qm_theory='{theory}'\n" - ret += f" qmcharge={charge}\n" - ret += f" maxcyc={maxcyc}\n" - ret += " verbosity=4\n" - ret += " /\n" - for ii in range(len(data["atom_types"])): - ret += "{:>4s}{:>6s}{:>16s}{:>16s}{:>16s}\n".format( - str(atomic_numbers[ii]), - str(symbols[ii]), - f"{data['coords'][frame_idx][ii, 
0]:.6f}", - f"{data['coords'][frame_idx][ii, 1]:.6f}", - f"{data['coords'][frame_idx][ii, 2]:.6f}", - ) - if fname is not None: - with open_file(fname, "w") as fp: - fp.write(ret) - return ret +from dpdata.formats.amber.sqm import * # noqa: F403 diff --git a/dpdata/bond_order_system.py b/dpdata/bond_order_system.py index 7a23acca5..db29d6b65 100644 --- a/dpdata/bond_order_system.py +++ b/dpdata/bond_order_system.py @@ -6,11 +6,11 @@ import numpy as np -import dpdata.rdkit.utils -from dpdata.rdkit.sanitize import Sanitizer +import dpdata.formats.rdkit.utils +from dpdata.formats.rdkit.sanitize import Sanitizer from dpdata.system import Axis, DataType, System -# import dpdata.rdkit.mol2 +# import dpdata.formats.rdkit.mol2 class BondOrderSystem(System): @@ -79,7 +79,7 @@ def __init__( self.sanitizer = Sanitizer(sanitize_level, raise_errors, verbose) if data: - mol = dpdata.rdkit.utils.system_data_to_mol(data) + mol = dpdata.formats.rdkit.utils.system_data_to_mol(data) self.from_rdkit_mol(mol) if file_name: self.from_fmt( @@ -161,7 +161,7 @@ def __add__(self, other): # magic method "+" operation # ''' # if isinstance(other, BondOrderSystem): - # if dpdata.rdkit.utils.check_same_molecule(self.rdkit_mol, other.rdkit_mol): + # if dpdata.formats.rdkit.utils.check_same_molecule(self.rdkit_mol, other.rdkit_mol): # self.__class__(self, data=other.data) # else: # raise RuntimeError("The two systems are not of the same topology.") @@ -171,7 +171,7 @@ def __add__(self, other): def from_rdkit_mol(self, rdkit_mol): """Initialize from a rdkit.Chem.rdchem.Mol object.""" rdkit_mol = self.sanitizer.sanitize(rdkit_mol) - self.data = dpdata.rdkit.utils.mol_to_system_data(rdkit_mol) + self.data = dpdata.formats.rdkit.utils.mol_to_system_data(rdkit_mol) self.data["bond_dict"] = dict( [(f"{int(bond[0])}-{int(bond[1])}", bond[2]) for bond in self.data["bonds"]] ) diff --git a/dpdata/cp2k/__init__.py b/dpdata/cp2k/__init__.py index e69de29bb..8e66eb10e 100644 --- a/dpdata/cp2k/__init__.py +++ 
b/dpdata/cp2k/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.cp2k import * # noqa: F403 diff --git a/dpdata/cp2k/cell.py b/dpdata/cp2k/cell.py index a3021b815..5fd326de9 100644 --- a/dpdata/cp2k/cell.py +++ b/dpdata/cp2k/cell.py @@ -1,68 +1,3 @@ -# %% from __future__ import annotations -import numpy as np - - -def cell_to_low_triangle(A, B, C, alpha, beta, gamma): - """Convert cell to low triangle matrix. - - Parameters - ---------- - A : float - cell length A - B : float - cell length B - C : float - cell length C - alpha : float - radian. The angle between vector B and vector C. - beta : float - radian. The angle between vector A and vector C. - gamma : float - radian. The angle between vector B and vector C. - - Returns - ------- - cell : list - The cell matrix used by dpdata in low triangle form. - """ - if not np.pi * 5 / 180 < alpha < np.pi * 175 / 180: - raise RuntimeError( - f"alpha=={alpha}: must be a radian, and \ - must be in np.pi*5/180 < alpha < np.pi*175/180" - ) - if not np.pi * 5 / 180 < beta < np.pi * 175 / 180: - raise RuntimeError( - f"beta=={beta}: must be a radian, and \ - must be in np.pi*5/180 < beta < np.pi*175/180" - ) - if not np.pi * 5 / 180 < gamma < np.pi * 175 / 180: - raise RuntimeError( - f"gamma=={gamma}: must be a radian, and \ - must be in np.pi*5/180 < gamma < np.pi*175/180" - ) - if not A > 0.2: - raise RuntimeError(f"A=={A}, must be greater than 0.2") - if not B > 0.2: - raise RuntimeError(f"B=={B}, must be greater than 0.2") - if not C > 0.2: - raise RuntimeError(f"C=={C}, must be greater than 0.2") - - lx = A - xy = B * np.cos(gamma) - xz = C * np.cos(beta) - ly = B * np.sin(gamma) - if not ly > 0.1: - raise RuntimeError( - "ly:=B* np.sin(gamma)=={}, must be greater than 0.1", format(ly) - ) - yz = (B * C * np.cos(alpha) - xy * xz) / ly - if not C**2 - xz**2 - yz**2 > 0.01: - raise RuntimeError( - "lz^2:=C**2-xz**2-yz**2=={}, must be greater than 0.01", - format(C**2 - xz**2 - yz**2), - 
) - lz = np.sqrt(C**2 - xz**2 - yz**2) - cell = np.asarray([[lx, 0, 0], [xy, ly, 0], [xz, yz, lz]]).astype("float32") - return cell +from dpdata.formats.cp2k.cell import * # noqa: F403 diff --git a/dpdata/cp2k/output.py b/dpdata/cp2k/output.py index bf575f728..b7f4ce4d4 100644 --- a/dpdata/cp2k/output.py +++ b/dpdata/cp2k/output.py @@ -1,510 +1,3 @@ -# %% from __future__ import annotations -import math -import re -from collections import OrderedDict - -import numpy as np - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) -from .cell import cell_to_low_triangle - -AU_TO_ANG = LengthConversion("bohr", "angstrom").value() -AU_TO_EV = EnergyConversion("hartree", "eV").value() -AU_TO_EV_EVERY_ANG = ForceConversion("hartree/bohr", "eV/angstrom").value() -delimiter_patterns = [] -delimiter_p1 = re.compile(r"^ \* GO CP2K GO! \*+") -delimiter_p2 = re.compile(r"^ \*+") -delimiter_patterns.append(delimiter_p1) -delimiter_patterns.append(delimiter_p2) -avail_patterns = [] -avail_patterns.append(re.compile(r"^ INITIAL POTENTIAL ENERGY")) -avail_patterns.append(re.compile(r"^ ENSEMBLE TYPE")) - - -class Cp2kSystems: - """deal with cp2k outputfile.""" - - def __init__(self, log_file_name, xyz_file_name, restart=False): - self.log_file_object = open(log_file_name) - self.xyz_file_object = open(xyz_file_name) - self.log_block_generator = self.get_log_block_generator() - self.xyz_block_generator = self.get_xyz_block_generator() - self.restart_flag = restart - - self.cell = None - self.print_level = None - - self.atomic_kinds = None - - if self.restart_flag: - self.handle_single_log_frame(next(self.log_block_generator)) - - def __del__(self): - self.log_file_object.close() - self.xyz_file_object.close() - - def __iter__(self): - return self - - def __next__(self): - info_dict = {} - log_info_dict = self.handle_single_log_frame(next(self.log_block_generator)) - # print(log_info_dict) - xyz_info_dict = 
self.handle_single_xyz_frame(next(self.xyz_block_generator)) - # eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])] - # eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])] - # eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])] - # assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - # assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - # assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') - assert math.isclose( - log_info_dict["energies"][0], xyz_info_dict["energies"][0], abs_tol=1.0e-6 - ), ( - log_info_dict["energies"], - xyz_info_dict["energies"], - "There may be errors in the file", - ) - info_dict.update(log_info_dict) - info_dict.update(xyz_info_dict) - return info_dict - - def get_log_block_generator(self): - lines = [] - delimiter_flag = False - yield_flag = False - while True: - line = self.log_file_object.readline() - if line: - lines.append(line) - if any(p.match(line) for p in delimiter_patterns): - if delimiter_flag is True: - yield_flag = True - yield lines - lines = [] - delimiter_flag = False - else: - line = self.log_file_object.readline() - lines.append(line) - if any(p.match(line) for p in avail_patterns): - delimiter_flag = True - else: - if not yield_flag: - raise StopIteration("None of the delimiter patterns are matched") - break - if delimiter_flag is True: - raise RuntimeError("This file lacks some content, please check") - - def get_xyz_block_generator(self): - p3 = re.compile(r"^\s*(\d+)\s*") - yield_flag = False - while True: - line = self.xyz_file_object.readline() - if not line: - if not yield_flag: - raise StopIteration("None of the xyz patterns are matched") - break - if p3.match(line): - yield_flag = True 
- atom_num = int(p3.match(line).group(1)) - lines = [] - lines.append(line) - for ii in range(atom_num + 1): - lines.append(self.xyz_file_object.readline()) - if not lines[-1]: - raise RuntimeError( - f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" - ) - yield lines - - def handle_single_log_frame(self, lines): - info_dict = {} - energy_pattern_1 = re.compile( - r" INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" - ) - # CONSERVED QUANTITY [hartree] = -0.279168013085E+04 - energy_pattern_2 = re.compile( - r" POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" - ) - energy = None - cell_length_pattern = re.compile( - r" (INITIAL ){0,1}CELL LNTHS\[bohr\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_angle_pattern = re.compile( - r" (INITIAL ){0,1}CELL ANGLS\[deg\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_A, cell_B, cell_C = ( - 0, - 0, - 0, - ) - cell_alpha, cell_beta, cell_gamma = ( - 0, - 0, - 0, - ) - cell_a_pattern = re.compile( - r" CELL\| Vector a \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_b_pattern = re.compile( - r" CELL\| Vector b \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - cell_c_pattern = re.compile( - r" CELL\| Vector c \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" - ) - force_start_pattern = re.compile(r" ATOMIC FORCES in") - force_flag = False - force_end_pattern = re.compile(r" SUM OF ATOMIC FORCES") - force_lines = [] - cell_flag = 0 - print_level_pattern = re.compile( - r" GLOBAL\| Global print level\s+(?P\S+)" - ) - print_level_flag = 0 - atomic_kinds_pattern = re.compile(r"\s+\d+\. 
Atomic kind:\s+(?P\S+)") - atomic_kinds = [] - stress_sign = "STRESS" - stress_flag = 0 - stress = [] - - for line in lines: - if stress_flag == 3: - if line == "\n": - stress_flag = 0 - else: - stress.append(line.split()[1:4]) - if stress_flag == 2: - stress_flag = 3 - if stress_flag == 1: - stress_flag = 2 - if stress_sign in line: - stress_flag = 1 - if force_start_pattern.match(line): - force_flag = True - if force_end_pattern.match(line): - assert force_flag is True, ( - force_flag, - "there may be errors in this file ", - ) - force_flag = False - if force_flag is True: - force_lines.append(line) - if energy_pattern_1.match(line): - energy = ( - float(energy_pattern_1.match(line).groupdict()["number"]) * AU_TO_EV - ) - # print('1to', energy) - if energy_pattern_2.match(line): - energy = ( - float(energy_pattern_2.match(line).groupdict()["number"]) * AU_TO_EV - ) - if cell_length_pattern.match(line): - cell_A = ( - float(cell_length_pattern.match(line).groupdict()["A"]) * AU_TO_ANG - ) - cell_B = ( - float(cell_length_pattern.match(line).groupdict()["B"]) * AU_TO_ANG - ) - cell_C = ( - float(cell_length_pattern.match(line).groupdict()["C"]) * AU_TO_ANG - ) - cell_flag += 1 - if cell_angle_pattern.match(line): - cell_alpha = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["alpha"]) - ) - cell_beta = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["beta"]) - ) - cell_gamma = np.deg2rad( - float(cell_angle_pattern.match(line).groupdict()["gamma"]) - ) - cell_flag += 1 - if print_level_pattern.match(line): - print_level = print_level_pattern.match(line).groupdict()["print_level"] - print_level_flag += 1 - if cell_a_pattern.match(line): - cell_ax = float(cell_a_pattern.match(line).groupdict()["ax"]) - cell_ay = float(cell_a_pattern.match(line).groupdict()["ay"]) - cell_az = float(cell_a_pattern.match(line).groupdict()["az"]) - cell_flag += 1 - if cell_b_pattern.match(line): - cell_bx = float(cell_b_pattern.match(line).groupdict()["bx"]) 
- cell_by = float(cell_b_pattern.match(line).groupdict()["by"]) - cell_bz = float(cell_b_pattern.match(line).groupdict()["bz"]) - cell_flag += 1 - if cell_c_pattern.match(line): - cell_cx = float(cell_c_pattern.match(line).groupdict()["cx"]) - cell_cy = float(cell_c_pattern.match(line).groupdict()["cy"]) - cell_cz = float(cell_c_pattern.match(line).groupdict()["cz"]) - cell_flag += 1 - - if atomic_kinds_pattern.match(line): - akind = atomic_kinds_pattern.match(line).groupdict()["akind"] - atomic_kinds.append(akind) - if print_level_flag == 1: - self.print_level = print_level - if print_level == "LOW": - raise RuntimeError( - "please provide cp2k output with higher print level(at least MEDIUM)" - ) - - if cell_flag == 2: - self.cell = cell_to_low_triangle( - cell_A, cell_B, cell_C, cell_alpha, cell_beta, cell_gamma - ) - elif cell_flag == 5: - self.cell = np.asarray( - [ - [cell_ax, cell_ay, cell_az], - [cell_bx, cell_by, cell_bz], - [cell_cx, cell_cy, cell_cz], - ] - ).astype("float64") - if atomic_kinds: - self.atomic_kinds = atomic_kinds - # print(self.atomic_kinds) - # lx = cell_A - # xy = cell_B * np.cos(cell_gamma) - # xz = cell_C * np.cos(cell_beta) - # ly = cell_B* np.sin(cell_gamma) - # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly - # lz = np.sqrt(cell_C**2-xz**2-yz**2) - # self.cell = [[lx, 0 , 0], - # [xy, ly, 0 ], - # [xz, yz, lz]] - - element_index = -1 - element_dict = OrderedDict() - atom_types_idx_list = [] - forces_list = [] - for line in force_lines[3:]: - line_list = line.split() - # print(line_list) - if element_dict.get(line_list[1]): - element_dict[line_list[1]][1] += 1 - else: - element_index += 1 - element_dict[line_list[1]] = [element_index, 1] - atom_types_idx_list.append(element_dict[line_list[1]][0]) - forces_list.append( - [ - float(line_list[3]) * AU_TO_EV_EVERY_ANG, - float(line_list[4]) * AU_TO_EV_EVERY_ANG, - float(line_list[5]) * AU_TO_EV_EVERY_ANG, - ] - ) - # print(atom_types_idx_list) - # atom_names=list(element_dict.keys()) 
- atom_names = self.atomic_kinds - atom_numbs = [] - - GPa = PressureConversion("eV/angstrom^3", "GPa").value() - if stress: - stress = np.array(stress) - stress = stress.astype("float64") - stress = stress[np.newaxis, :, :] - # stress to virial conversion, default unit in cp2k is GPa - # note the stress is virial = stress * volume - virial = stress * np.linalg.det(self.cell) / GPa - virial = virial.squeeze() - else: - virial = None - for ii in element_dict.keys(): - atom_numbs.append(element_dict[ii][1]) - # print(atom_numbs) - info_dict["atom_names"] = atom_names - info_dict["atom_numbs"] = atom_numbs - info_dict["atom_types"] = np.asarray(atom_types_idx_list) - info_dict["print_level"] = self.print_level - info_dict["cells"] = np.asarray([self.cell]).astype("float64") - info_dict["energies"] = np.asarray([energy]).astype("float64") - info_dict["forces"] = np.asarray([forces_list]).astype("float64") - if virial is not None: - info_dict["virials"] = np.asarray([virial]).astype("float64") - return info_dict - - def handle_single_xyz_frame(self, lines): - info_dict = {} - atom_num = int(lines[0].strip("\n").strip()) - if len(lines) != atom_num + 2: - raise RuntimeError( - f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" - ) - data_format_line = lines[1].strip("\n").strip() + " " - prop_pattern = re.compile(r"(?P\w+)\s*=\s*(?P.*?)[, ]") - prop_dict = dict(prop_pattern.findall(data_format_line)) - - energy = 0 - if prop_dict.get("E"): - energy = float(prop_dict.get("E")) * AU_TO_EV - # info_dict['energies'] = np.array([prop_dict['E']]).astype('float64') - - element_index = -1 - element_dict = OrderedDict() - atom_types_list = [] - coords_list = [] - for line in lines[2:]: - line_list = line.split() - if element_dict.get(line_list[0]): - element_dict[line_list[0]][1] += 1 - else: - element_index += 1 - element_dict[line_list[0]] = [element_index, 1] - atom_types_list.append(element_dict[line_list[0]][0]) - # 
coords_list.append([float(line_list[1])*AU_TO_ANG, - # float(line_list[2])*AU_TO_ANG, - # float(line_list[3])*AU_TO_ANG]) - coords_list.append( - [float(line_list[1]), float(line_list[2]), float(line_list[3])] - ) - atom_names = list(element_dict.keys()) - atom_numbs = [] - for ii in atom_names: - atom_numbs.append(element_dict[ii][1]) - # info_dict['atom_names'] = atom_names - # info_dict['atom_numbs'] = atom_numbs - # info_dict['atom_types'] = np.asarray(atom_types_list) - info_dict["coords"] = np.asarray([coords_list]).astype("float64") - info_dict["energies"] = np.array([energy]).astype("float64") - info_dict["orig"] = np.zeros(3) - return info_dict - - -# %% - - -def get_frames(fname): - coord_flag = False - force_flag = False - stress_flag = False - eV = EnergyConversion("hartree", "eV").value() - angstrom = LengthConversion("bohr", "angstrom").value() - GPa = PressureConversion("eV/angstrom^3", "GPa").value() - atom_symbol_idx_list = [] - atom_symbol_list = [] - cell = [] - coord = [] - force = [] - stress = [] - - fp = open(fname) - # check if output is converged, if not, return sys = 0 - content = fp.read() - count = content.count("SCF run converged") - if count == 0: - fp.close() - return [], [], [], [], [], [], [], None - - # search duplicated header - fp.seek(0) - header_idx = [] - for idx, ii in enumerate(fp): - if "Multiplication driver" in ii: - header_idx.append(idx) - - # parse from last header - fp.seek(0) - for idx, ii in enumerate(fp): - if idx > header_idx[-1]: - if "CELL| Vector" in ii: - cell.append(ii.split()[4:7]) - if "Atomic kind:" in ii: - atom_symbol_list.append(ii.split()[3]) - - # beginning of coords block - if "Atom Kind Element" in ii or "Atom Kind Element" in ii: - coord_flag = True - # parse coords lines - elif coord_flag: - if ii == "\n": - coord_flag = len(coord) == 0 # skip empty line at the beginning - else: - coord.append(ii.split()[4:7]) - atom_symbol_idx_list.append(ii.split()[1]) - - if "ENERGY|" in ii: - energy = 
ii.split()[8] - if " Atom Kind " in ii: - force_flag = True - force_idx = idx - if force_flag: - if idx > force_idx: - if "SUM OF ATOMIC FORCES" in ii: - force_flag = False - else: - force.append(ii.split()[3:6]) - # add reading stress tensor - if "STRESS TENSOR [GPa" in ii: - stress_flag = True - stress_idx = idx - if stress_flag: - if idx > stress_idx + 2: - if ii == "\n": - stress_flag = False - else: - stress.append(ii.split()[1:4]) - - fp.close() - assert coord, "cannot find coords" - assert energy, "cannot find energies" - assert force, "cannot find forces" - - # conver to float array and add extra dimension for nframes - cell = np.array(cell) - cell = cell.astype("float64") - cell = cell[np.newaxis, :, :] - coord = np.array(coord) - coord = coord.astype("float64") - coord = coord[np.newaxis, :, :] - atom_symbol_idx_list = np.array(atom_symbol_idx_list) - atom_symbol_idx_list = atom_symbol_idx_list.astype(int) - atom_symbol_idx_list = atom_symbol_idx_list - 1 - atom_symbol_list = np.array(atom_symbol_list) - atom_symbol_list = atom_symbol_list[atom_symbol_idx_list] - force = np.array(force) - force = force.astype("float64") - force = force[np.newaxis, :, :] - - # virial is not necessary - if stress: - stress = np.array(stress) - stress = stress.astype("float64") - stress = stress[np.newaxis, :, :] - # stress to virial conversion, default unit in cp2k is GPa - # note the stress is virial = stress * volume - virial = stress * np.linalg.det(cell[0]) / GPa - else: - virial = None - - # force unit conversion, default unit in cp2k is hartree/bohr - force = force * eV / angstrom - # energy unit conversion, default unit in cp2k is hartree - energy = float(energy) * eV - energy = np.array(energy).astype("float64") - energy = energy[np.newaxis] - - tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True) - atom_types = [] - atom_numbs = [] - # preserve the atom_name order - atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")] - for jj in 
atom_symbol_list: - for idx, ii in enumerate(atom_names): - if jj == ii: - atom_types.append(idx) - for idx in range(len(atom_names)): - atom_numbs.append(atom_types.count(idx)) - - atom_types = np.array(atom_types) - - return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force, virial - - -# %% +from dpdata.formats.cp2k.output import * # noqa: F403 diff --git a/dpdata/deepmd/__init__.py b/dpdata/deepmd/__init__.py index e69de29bb..2ee2e6795 100644 --- a/dpdata/deepmd/__init__.py +++ b/dpdata/deepmd/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.deepmd import * # noqa: F403 diff --git a/dpdata/deepmd/comp.py b/dpdata/deepmd/comp.py index 410d789e1..f2da0108a 100644 --- a/dpdata/deepmd/comp.py +++ b/dpdata/deepmd/comp.py @@ -1,162 +1,3 @@ from __future__ import annotations -import glob -import os -import shutil -import warnings - -import numpy as np - -import dpdata -from dpdata.utils import open_file - -from .raw import load_type - - -def _cond_load_data(fname): - tmp = None - if os.path.isfile(fname): - tmp = np.load(fname) - return tmp - - -def _load_set(folder, nopbc: bool): - coords = np.load(os.path.join(folder, "coord.npy")) - if nopbc: - cells = np.zeros((coords.shape[0], 3, 3)) - else: - cells = np.load(os.path.join(folder, "box.npy")) - return cells, coords - - -def to_system_data(folder, type_map=None, labels=True): - # data is empty - data = load_type(folder, type_map=type_map) - data["orig"] = np.zeros([3]) - if os.path.isfile(os.path.join(folder, "nopbc")): - data["nopbc"] = True - sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - all_cells = [] - all_coords = [] - for ii in sets: - cells, coords = _load_set(ii, data.get("nopbc", False)) - nframes = np.reshape(cells, [-1, 3, 3]).shape[0] - all_cells.append(np.reshape(cells, [nframes, 3, 3])) - all_coords.append(np.reshape(coords, [nframes, -1, 3])) - data["cells"] = np.concatenate(all_cells, axis=0) - data["coords"] = 
np.concatenate(all_coords, axis=0) - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "coords", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/npy format." - ) - continue - natoms = data["atom_types"].shape[0] - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] - ] - all_data = [] - for ii in sets: - tmp = _cond_load_data(os.path.join(ii, dtype.deepmd_name + ".npy")) - if tmp is not None: - all_data.append(np.reshape(tmp, [tmp.shape[0], *shape])) - if len(all_data) > 0: - data[dtype.name] = np.concatenate(all_data, axis=0) - return data - - -def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): - os.makedirs(folder, exist_ok=True) - sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - if len(sets) > 0: - if remove_sets: - for ii in sets: - shutil.rmtree(ii) - else: - raise RuntimeError( - "found " - + str(sets) - + " in " - + folder - + "not a clean deepmd raw dir. 
please firstly clean set.* then try compress" - ) - # dump raw - np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") - np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s") - # BondOrder System - if "bonds" in data: - np.savetxt( - os.path.join(folder, "bonds.raw"), - data["bonds"], - header="begin_atom, end_atom, bond_order", - ) - if "formal_charges" in data: - np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) - # reshape frame properties and convert prec - nframes = data["cells"].shape[0] - # dump frame properties: cell, coord, energy, force and virial - nsets = nframes // set_size - if set_size * nsets < nframes: - nsets += 1 - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = os.path.join(folder, "set.%03d" % ii) # noqa: UP031 - os.makedirs(set_folder) - try: - os.remove(os.path.join(folder, "nopbc")) - except OSError: - pass - if data.get("nopbc", False): - with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc: - pass - # allow custom dtypes - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if dtype.name not in data: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/npy format." 
- ) - continue - ddata = np.reshape(data[dtype.name], [nframes, -1]) - if np.issubdtype(ddata.dtype, np.floating): - ddata = ddata.astype(comp_prec) - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = os.path.join(folder, "set.%03d" % ii) # noqa: UP031 - np.save(os.path.join(set_folder, dtype.deepmd_name), ddata[set_stt:set_end]) +from dpdata.formats.deepmd.comp import * # noqa: F403 diff --git a/dpdata/deepmd/hdf5.py b/dpdata/deepmd/hdf5.py index c2b3bd424..9ef1c5a75 100644 --- a/dpdata/deepmd/hdf5.py +++ b/dpdata/deepmd/hdf5.py @@ -1,228 +1,3 @@ -"""Utils for deepmd/hdf5 format.""" - from __future__ import annotations -import warnings -from typing import TYPE_CHECKING - -import numpy as np - -import dpdata - -if TYPE_CHECKING: - import h5py - -__all__ = ["to_system_data", "dump"] - - -def to_system_data( - f: h5py.File | h5py.Group, - folder: str, - type_map: list | None = None, - labels: bool = True, -): - """Load a HDF5 file. - - Parameters - ---------- - f : h5py.File or h5py.Group - HDF5 file or group object - folder : str - path in the HDF5 file - type_map : list - type map - labels : bool - labels - """ - from wcmatch.glob import globfilter - - g = f[folder] if folder else f - - data = {} - # ignore empty files or groups - if "type.raw" not in g.keys(): - return data - data["atom_types"] = g["type.raw"][:] - ntypes = np.max(data["atom_types"]) + 1 - natoms = data["atom_types"].size - data["atom_numbs"] = [] - for ii in range(ntypes): - data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) - data["atom_names"] = [] - # if find type_map.raw, use it - if "type_map.raw" in g.keys(): - my_type_map = list(np.char.decode(g["type_map.raw"][:])) - # else try to use arg type_map - elif type_map is not None: - my_type_map = type_map - # in the last case, make artificial atom names - else: - my_type_map = [] - for ii in range(ntypes): - my_type_map.append("Type_%d" % ii) # noqa: UP031 - assert 
len(my_type_map) >= len(data["atom_numbs"]) - for ii in range(len(data["atom_numbs"])): - data["atom_names"].append(my_type_map[ii]) - - data["orig"] = np.zeros([3]) - if "nopbc" in g.keys(): - data["nopbc"] = True - sets = globfilter(g.keys(), "set.*") - - data_types = {} - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/hdf5 format." - ) - continue - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] - ] - - data_types[dtype.name] = { - "fn": dtype.deepmd_name, - "shape": shape, - "required": dtype.required - and not (dtype.name == "cells" and data.get("nopbc", False)), - } - - for dt, prop in data_types.items(): - all_data = [] - - for ii in sets: - set = g[ii] - fn = "{}.npy".format(prop["fn"]) - if fn in set.keys(): - dd = set[fn][:] - nframes = dd.shape[0] - all_data.append(np.reshape(dd, (nframes, *prop["shape"]))) - elif prop["required"]: - raise RuntimeError(f"{folder}/{ii}/{fn} not found") - - if len(all_data) > 0: - data[dt] = np.concatenate(all_data, axis=0) - if "cells" not in data: - nframes = data["coords"].shape[0] - data["cells"] = np.zeros((nframes, 3, 3)) - return data - - -def dump( - f: h5py.File | h5py.Group, - folder: str, - data: dict, - set_size=5000, - comp_prec=np.float32, -) -> None: - """Dump data to a HDF5 file. 
- - Parameters - ---------- - f : h5py.File or h5py.Group - HDF5 file or group object - folder : str - path in the HDF5 file - data : dict - System or LabeledSystem data - set_size : int, default: 5000 - size of a set - comp_prec : np.dtype, default: np.float32 - precision of data - """ - # if folder is None, use the root of the file - if folder: - if folder in f: - del f[folder] - g = f.create_group(folder) - else: - g = f - # ignore empty systems - if not len(data["coords"]): - return - # dump raw (array in fact) - g.create_dataset("type.raw", data=data["atom_types"]) - g.create_dataset("type_map.raw", data=np.array(data["atom_names"], dtype="S")) - # BondOrder System - if "bonds" in data: - g.create_dataset("bonds.raw", data=data["bonds"]) - if "formal_charges" in data: - g.create_dataset("formal_charges.raw", data=data["formal_charges"]) - # reshape frame properties and convert prec - nframes = data["cells"].shape[0] - - nopbc = data.get("nopbc", False) - reshaped_data = {} - - data_types = {} - - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - # allow custom dtypes - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/hdf5 format." 
- ) - continue - - data_types[dtype.name] = { - "fn": dtype.deepmd_name, - "shape": (nframes, -1), - "dump": not (dtype.name == "cells" and nopbc), - } - - for dt, prop in data_types.items(): - if dt in data: - if prop["dump"]: - ddata = np.reshape(data[dt], prop["shape"]) - if np.issubdtype(ddata.dtype, np.floating): - ddata = ddata.astype(comp_prec) - reshaped_data[dt] = ddata - - # dump frame properties: cell, coord, energy, force and virial - nsets = nframes // set_size - if set_size * nsets < nframes: - nsets += 1 - for ii in range(nsets): - set_stt = ii * set_size - set_end = (ii + 1) * set_size - set_folder = g.create_group("set.%03d" % ii) # noqa: UP031 - for dt, prop in data_types.items(): - if dt in reshaped_data: - set_folder.create_dataset( - "{}.npy".format(prop["fn"]), data=reshaped_data[dt][set_stt:set_end] - ) - - if nopbc: - g.create_dataset("nopbc", data=True) +from dpdata.formats.deepmd.hdf5 import * # noqa: F403 diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 734b6a730..a730d076b 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -1,299 +1,3 @@ from __future__ import annotations -import copy -import math - -import numpy as np - -import dpdata -from dpdata.data_type import Axis - -from .comp import dump as comp_dump -from .comp import to_system_data as comp_to_system_data - - -def _pad_to(sys_data, target_natoms, dtypes): - """Pad system data dict so that NATOMS dimension becomes target_natoms. - - Virtual atoms get real_atom_types = -1, and all other per-atom data is - padded with zeros. - - Parameters - ---------- - sys_data : dict - System data dict, already in mixed-type format. - target_natoms : int - Target number of atoms after padding. - dtypes : tuple[DataType, ...] - Registered data types to iterate for generic per-atom padding. 
- """ - natoms = sys_data["atom_types"].shape[0] - npad = target_natoms - natoms - if npad <= 0: - return - nframes = sys_data["coords"].shape[0] - - # Pad atom_types (all MIXED_TOKEN = 0) - sys_data["atom_types"] = np.concatenate( - [sys_data["atom_types"], np.zeros(npad, dtype=int)] - ) - sys_data["atom_numbs"] = [target_natoms] - - # Pad real_atom_types with -1 (virtual atom sentinel) - sys_data["real_atom_types"] = np.concatenate( - [ - sys_data["real_atom_types"], - -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), - ], - axis=1, - ) - - # Pad coords and all other per-atom data generically - reserved = { - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "real_atom_names", - "real_atom_types", - "nopbc", - } - for dtype in dtypes: - if dtype.name in reserved: - continue - if dtype.name not in sys_data: - continue - if not ( - len(dtype.shape) >= 2 - and dtype.shape[0] == Axis.NFRAMES - and Axis.NATOMS in dtype.shape - ): - continue - axis_natoms = list(dtype.shape).index(Axis.NATOMS) - arr = sys_data[dtype.name] - pad_width = [(0, 0)] * len(arr.shape) - pad_width[axis_natoms] = (0, npad) - sys_data[dtype.name] = np.pad( - arr, pad_width, mode="constant", constant_values=0 - ) - - -def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): - """Strip virtual atoms (type -1) from a group of frames. - - Parameters - ---------- - atom_types_row : np.ndarray - 1-D array of atom type indices for the group (same for all frames). - coords : np.ndarray - Coordinates array, shape (nframes, natoms_padded, 3). - extra_data : dict - Dict of {name: array} for this group, arrays already frame-sliced. - dtypes : tuple[DataType, ...] - Registered data types. - - Returns - ------- - atom_types : np.ndarray - Atom types with virtual atoms removed. - coords : np.ndarray - Coords with virtual atoms removed. - extra_data : dict - Extra data with virtual atoms removed. 
- """ - real_mask = atom_types_row >= 0 - if real_mask.all(): - return atom_types_row, coords, extra_data - - atom_types = atom_types_row[real_mask] - coords = coords[:, real_mask, :] - - stripped = {} - for name, arr in extra_data.items(): - for dtype in dtypes: - if dtype.name == name and Axis.NATOMS in dtype.shape: - axis_natoms = list(dtype.shape).index(Axis.NATOMS) - idx = [slice(None)] * len(arr.shape) - idx[axis_natoms] = real_mask - arr = arr[tuple(idx)] - break - stripped[name] = arr - - return atom_types, coords, stripped - - -def to_system_data(folder, type_map=None, labels=True): - data = comp_to_system_data(folder, type_map, labels) - # data is empty - old_type_map = data["atom_names"].copy() - if type_map is not None: - assert isinstance(type_map, list) - missing_type = [i for i in old_type_map if i not in type_map] - assert not missing_type, ( - f"These types are missing in selected type_map: {missing_type} !" - ) - index_map = np.array([type_map.index(i) for i in old_type_map]) - data["atom_names"] = type_map.copy() - else: - index_map = None - all_real_atom_types_concat = data.pop("real_atom_types").astype(int) - if index_map is not None: - # Preserve -1 (virtual atom sentinel) during remapping - valid = all_real_atom_types_concat >= 0 - remapped = np.full_like(all_real_atom_types_concat, -1) - remapped[valid] = index_map[all_real_atom_types_concat[valid]] - all_real_atom_types_concat = remapped - all_cells_concat = data["cells"] - all_coords_concat = data["coords"] - - # handle custom registered data types - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - reserved = { - "atom_numbs", - "atom_names", - "atom_types", - "real_atom_names", - "real_atom_types", - "cells", - "coords", - "orig", - "nopbc", - } - extra_data = {} - for dtype in dtypes: - name = dtype.name - if name in reserved: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - continue - if 
name in data: - extra_data[name] = data.pop(name) - - data_list = [] - while True: - if all_real_atom_types_concat.size == 0: - break - # temp_formula = formula(data['atom_names'], temp_atom_numbs) - temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ - (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) - ] - rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ - (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) - ] - - # Extract data for this group - group_atom_types = all_real_atom_types_concat[0] - group_coords = all_coords_concat[temp_idx] - group_extra = {} - for name in extra_data: - group_extra[name] = extra_data[name][temp_idx] - extra_data[name] = extra_data[name][rest_idx] - - # Strip virtual atoms (type -1) introduced by padding - group_atom_types, group_coords, group_extra = _strip_virtual_atoms( - group_atom_types, group_coords, group_extra, dtypes - ) - - temp_atom_numbs = [ - np.count_nonzero(group_atom_types == i) - for i in range(len(data["atom_names"])) - ] - - temp_data = data.copy() - temp_data["atom_names"] = data["atom_names"].copy() - temp_data["atom_numbs"] = temp_atom_numbs - temp_data["atom_types"] = group_atom_types - all_real_atom_types_concat = all_real_atom_types_concat[rest_idx] - temp_data["cells"] = all_cells_concat[temp_idx] - all_cells_concat = all_cells_concat[rest_idx] - temp_data["coords"] = group_coords - all_coords_concat = all_coords_concat[rest_idx] - - for name in group_extra: - temp_data[name] = group_extra[name] - - data_list.append(temp_data) - return data_list - - -def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True): - # if not converted to mixed - if "real_atom_types" not in data: - from dpdata import LabeledSystem, System - - # not change the original content - data = copy.deepcopy(data) - - if "energies" in data: - temp_sys = LabeledSystem(data=data) - else: - temp_sys = System(data=data) - temp_sys.convert_to_mixed_type() - - data = 
data.copy() - data["atom_names"] = data.pop("real_atom_names") - comp_dump(folder, data, set_size, comp_prec, remove_sets) - - -def mix_system(*system, type_map, atom_numb_pad=None, **kwargs): - """Mix the systems into mixed_type ones according to the unified given type_map. - - Parameters - ---------- - *system : System - The systems to mix - type_map : list of str - Maps atom type to name - atom_numb_pad : int, optional - If provided, pad atom counts to the next multiple of this number - using virtual atoms (type -1 in real_atom_types). This reduces the - number of subdirectories when systems have many different atom counts. - For example, atom_numb_pad=8 groups systems into multiples of 8. - **kwargs : dict - Other parameters - - Returns - ------- - mixed_systems: dict - dict of mixed system with key 'atom_numbs' - """ - mixed_systems = {} - temp_systems = {} - atom_numbs_frame_index = {} # index of frames in cur sys - # Use LabeledSystem DTYPES as superset for generic per-atom padding - dtypes = dpdata.system.LabeledSystem.DTYPES - for sys in system: - tmp_sys = sys.copy() - natom = tmp_sys.get_natoms() - tmp_sys.convert_to_mixed_type(type_map=type_map) - if atom_numb_pad is not None and atom_numb_pad > 1: - padded_natom = math.ceil(natom / atom_numb_pad) * atom_numb_pad - _pad_to(tmp_sys.data, padded_natom, dtypes) - group_key = str(padded_natom) - else: - group_key = str(natom) - if group_key not in atom_numbs_frame_index: - atom_numbs_frame_index[group_key] = 0 - atom_numbs_frame_index[group_key] += tmp_sys.get_nframes() - if group_key not in temp_systems or not temp_systems[group_key]: - temp_systems[group_key] = tmp_sys - else: - temp_systems[group_key].append(tmp_sys) - for natom_key in temp_systems: - if atom_numbs_frame_index[natom_key] > 0: - mixed_systems[natom_key] = temp_systems[natom_key] - return mixed_systems - - -def split_system(sys, split_num=10000): - rest = sys.get_nframes() - split_num - if rest <= 0: - return sys, None, 0 - else: - 
split_sys = sys.sub_system(range(split_num)) - rest_sys = sys.sub_system(range(split_num, sys.get_nframes())) - return split_sys, rest_sys, rest +from dpdata.formats.deepmd.mixed import * # noqa: F403 diff --git a/dpdata/deepmd/raw.py b/dpdata/deepmd/raw.py index 50dc5afd3..2c7d1d4ed 100644 --- a/dpdata/deepmd/raw.py +++ b/dpdata/deepmd/raw.py @@ -1,140 +1,3 @@ from __future__ import annotations -import os -import warnings - -import numpy as np - -import dpdata -from dpdata.utils import open_file - - -def load_type(folder, type_map=None): - data = {} - data["atom_types"] = np.loadtxt(os.path.join(folder, "type.raw"), ndmin=1).astype( - int - ) - ntypes = np.max(data["atom_types"]) + 1 - data["atom_names"] = [] - # if find type_map.raw, use it - if os.path.isfile(os.path.join(folder, "type_map.raw")): - with open_file(os.path.join(folder, "type_map.raw")) as fp: - my_type_map = fp.read().split() - # else try to use arg type_map - elif type_map is not None: - my_type_map = type_map - # in the last case, make artificial atom names - else: - my_type_map = [] - for ii in range(ntypes): - my_type_map.append("Type_%d" % ii) # noqa: UP031 - data["atom_names"] = my_type_map - data["atom_numbs"] = [] - for ii, _ in enumerate(data["atom_names"]): - data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) - - return data - - -def to_system_data(folder, type_map=None, labels=True): - if os.path.isdir(folder): - data = load_type(folder, type_map=type_map) - data["orig"] = np.zeros([3]) - data["coords"] = np.loadtxt(os.path.join(folder, "coord.raw"), ndmin=2) - nframes = data["coords"].shape[0] - if os.path.isfile(os.path.join(folder, "nopbc")): - data["nopbc"] = True - data["cells"] = np.zeros((nframes, 3, 3)) - else: - data["cells"] = np.loadtxt(os.path.join(folder, "box.raw"), ndmin=2) - data["cells"] = np.reshape(data["cells"], [nframes, 3, 3]) - data["coords"] = np.reshape(data["coords"], [nframes, -1, 3]) - if os.path.isfile(os.path.join(folder, "nopbc")): - 
data["nopbc"] = True - # allow custom dtypes - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - "cells", - "coords", - "real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/raw format." - ) - continue - natoms = data["atom_types"].shape[0] - shape = [ - natoms if xx == dpdata.system.Axis.NATOMS else xx - for xx in dtype.shape[1:] - ] - if os.path.exists(os.path.join(folder, f"{dtype.deepmd_name}.raw")): - data[dtype.name] = np.reshape( - np.loadtxt(os.path.join(folder, f"{dtype.deepmd_name}.raw")), - [nframes, *shape], - ) - return data - else: - raise RuntimeError("not dir " + folder) - - -def dump(folder, data): - os.makedirs(folder, exist_ok=True) - nframes = data["cells"].shape[0] - np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") - np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s") - # BondOrder System - if "bonds" in data: - np.savetxt( - os.path.join(folder, "bonds.raw"), - data["bonds"], - header="begin_atom, end_atom, bond_order", - ) - if "formal_charges" in data: - np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) - try: - os.remove(os.path.join(folder, "nopbc")) - except OSError: - pass - if data.get("nopbc", False): - with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc: - pass - # allow custom dtypes - labels = "energies" in data - if labels: - dtypes = dpdata.system.LabeledSystem.DTYPES - else: - dtypes = dpdata.system.System.DTYPES - for dtype in dtypes: - if dtype.name in ( - "atom_numbs", - "atom_names", - "atom_types", - "orig", - 
"real_atom_types", - "real_atom_names", - "nopbc", - ): - # skip as these data contains specific rules - continue - if dtype.name not in data: - continue - if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): - warnings.warn( - f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/raw format." - ) - continue - ddata = np.reshape(data[dtype.name], [nframes, -1]) - np.savetxt(os.path.join(folder, f"{dtype.deepmd_name}.raw"), ddata) +from dpdata.formats.deepmd.raw import * # noqa: F403 diff --git a/dpdata/dftbplus/__init__.py b/dpdata/dftbplus/__init__.py index e69de29bb..0e9a8e392 100644 --- a/dpdata/dftbplus/__init__.py +++ b/dpdata/dftbplus/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.dftbplus import * # noqa: F403 diff --git a/dpdata/dftbplus/output.py b/dpdata/dftbplus/output.py index 49fdd2b1b..133adb120 100644 --- a/dpdata/dftbplus/output.py +++ b/dpdata/dftbplus/output.py @@ -1,83 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_dftb_plus( - fn_1: FileType, fn_2: FileType -) -> tuple[str, np.ndarray, float, np.ndarray]: - """Read from DFTB+ input and output. 
- - Parameters - ---------- - fn_1 : str - DFTB+ input file name - fn_2 : str - DFTB+ output file name - - Returns - ------- - str - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - - """ - coord = None - symbols = None - forces = None - energy = None - with open_file(fn_1) as f: - flag = 0 - for line in f: - if flag == 1: - flag += 1 - elif flag == 2: - components = line.split() - flag += 1 - elif line.startswith("Geometry"): - flag = 1 - coord = [] - symbols = [] - elif flag in (3, 4, 5, 6): - s = line.split() - components_num = int(s[1]) - symbols.append(components[components_num - 1]) - coord.append([float(s[2]), float(s[3]), float(s[4])]) - flag += 1 - if flag == 7: - flag = 0 - with open_file(fn_2) as f: - flag = 0 - for line in f: - if line.startswith("Total Forces"): - flag = 8 - forces = [] - elif flag in (8, 9, 10, 11): - s = line.split() - forces.append([float(s[1]), float(s[2]), float(s[3])]) - flag += 1 - if flag == 12: - flag = 0 - elif line.startswith("Total energy:"): - s = line.split() - energy = float(s[2]) - flag = 0 - - symbols = np.array(symbols) - forces = np.array(forces) - coord = np.array(coord) - assert coord.shape == forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.dftbplus.output import * # noqa: F403 diff --git a/dpdata/fhi_aims/__init__.py b/dpdata/fhi_aims/__init__.py old mode 100755 new mode 100644 index e69de29bb..ac6ca3623 --- a/dpdata/fhi_aims/__init__.py +++ b/dpdata/fhi_aims/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.fhi_aims import * # noqa: F403 diff --git a/dpdata/fhi_aims/output.py b/dpdata/fhi_aims/output.py old mode 100755 new mode 100644 index 762e8bf4d..206a23634 --- a/dpdata/fhi_aims/output.py +++ b/dpdata/fhi_aims/output.py @@ -1,204 +1,3 @@ from __future__ import annotations -import re -import warnings - -import numpy as np - -latt_patt = 
r"\|\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)\s+([0-9]{1,}[.][0-9]*)" -pos_patt_first = r"\|\s+[0-9]{1,}[:]\s\w+\s(\w+)(\s.*[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)(\s+[-]?[0-9]{1,}[.][0-9]*)" -pos_patt_other = r"\s+[a][t][o][m]\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+([-]?[0-9]{1,}[.][0-9]*)\s+(\w{1,2})" -force_patt = r"\|\s+[0-9]{1,}\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})" -eng_patt = r"Total energy uncorrected.*([-]?[0-9]{1,}[.][0-9]*[E][+-][0-9]{1,})\s+eV" -# atom_numb_patt=r"Number of atoms.*([0-9]{1,})" - -debug = False - - -def get_info(lines, type_idx_zero=False): - atom_types = [] - atom_names = [] - cell = [] - atom_numbs = None - _atom_names = [] - - contents = "\n".join(lines) - # cell - # _tmp=re.findall(latt_patt,contents) - # for ii in _tmp: - # vect=[float(kk) for kk in ii] - # cell.append(vect) - # ------------------ - for ln, l in enumerate(lines): - if l.startswith(" | Unit cell"): - break - _tmp = lines[ln + 1 : ln + 4] - for ii in _tmp: - v_str = ii.split("|")[1].split() - vect = [float(kk) for kk in v_str] - cell.append(vect) - - _tmp = re.findall(pos_patt_first, contents) - for ii in _tmp: - _atom_names.append(ii[0]) - atom_names = [] - for ii in _atom_names: - if ii not in atom_names: - atom_names.append(ii) - - atom_numbs = [_atom_names.count(ii) for ii in atom_names] - if type_idx_zero: - type_map = dict(zip(atom_names, range(len(atom_names)))) - else: - type_map = dict(zip(atom_names, range(1, len(atom_names) + 1))) - atom_types = list(map(lambda k: type_map[k], _atom_names)) - assert atom_numbs is not None, "cannot find ion type info in aims output" - - return [cell, atom_numbs, atom_names, atom_types] - - -def get_fhi_aims_block(fp): - blk = [] - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if "Begin self-consistency loop: Re-initialization" in ii: - return blk - return blk - - -def 
get_frames(fname, md=True, begin=0, step=1, convergence_check=True): - fp = open(fname) - blk = get_fhi_aims_block(fp) - ret = get_info(blk, type_idx_zero=True) - - cell, atom_numbs, atom_names, atom_types = ret[0], ret[1], ret[2], ret[3] - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if debug: - with open(str(cc), "w") as f: - f.write("\n".join(blk)) - if cc >= begin and (cc - begin) % step == 0: - if cc == 0: - coord, _cell, energy, force, virial, is_converge = analyze_block( - blk, first_blk=True, md=md - ) - else: - coord, _cell, energy, force, virial, is_converge = analyze_block( - blk, first_blk=False - ) - if len(coord) == 0: - break - if is_converge or not convergence_check: - all_coords.append(coord) - - if _cell: - all_cells.append(_cell) - else: - all_cells.append(cell) - - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_fhi_aims_block(fp) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - fp.close() - return ( - atom_names, - atom_numbs, - np.array(atom_types), - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, first_blk=False, md=True): - coord = [] - cell = [] - energy = None - force = [] - virial = None - atom_names = [] - _atom_names = [] - - contents = "\n".join(lines) - try: - natom = int(re.findall("Number of atoms.*([0-9]{1,})", lines)[0]) - except Exception: - natom = 0 - - if first_blk: - if md: - _tmp = re.findall(pos_patt_other, contents)[:] - for ii in _tmp[slice(int(len(_tmp) / 2), len(_tmp))]: - coord.append([float(kk) for kk in ii[:-1]]) - else: - _tmp = re.findall(pos_patt_first, contents) - for ii in _tmp: - coord.append([float(kk) for kk in ii[1:]]) - else: - _tmp = re.findall(pos_patt_other, contents) - for ii in _tmp: - coord.append([float(kk) for kk in ii[:-1]]) - - _tmp = re.findall(force_patt, contents) - for ii in _tmp: - force.append([float(kk) for kk in ii]) - - if "Self-consistency cycle converged" in contents: - is_converge = True - else: - is_converge = False - - try: - _eng_patt = re.compile(eng_patt) - energy = float(_eng_patt.search(contents).group().split()[-2]) - except Exception: - energy = None - - if not energy: - is_converge = False - - if energy: - assert (force is not None) and len(coord) > 0 - - return coord, cell, energy, force, virial, is_converge - - -if __name__ == "__main__": - import sys - - ret = get_frames(sys.argv[1], begin=0, step=1) - print(ret) +from dpdata.formats.fhi_aims.output import * # noqa: F403 diff --git a/dpdata/formats/__init__.py b/dpdata/formats/__init__.py new file mode 100644 index 000000000..536b023e3 --- /dev/null +++ b/dpdata/formats/__init__.py @@ -0,0 +1 @@ +# Format modules for dpdata diff 
--git a/dpdata/formats/abacus/__init__.py b/dpdata/formats/abacus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/abacus/md.py b/dpdata/formats/abacus/md.py new file mode 100644 index 000000000..8df156c94 --- /dev/null +++ b/dpdata/formats/abacus/md.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +import os +import warnings + +import numpy as np + +from dpdata.utils import open_file + +from .scf import ( + bohr2ang, + get_geometry_in, + get_mag_force, + kbar2evperang3, +) +from .stru import get_frame_from_stru + +# Read in geometries from an ABACUS MD trajectory. +# The atomic coordinates are read in from generated files in OUT.XXXX. +# Energies, forces +# IMPORTANT: the program defaultly takes STRU input file as standard cell information, +# therefore the direct and cartesan coordinates read could be different from the ones in +# the output cif files!!! +# It is highly recommanded to use ORTHOGANAL coordinates in STRU file if you wish to get +# same coordinates in both dpdata and output cif files. + + +def get_path_out(fname, inlines): + # This function is different from the same-name function in scf.py. + # This function returns OUT.XXXX's base directory. + path_out = os.path.join(fname, "OUT.ABACUS/") + for line in inlines: + if len(line) > 0 and "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + path_out = os.path.join(fname, f"OUT.{suffix}/") + break + return path_out + + +def get_coord_dump_freq(inlines): + for line in inlines: + if len(line) > 0 and "md_dumpfreq" in line and "md_dumpfreq" == line.split()[0]: + return int(line.split()[1]) + return 1 + + +def get_coords_from_dump(dumplines, natoms): + nlines = len(dumplines) + total_natoms = sum(natoms) + # The output of VIRIAL, FORCE, and VELOCITY are controlled by INPUT parameters dump_virial, dump_force, and dump_vel, respectively. + # So the search of keywords can determine whether these datas are printed into MD_dump. 
+ calc_stress = False + calc_force = False + check_line = 6 + if "VIRIAL" in dumplines[6]: + calc_stress = True + check_line = 10 + assert "POSITION" in dumplines[check_line], ( + "keywords 'POSITION' cannot be found in the 6th line. Please check." + ) + if "FORCE" in dumplines[check_line]: + calc_force = True + + nframes_dump = -1 + if calc_stress: + nframes_dump = int(nlines / (total_natoms + 13)) + else: + nframes_dump = int(nlines / (total_natoms + 9)) + assert nframes_dump > 0, ( + "Number of lines in MD_dump file = %d. Number of atoms = %d. The MD_dump file is incomplete." # noqa: UP031 + % (nlines, total_natoms) + ) + cells = np.zeros([nframes_dump, 3, 3]) + stresses = np.zeros([nframes_dump, 3, 3]) + forces = np.zeros([nframes_dump, total_natoms, 3]) + coords = np.zeros([nframes_dump, total_natoms, 3]) + iframe = 0 + for iline in range(nlines): + if "MDSTEP" in dumplines[iline]: + # read in LATTICE_CONSTANT + # for abacus version >= v3.1.4, the unit is angstrom, and "ANGSTROM" is added at the end + # for abacus version < v3.1.4, the unit is bohr + celldm = float(dumplines[iline + 1].split()[1]) + newversion = True + if "Angstrom" not in dumplines[iline + 1]: + celldm *= bohr2ang # transfer unit to ANGSTROM + newversion = False + + # read in LATTICE_VECTORS + for ix in range(3): + cells[iframe, ix] = ( + np.array([float(i) for i in dumplines[iline + 3 + ix].split()[0:3]]) + * celldm + ) + if calc_stress: + stresses[iframe, ix] = np.array( + [float(i) for i in dumplines[iline + 7 + ix].split()[0:3]] + ) + + if calc_stress: + skipline = 11 + else: + skipline = 7 + + for iat in range(total_natoms): + # INDEX LABEL POSITION (Angstrom) FORCE (eV/Angstrom) VELOCITY (Angstrom/fs) + # 0 Sn 0.000000000000 0.000000000000 0.000000000000 -0.000000000000 -0.000000000001 -0.000000000001 0.001244557166 -0.000346684288 0.000768457739 + # 1 Sn 0.000000000000 3.102800034079 3.102800034079 -0.000186795145 -0.000453823768 -0.000453823768 0.000550996187 -0.000886442775 
0.001579501983 + # for abacus version >= v3.1.4, the value of POSITION is the real cartessian position, and unit is angstrom, and if cal_force the VELOCITY is added at the end. + # for abacus version < v3.1.4, the real position = POSITION * celldm + coords[iframe, iat] = np.array( + [float(i) for i in dumplines[iline + skipline + iat].split()[2:5]] + ) + + if not newversion: + coords[iframe, iat] *= celldm + + if calc_force: + forces[iframe, iat] = np.array( + [ + float(i) + for i in dumplines[iline + skipline + iat].split()[5:8] + ] + ) + iframe += 1 + assert iframe == nframes_dump, ( + "iframe=%d, nframe_dump=%d. Number of frames does not match number of lines in MD_dump." # noqa: UP031 + % (iframe, nframes_dump) + ) + stresses *= kbar2evperang3 + return coords, cells, forces, stresses + + +def get_energy(outlines, ndump, dump_freq): + energy = [] + nenergy = 0 + for line_idx, line in enumerate(outlines): + if "final etot is" in line or "#TOTAL ENERGY#" in line: + if nenergy % dump_freq == 0: + energy.append(float(line.split()[-2])) + nenergy += 1 + elif "!! convergence has not been achieved" in line: + if nenergy % dump_freq == 0: + energy.append(np.nan) + nenergy += 1 + assert ndump == len(energy), ( + "Number of total energies in running_md.log = %d. Number of frames in MD_dump = %d. Please check." 
# noqa: UP031 + % (len(energy), ndump) + ) + energy = np.array(energy) + return energy + + +def get_frame(fname): + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU + path_out = get_path_out(fname, inlines) + + data = get_frame_from_stru(geometry_path_in) + natoms = data["atom_numbs"] + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + + # This coords is not to be used. + dump_freq = get_coord_dump_freq(inlines=inlines) + # ndump = int(os.popen("ls -l %s | grep 'md_pos_' | wc -l" %path_out).readlines()[0]) + # number of dumped geometry files + # coords = get_coords_from_cif(ndump, dump_freq, atom_names, natoms, types, path_out, cell) + with open_file(os.path.join(path_out, "MD_dump")) as fp: + dumplines = fp.read().split("\n") + coords, cells, force, stress = get_coords_from_dump(dumplines, natoms) + ndump = np.shape(coords)[0] + with open_file(os.path.join(path_out, "running_md.log")) as fp: + outlines = fp.read().split("\n") + energy = get_energy(outlines, ndump, dump_freq) + + unconv_stru = "" + for i, iene in enumerate(energy): + if np.isnan(iene): + coords = np.delete(coords, i - ndump, axis=0) + cells = np.delete(cells, i - ndump, axis=0) + force = np.delete(force, i - ndump, axis=0) + stress = np.delete(stress, i - ndump, axis=0) + energy = np.delete(energy, i - ndump, axis=0) + unconv_stru += "%d " % i # noqa: UP031 + ndump = len(energy) + if unconv_stru != "": + warnings.warn(f"Structure {unconv_stru} are unconverged and not collected!") + + for iframe in range(ndump): + stress[iframe] *= np.linalg.det(cells[iframe, :, :].reshape([3, 3])) + if np.sum(np.abs(stress[0])) < 1e-10: + stress = None + + magmom, 
magforce = get_mag_force(outlines) + + data["cells"] = cells + # for idx in range(ndump): + # data['cells'][:, :, :] = cell + data["coords"] = coords + data["energies"] = energy + data["forces"] = force + data["virials"] = stress + if not isinstance(data["virials"], np.ndarray): + del data["virials"] + data["orig"] = np.zeros(3) + if len(magmom) > 0: + data["spins"] = magmom + if len(magforce) > 0: + data["force_mags"] = magforce + + # need to expand the move. + if "move" in data: + data["move"] = [data["move"][0] for i in range(ndump)] + + return data diff --git a/dpdata/formats/abacus/relax.py b/dpdata/formats/abacus/relax.py new file mode 100644 index 000000000..db60412b8 --- /dev/null +++ b/dpdata/formats/abacus/relax.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import glob +import os + +import numpy as np + +from dpdata.utils import open_file + +from .scf import ( + bohr2ang, + collect_force, + collect_stress, + get_geometry_in, + get_mag_force, + kbar2evperang3, +) +from .stru import get_frame_from_stru + +# Read in geometries from an ABACUS RELAX(CELL-RELAX) trajectory in OUT.XXXX/runnning_relax/cell-relax.log. + + +def get_log_file(fname, inlines): + suffix = "ABACUS" + calculation = "scf" + for line in inlines: + if "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + elif "calculation" in line and "calculation" == line.split()[0]: + calculation = line.split()[1] + logf = os.path.join(fname, f"OUT.{suffix}/running_{calculation}.log") + return logf + + +def get_relax_stru_files(output_dir): + """Find the STRU files in the output directory. 
+ + Args: + output_dir (str): output directory + + Returns + ------- + strus: list of STRU files + example: + ["STRU_ION1_D", "STRU_ION2_D"] + """ + return glob.glob(os.path.join(output_dir, "STRU_ION*_D")) + + +def get_coords_from_log(loglines, natoms, stru_files=None): + """NOTICE: unit of coords and cells is Angstrom + order: + coordinate + cell (no output if cell is not changed) + energy (no output, if SCF is not converged) + force (no output, if cal_force is not setted or abnormal ending) + stress (no output, if set cal_stress is not setted or abnormal ending). + """ + natoms_log = 0 + for line in loglines: + if line[13:41] == "number of atom for this type": + natoms_log += int(line.split()[-1]) + + assert natoms_log > 0 and natoms_log == natoms, ( + f"ERROR: detected atom number in log file is {natoms_log}, while the atom number in STRU file is {natoms}" + ) + + energy = [] + cells = [] + coords = [] + coord_direct = [] # if the coordinate is direct type or not + + for i in range(len(loglines)): + line = loglines[i] + if line[18:41] == "lattice constant (Bohr)": + a0 = float(line.split()[-1]) + elif len(loglines[i].split()) >= 2 and loglines[i].split()[1] == "COORDINATES": + # read coordinate information + coords.append([]) + direct_coord = False + if loglines[i].split()[0] == "DIRECT": + coord_direct.append(True) + for k in range(2, 2 + natoms): + coords[-1].append( + list(map(lambda x: float(x), loglines[i + k].split()[1:4])) + ) + elif loglines[i].split()[0] == "CARTESIAN": + coord_direct.append(False) + for k in range(2, 2 + natoms): + coords[-1].append( + list( + map( + lambda x: float(x) * a0 * bohr2ang, + loglines[i + k].split()[1:4], + ) + ) + ) + else: + assert False, "Unrecongnized coordinate type, %s, line:%d" % ( # noqa: UP031 + loglines[i].split()[0], + i, + ) + + elif ( + loglines[i][1:56] + == "Lattice vectors: (Cartesian coordinate: in unit of a_0)" + ): + # add the cell information for previous structures + while len(cells) < len(coords) - 1: 
+ cells.append(cells[-1]) + # get current cell information + cells.append([]) + for k in range(1, 4): + cells[-1].append( + list( + map( + lambda x: float(x) * a0 * bohr2ang, + loglines[i + k].split()[0:3], + ) + ) + ) + + elif line[1:14] == "final etot is" or "#TOTAL ENERGY#" in line: + # add the energy for previous structures whose SCF is not converged + while len(energy) < len(coords) - 1: + energy.append(np.nan) + # get the energy of current structure + energy.append(float(line.split()[-2])) + + # in some relax method (like: bfgs_trad), the coordinate is not outputed in running_relax.log + # but if out_stru is true, then STRU_ION*_D will be outputed in OUT.ABACUS + # we should read cell and coord from STRU_ION*_D files + if len(energy) > 1 and len(coords) == 1: + # the energies of all structrues are collected, but coords have only the first structure + if ( + stru_files is not None and len(stru_files) > 1 + ): # if stru_files are not only STRU_ION_D + stru_file_name = [os.path.basename(i) for i in stru_files] + coords = coords[:1] + [np.nan for i in range(len(energy) - 1)] + coord_direct = coord_direct[:1] + [False for i in range(len(energy) - 1)] + cells = cells[:1] + [np.nan for i in range(len(energy) - 1)] + for iframe in range(1, len(energy)): + if f"STRU_ION{iframe}_D" in stru_file_name: + # read the structure from STRU_ION*_D + stru_data = get_frame_from_stru( + stru_files[stru_file_name.index(f"STRU_ION{iframe}_D")] + ) + coords[iframe] = stru_data["coords"][0] + cells[iframe] = stru_data["cells"][0] + + force = collect_force(loglines) + stress = collect_stress(loglines) + + # delete last structures which has no energy + while len(energy) < len(coords): + del coords[-1] + del coord_direct[-1] + + # add cells for last structures whose cell is not changed + while len(cells) < len(coords): + cells.append(cells[-1]) + + # only keep structures that have all of coord, force and stress + if len(stress) == 0 and len(force) == 0: + minl = len(coords) + elif 
len(stress) == 0: + minl = min(len(coords), len(force)) + force = force[:minl] + elif len(force) == 0: + minl = min(len(coords), len(stress)) + stress = stress[:minl] + else: + minl = min(len(coords), len(force), len(stress)) + force = force[:minl] + stress = stress[:minl] + + coords = coords[:minl] + energy = energy[:minl] + cells = cells[:minl] + + # delete structures whose energy is np.nan + for i in range(minl): + if ( + np.isnan(energy[i - minl]) + or np.any(np.isnan(coords[i - minl])) + or np.any(np.isnan(cells[i - minl])) + ): + del energy[i - minl] + del coords[i - minl] + del cells[i - minl] + del coord_direct[i - minl] + if len(force) > 0: + del force[i - minl] + if len(stress) > 0: + del stress[i - minl] + + energy = np.array(energy) + cells = np.array(cells) + coords = np.array(coords) + stress = np.array(stress) + force = np.array(force) + + # transfer direct coordinate to cartessian type + for i in range(len(coords)): + if coord_direct[i]: + coords[i] = coords[i].dot(cells[i]) + + if len(stress) > 0: + virial = np.zeros([len(cells), 3, 3]) + for i in range(len(cells)): + volume = np.linalg.det(cells[i, :, :].reshape([3, 3])) + virial[i] = stress[i] * kbar2evperang3 * volume + else: + virial = None + + return energy, cells, coords, force, stress, virial + + +def get_frame(fname): + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + geometry_path_in = get_geometry_in(fname, inlines) # base dir of STRU + + data = get_frame_from_stru(geometry_path_in) + natoms = sum(data["atom_numbs"]) + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + + logf = get_log_file(fname, inlines) + assert os.path.isfile(logf), f"Error: can not find {logf}" + with open_file(logf) as f1: + lines = 
f1.readlines() + + relax_stru_files = get_relax_stru_files(os.path.dirname(logf)) + + energy, cells, coords, force, stress, virial = get_coords_from_log( + lines, natoms, stru_files=relax_stru_files + ) + + magmom, magforce = get_mag_force(lines) + + data["cells"] = cells + data["coords"] = coords + data["energies"] = energy + data["forces"] = force + if isinstance(virial, np.ndarray): + data["virials"] = virial + data["stress"] = stress + data["orig"] = np.zeros(3) + + if len(magmom) > 0: + data["spins"] = magmom + if len(magforce) > 0: + data["force_mags"] = magforce + if "move" in data: + data["move"] = [data["move"][0] for i in range(len(data["energies"]))] + + return data diff --git a/dpdata/formats/abacus/scf.py b/dpdata/formats/abacus/scf.py new file mode 100644 index 000000000..991396b65 --- /dev/null +++ b/dpdata/formats/abacus/scf.py @@ -0,0 +1,255 @@ +from __future__ import annotations + +import os +import re +import warnings + +import numpy as np + +from dpdata.utils import open_file + +from ...unit import LengthConversion, PressureConversion +from .stru import get_frame_from_stru + +bohr2ang = LengthConversion("bohr", "angstrom").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() + + +def CheckFile(ifile): + if not os.path.isfile(ifile): + print(f"Can not find file {ifile}") + return False + return True + + +def get_geometry_in(fname, inlines): + geometry_path_in = os.path.join(fname, "STRU") + for line in inlines: + if "stru_file" in line and "stru_file" == line.split()[0]: + atom_file = line.split()[1] + geometry_path_in = os.path.join(fname, atom_file) + break + return geometry_path_in + + +def get_path_out(fname, inlines): + path_out = os.path.join(fname, "OUT.ABACUS/running_scf.log") + for line in inlines: + if "suffix" in line and "suffix" == line.split()[0]: + suffix = line.split()[1] + path_out = os.path.join(fname, f"OUT.{suffix}/running_scf.log") + break + return path_out + + +def get_energy(outlines): + Etot = None 
+ for line in reversed(outlines): + if "final etot is" in line: # for LTS + Etot = float(line.split()[-2]) # in eV + return Etot, True + elif "TOTAL ENERGY" in line: # for develop + Etot = float(line.split()[-2]) # in eV + return Etot, True + elif "convergence has NOT been achieved!" in line: + return Etot, False + elif "convergence has not been achieved" in line: + return Etot, False + + return Etot, False + + +def collect_force(outlines): + force = [] + for i, line in enumerate(outlines): + # if "TOTAL-FORCE (eV/Angstrom)" in line: + if "TOTAL-FORCE" in line: + value_pattern = re.compile( + r"^\s*[A-Z][a-z]?[1-9][0-9]*\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" + ) + j = i + # find the first line of force + noforce = False + while not value_pattern.match(outlines[j]): + j += 1 + if ( + j >= i + 10 + ): # if can not find the first line of force in 10 lines, then stop + warnings.warn("Warning: can not find the first line of force") + noforce = True + break + if noforce: + break + + force.append([]) + while value_pattern.match(outlines[j]): + force[-1].append([float(ii) for ii in outlines[j].split()[1:4]]) + j += 1 + return force # only return the last force + + +def get_force(outlines, natoms): + force = collect_force(outlines) + if len(force) == 0: + return None + else: + return np.array(force[-1]) # only return the last force + + +def collect_stress(outlines): + stress = [] + for i, line in enumerate(outlines): + # if "TOTAL-STRESS (KBAR)" in line: + if "TOTAL-STRESS" in line: + value_pattern = re.compile( + r"^\s*[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s+[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\s*$" + ) + j = i + nostress = False + while not value_pattern.match(outlines[j]): + j += 1 + if ( + j >= i + 10 + ): # if can not find the first line of stress in 10 lines, then stop + warnings.warn("Warning: can not find the first line of stress") + 
nostress = True + break + if nostress: + break + + stress.append([]) + while value_pattern.match(outlines[j]): + stress[-1].append( + list(map(lambda x: float(x), outlines[j].split()[0:3])) + ) + j += 1 + return stress + + +def get_stress(outlines): + stress = collect_stress(outlines) + if len(stress) == 0: + return None + else: + return np.array(stress[-1]) * kbar2evperang3 # only return the last stress + + +def get_mag_force(outlines): + """Read atomic magmom and magnetic force from OUT.ABACUS/running_scf.log. + + Returns + ------- + magmom: list of list of atomic magnetic moments (three dimensions: ION_STEP * NATOMS * 1/3) + magforce: list of list of atomic magnetic forces (three dimensions: ION_STEP * NATOMS * 1/3) + e.g.: + ------------------------------------------------------------------------------------------- + Total Magnetism (uB) + ------------------------------------------------------------------------------------------- + Fe 0.0000000001 0.0000000000 3.0000000307 + Fe -0.0000000000 -0.0000000000 3.0000001151 + ------------------------------------------------------------------------------------------- + ------------------------------------------------------------------------------------------- + Magnetic force (eV/uB) + ------------------------------------------------------------------------------------------- + Fe 0.0000000000 0.0000000000 -1.2117698671 + Fe 0.0000000000 0.0000000000 -1.2117928796 + ------------------------------------------------------------------------------------------- + + """ + mags = [] + magforces = [] + for i, line in enumerate(outlines): + if "Total Magnetism (uB)" in line: + j = i + 2 + mag = [] + while "-------------------------" not in outlines[j]: + imag = [float(ii) for ii in outlines[j].split()[1:]] + if len(imag) == 1: + imag = [0, 0, imag[0]] + mag.append(imag) + j += 1 + mags.append(mag) + if "Magnetic force (eV/uB)" in line: + j = i + 2 + magforce = [] + while "-------------------------" not in outlines[j]: + 
imagforce = [float(ii) for ii in outlines[j].split()[1:]] + if len(imagforce) == 1: + imagforce = [0, 0, imagforce[0]] + magforce.append(imagforce) + j += 1 + magforces.append(magforce) + return np.array(mags), np.array(magforces) + + +def get_frame(fname): + data = { + "atom_names": [], + "atom_numbs": [], + "atom_types": [], + "cells": np.array([]), + "coords": np.array([]), + "energies": np.array([]), + "forces": np.array([]), + } + + if isinstance(fname, str): + # if the input parameter is only one string, it is assumed that it is the + # base directory containing INPUT file; + path_in = os.path.join(fname, "INPUT") + else: + raise RuntimeError("invalid input") + + if not CheckFile(path_in): + return data + + with open_file(path_in) as fp: + inlines = fp.read().split("\n") + + geometry_path_in = get_geometry_in(fname, inlines) + + # get OUT.ABACUS/running_scf.log + path_out = get_path_out(fname, inlines) + if not (CheckFile(geometry_path_in) and CheckFile(path_out)): + return data + with open_file(path_out) as fp: + outlines = fp.read().split("\n") + + # get energy + energy, converge = get_energy(outlines) + if not converge: + return data + + # read STRU file + data = get_frame_from_stru(geometry_path_in) + natoms = sum(data["atom_numbs"]) + # should remove spins from STRU file + if "spins" in data: + data.pop("spins") + move = data.pop("move", None) + + # get magmom and magforce, force and stress + magmom, magforce = get_mag_force(outlines) + if len(magmom) > 0: + magmom = magmom[-1:] + if len(magforce) > 0: + magforce = magforce[-1:] + + force = get_force(outlines, natoms) + stress = get_stress(outlines) + + data["energies"] = np.array(energy)[np.newaxis] + data["forces"] = np.empty((0,)) if force is None else force[np.newaxis, :, :] + data["orig"] = np.zeros(3) + if stress is not None: + cell = data["cells"][0] + stress *= np.abs(np.linalg.det(cell)) + data["virials"] = stress[np.newaxis, :, :] + + if len(magmom) > 0: + data["spins"] = magmom + if 
len(magforce) > 0: + data["force_mags"] = magforce + if move is not None: + data["move"] = move + return data diff --git a/dpdata/formats/abacus/stru.py b/dpdata/formats/abacus/stru.py new file mode 100644 index 000000000..0d899695b --- /dev/null +++ b/dpdata/formats/abacus/stru.py @@ -0,0 +1,820 @@ +from __future__ import annotations + +import os +import re +import warnings + +import numpy as np + +from ...unit import LengthConversion + +bohr2ang = LengthConversion("bohr", "angstrom").value() + + +def split_stru_block(lines): + """Split the ABACUS STRU file into blocks by keyword. + + Args: + lines (list): list of lines in the ABACUS STRU file. + + Returns + ------- + dict: dictionary of blocks. + """ + + def clean_comment(line): + return re.split("[#]", line)[0] + + ABACUS_STRU_KEYS = [ + "ATOMIC_SPECIES", + "NUMERICAL_ORBITAL", + "LATTICE_CONSTANT", + "LATTICE_VECTORS", + "ATOMIC_POSITIONS", + "NUMERICAL_DESCRIPTOR", + "PAW_FILES", + ] + blocks = {i: [] for i in ABACUS_STRU_KEYS} + i = 0 + while i < len(lines): + line = clean_comment(lines[i]).strip() + if line in ABACUS_STRU_KEYS: + key = line + for j in range(i + 1, len(lines)): + if clean_comment(lines[j]).strip() == "": + continue + elif clean_comment(lines[j]).strip() in ABACUS_STRU_KEYS: + break + else: + blocks[key].append(clean_comment(lines[j])) + i = j + else: + i += 1 + + return blocks + + +def parse_atomic_species_block(lines): + """Parse the ATOMIC_SPECIES block. + + Args: + lines (list): list of lines in the ATOMIC_SPECIES block. + + Returns + ------- + tuple: tuple of atom_names, masses, and pp_files. + + """ + atom_names, masses, pp_files = [], [], [] + for line in lines: + line = line.split() + atom_names.append(line[0]) + masses.append(float(line[1])) + + # for standard STRU, the pseudo potential file is required, + # but it is not required for dpdata. 
+ if len(line) > 2: + pp_files.append(line[2]) + else: + pp_files.append(None) + + return atom_names, masses, pp_files + + +def parse_numerical_orbital_block(lines): + """Parse the NUMERICAL_ORBITAL block. + + Args: + lines (list): list of lines in the NUMERICAL_ORBITAL block. + + Returns + ------- + list: list of orbital files. + """ + return [line.strip() for line in lines] + + +def parse_lattice_constant_block(lines): + """Parse the LATTICE_CONSTANT block. + + Args: + lines (list): list of lines in the LATTICE_CONSTANT block. + + Returns + ------- + float: the lattice constant. + """ + return float(lines[0]) + + +def parse_lattice_vectors_block(lines): + """Parse the LATTICE_VECTORS block. + + Args: + lines (list): list of lines in the LATTICE_VECTORS block. + + Returns + ------- + np.ndarray: the cell vectors. + """ + cell = np.zeros((3, 3)) + for i, line in enumerate(lines): + cell[i] = [float(x) for x in line.split()] + return cell + + +def parse_pos_oneline(pos_line): + """Parses a line from the atom position block in a structure file. + + The content in atom position block can include: + - `m` or NO key word: Three numbers (0 or 1) controlling atom movement in geometry relaxation calculations. + - `v`, `vel`, or `velocity`: Three components of initial velocity of atoms in geometry relaxation calculations. + - `mag` or `magmom`: Start magnetization for each atom. Can be one number (colinear) or three numbers (non-colinear). + - `angle1`: In non-colinear case, angle between c-axis and real spin (in degrees). + - `angle2`: In non-colinear case, angle between a-axis and real spin projection in ab-plane (in degrees). + - `cs` or `constrain`: Three numbers (0 or 1) controlling the spin constraint of the atom. + - `lambda`: Three numbers controlling the lambda of the atom. + + Parameters + ---------- + pos_line : A line from the atom position block. + + Returns + ------- + tuple: A tuple containing: + - pos (list of float): The position coordinates. 
+ - move (list of int or None): Movement control values. + - velocity (list of float or None): Initial velocity components. + - magmom (float, list of float, or None): Magnetization values. + - angle1 (float or None): Angle1 value. + - angle2 (float or None): Angle2 value. + - constrain (list of bool or None): Spin constraint values. + - lambda1 (float, list of float, or None): Lambda values. + + e.g.: + ``` + Fe + 1.0 + 2 + 0.0 0.0 0.0 m 0 0 0 mag 1.0 angle1 90 angle2 0 cs 0 0 0 + 0.5 0.5 0.5 m 1 1 1 mag 1.0 angle1 90 angle2 180 + ``` + """ + pos_line = pos_line.split("#")[0] # remove comments + sline = pos_line.split() + pos = [float(i) for i in sline[:3]] + move = None + velocity = None + magmom = None + angle1 = None + angle2 = None + constrain = None + lambda1 = None + if len(sline) > 3: + mag_list = None + velocity_list = None + move_list = [] + angle1_list = None + angle2_list = None + constrain_list = None + lambda_list = None + label = "move" + for i in range(3, len(sline)): + # firstly read the label + if sline[i] == "m": + label = "move" + elif sline[i] in ["v", "vel", "velocity"]: + label = "velocity" + velocity_list = [] + elif sline[i] in ["mag", "magmom"]: + label = "magmom" + mag_list = [] + elif sline[i] == "angle1": + label = "angle1" + angle1_list = [] + elif sline[i] == "angle2": + label = "angle2" + angle2_list = [] + elif sline[i] in ["constrain", "sc"]: + label = "constrain" + constrain_list = [] + elif sline[i] in ["lambda"]: + label = "lambda" + lambda_list = [] + + # the read the value to the list + elif label == "move": + move_list.append(int(sline[i])) + elif label == "velocity": + velocity_list.append(float(sline[i])) + elif label == "magmom": + mag_list.append(float(sline[i])) + elif label == "angle1": + angle1_list.append(float(sline[i])) + elif label == "angle2": + angle2_list.append(float(sline[i])) + elif label == "constrain": + constrain_list.append(bool(int(sline[i]))) + elif label == "lambda": + 
def get_atom_mag_cartesian(atommag, angle1, angle2):
    """Transform atommag, angle1, angle2 to magmom in cartesian coordinates.

    Parameters
    ----------
    atommag : float/list of float/None
        Atom magnetic moment.
    angle1 : float/None
        Angle between the z-axis and the real spin, in degrees.
    angle2 : float/None
        Angle between the x-axis and the spin projection in the xy-plane,
        in degrees.

    ABACUS supports defining mag, angle1 and angle2 at the same time.
    Without angles, mag is returned directly (a scalar is taken along z).
    With angles, mag only fixes the norm and the direction comes from
    angle1/angle2.
    """
    if atommag is None:
        return None
    if not isinstance(atommag, (list, float)):
        raise RuntimeError(f"Invalid atommag: {atommag}")

    if angle1 is None and angle2 is None:
        # no direction given: a scalar moment is taken along z
        return atommag if isinstance(atommag, list) else [0, 0, atommag]

    theta = np.radians(angle1 if angle1 is not None else 0)
    phi = np.radians(angle2 if angle2 is not None else 0)
    norm = np.linalg.norm(atommag) if isinstance(atommag, list) else atommag
    return [
        norm * np.sin(theta) * np.cos(phi),
        norm * np.sin(theta) * np.sin(phi),
        norm * np.cos(theta),
    ]


def get_cartesian_coords(coords, coord_type, celldm, cell):
    """Transform the atomic coordinates to cartesian coordinates.

    Args:
        coords (np.ndarray): atomic coordinates read from the STRU file.
        coord_type (str): the coordination type, either "cartesian" or "direct".
        celldm (float): the lattice constant.
        cell (np.ndarray): the cell vectors in angstrom.

    Returns
    -------
    np.ndarray: the cartesian coordinates in angstrom.
    """
    if coord_type == "cartesian":
        # STRU cartesian coordinates are in units of celldm (Bohr)
        return coords * celldm * bohr2ang
    if coord_type == "direct":
        return np.matmul(coords, cell)
    raise RuntimeError(f"Invalid coordination type: {coord_type}")
+ + Returns + ------- + tuple: tuple of atom_numbs, coords, move, mags, velocity, sc, lambda_ + Note: for atomic magnetic moment, we finnaly transform it to non-collinear magnetic moment in cartesian coordinates, + and do not return the angle1 and angle2, and the magnetic moment of each atom type. + + """ + coord_type = coords_lines[0].split()[0].lower() # cartisan or direct + atom_numbs = [] # the number of each atom type + coords = [] # coordinations of atoms + move = [] # move flag of each atom + velocity = [] # velocity of each atom + mags = [] # magnetic moment of each atom + sc = [] # spin constraint flag of each atom + lambda_ = [] # lambda of each atom + + ntype = len(atom_names) + line_idx = 1 # starting line of first element + define_atom_mag = False + for it in range(ntype): + atom_name = coords_lines[line_idx].split()[0] + if atom_name != atom_names[it]: + raise RuntimeError( + f"Read atom name '{atom_name}' is not equal to the expected atom name '{atom_names[it]}'" + ) + atom_type_mag = float(coords_lines[line_idx + 1].split()[0]) + line_idx += 2 + atom_numbs.append(int(coords_lines[line_idx].split()[0])) + line_idx += 1 + for iline in range(atom_numbs[it]): + pos, imove, ivelocity, imagmom, iangle1, iangle2, iconstrain, ilambda1 = ( + parse_pos_oneline(coords_lines[line_idx]) + ) + + coords.append(get_cartesian_coords(np.array(pos), coord_type, celldm, cell)) + + move.append(imove) + velocity.append(ivelocity) + sc.append(iconstrain) + lambda_.append(ilambda1) + + # calculate the magnetic moment in cartesian coordinates + mag = get_atom_mag_cartesian(imagmom, iangle1, iangle2) + if mag is None: + mag = [0, 0, atom_type_mag] + mags.append(mag) + + if imagmom is not None: + define_atom_mag = True + + line_idx += 1 + coords = np.array(coords) # need transformation!!! 
+ + if all([i is None for i in move]): + move = [] + else: + move = np.array(move, dtype=bool) + + if all([i is None for i in velocity]): + velocity = [] + else: + velocity = np.array(velocity) + + if all([i is None for i in sc]): + sc = [] + + if all([i is None for i in lambda_]): + lambda_ = [] + + # here return the magnetic moment only when the atom magnetic moment is specified. + if not define_atom_mag: + mags = [] + else: + mags = np.array(mags) + + return atom_numbs, coords, move, mags, velocity, sc, lambda_ + + +def right_hand_rule( + cell: np.ndarray, coord: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """Rotate the cell and coord to make the cell fit the right-hand rule. + + Args: + cell (np.ndarray): the cell vectors. + coord (np.ndarray): the atomic coordinates in cartesian. + + Returns + ------- + tuple: the rotated cell and coord. + """ + if np.linalg.det(cell) < 0: + cell = -cell + coord = -coord + return cell, coord + + +def get_frame_from_stru(stru): + """Read the ABACUS STRU file and return the dpdata frame. + + The description of ABACUS STRU can be found in https://abacus.deepmodeling.com/en/latest/advanced/input_files/stru.html + + Args: + stru (str): path to the ABACUS STRU file. + + Returns + ------- + data: the parsed stru information in dictionary. + { + "atom_names": list of atom names, + "atom_numbs": list of atom numbers, + "atom_types": list of atom types, + "masses": list of atomic masses, + "pp_files", list of pseudo potential files, + "orb_files", list of orbital files, + "dpks_descriptor": the deepks descriptor file, + + # below are the information in each frame + + "cells": list of cell vectors, + "coords": list of atomic coordinates, + "spins": list of magnetic moments, # return only when set "mag xxx" for each atom in STRU file + "moves": list of move flags, + } + For some keys, if the information is not provided in the STRU file, then it will not be included in the dictionary. 
+ "spins" is designed for delta spin calculation, and when dpdata.System is write to lmp format, the spin will be written as magmom. + But we should note that this file format is valid only for a spin lammps job, not for a normal job. + If you want to use dpgen to run the non-spin job, then you should not define "mag x x x" in the STRU file. + """ + if not os.path.isfile(stru): + raise FileNotFoundError(f"ABACUS STRU file {stru} not found!!!") + + # 1. read the file and split the lines to blocks + with open(stru) as f: + lines = f.readlines() + blocks = split_stru_block(lines) + + # 2. parse the blocks + atom_names, masses, pp_files = parse_atomic_species_block(blocks["ATOMIC_SPECIES"]) + orb_files = parse_numerical_orbital_block(blocks.get("NUMERICAL_ORBITAL", [])) + dpks_descriptor = blocks.get("NUMERICAL_DESCRIPTOR", []) + celldm = parse_lattice_constant_block(blocks["LATTICE_CONSTANT"]) + cell = parse_lattice_vectors_block(blocks["LATTICE_VECTORS"]) + cell = np.array(cell) * celldm * bohr2ang + atom_numbs, coords, move, mags, velocity, sc, lambda_ = parse_pos( + blocks["ATOMIC_POSITIONS"], atom_names, celldm, cell + ) + + cell, coords = right_hand_rule(cell, coords) + data = { + "atom_names": atom_names, + "atom_numbs": atom_numbs, + "atom_types": np.array( + [i for i in range(len(atom_numbs)) for j in range(atom_numbs[i])] + ), + "masses": np.array(masses), + "pp_files": pp_files, + "cells": np.array([cell]), + "coords": np.array([coords]), + } + if len(mags) > 0: + data["spins"] = np.array([mags]) + if len(orb_files) > 0: + data["orb_files"] = orb_files + if len(dpks_descriptor) > 0: + data["dpks_descriptor"] = dpks_descriptor[0].strip() + if len(move) > 0: + data["move"] = np.array([move]) + + return data + + +def make_unlabeled_stru( + data, + frame_idx, + pp_file=None, + numerical_orbital=None, + numerical_descriptor=None, + mass=None, + move=None, + velocity=None, + mag=None, + angle1=None, + angle2=None, + sc=None, + lambda_=None, + link_file=False, + 
dest_dir=None, + **kwargs, +): + """Make an unlabeled STRU file from a dictionary. + + Parameters + ---------- + data : dict + System data + frame_idx : int + The index of the frame to dump + pp_file : list of string or dict + List of pseudo potential files, or a dictionary of pseudo potential files for each atomnames + numerical_orbital : list of string or dict, optional + List of orbital files, or a dictionary of orbital files for each atomnames + numerical_descriptor : str, optional + numerical descriptor file + mass : list of float, optional + List of atomic masses + move : list of (list of list of bool), optional + List of the move flag of each xyz direction of each atom for each frame + velocity : list of list of float, optional + List of the velocity of each xyz direction of each atom + mag : list of (list of float or float), optional + List of the magnetic moment of each atom, can be a list of three floats or one float + For noncollinear, three floats are the xyz component of the magnetic moment. + For collinear, one float is the norm of the magnetic moment. + angle1 : list of float, optional + List of the angle1 of each atom. For noncollinear calculation, it is the angle between the magnetic moment and the z-axis. + angle2 : list of float, optional + List of the angle2 of each atom. For noncollinear calculation, it is the angle between the projection of magnetic moment on xy plane and the x-axis. + sc : list of (bool or list of 3 bool), optional + List of the spin constraint flag of each atom. Each element can be a bool or a list of three bools or None. + lambda_ : list of (float or list of 3 float), optional + List of the lambda of each atom. Each element can be a float or a list of three floats. + link_file : bool, optional + Whether to link the pseudo potential files and orbital files in the STRU file. + If True, then only filename will be written in the STRU file, and make a soft link to the real file. 
+ dest_dir : str, optional + The destination directory to make the soft link of the pseudo potential files and orbital files. + For velocity, mag, angle1, angle2, sc, and lambda_, if the value is None, then the corresponding information will not be written. + ABACUS support defining "mag" and "angle1"/"angle2" at the same time, and in this case, the "mag" only define the norm of the magnetic moment, and "angle1" and "angle2" define the direction of the magnetic moment. + If data has spins, then it will be written as mag to STRU file; while if mag is passed at the same time, then mag will be used. + """ + + def _link_file(dest_dir, src_file): + if not os.path.isfile(src_file): + print(f"ERROR: link_file: {src_file} is not a file.") + return False + src_file = os.path.abspath(src_file) + if not os.path.isdir(dest_dir): + os.makedirs(dest_dir) + dest_file = os.path.join(dest_dir, os.path.basename(src_file)) + if os.path.isfile(dest_file): + if os.path.samefile(src_file, dest_file): + return True + else: + os.remove(dest_file) + os.symlink(src_file, dest_file) + return True + + def ndarray2list(i): + if isinstance(i, np.ndarray): + return i.tolist() + else: + return i + + def process_file_input(file_input, atom_names, input_name): + # For pp_file and numerical_orbital, process the file input, and return a list of file names + # file_input can be a list of file names, or a dictionary of file names for each atom names + if isinstance(file_input, (list, tuple)): + if len(file_input) != len(atom_names): + raise ValueError( + f"{input_name} length is not equal to the number of atom types" + ) + return file_input + elif isinstance(file_input, dict): + for element in atom_names: + if element not in file_input: + raise KeyError(f"{input_name} does not contain {element}") + return [file_input[element] for element in atom_names] + else: + raise ValueError(f"Invalid {input_name}: {file_input}") + + if link_file and dest_dir is None: + print( + "WARNING: make_unlabeled_stru: 
link_file is True, but dest_dir is None. Will write the filename to STRU but not making soft link." + ) + if dest_dir is not None and dest_dir.strip() == "": + dest_dir = "." + + # check the input data + if mass is None and data.get("masses") is not None and len(data["masses"]) > 0: + mass = data["masses"] + + if ( + pp_file is None + and data.get("pp_files") is not None + and len(data["pp_files"]) > 0 + ): + pp_file = data["pp_files"] + + if ( + numerical_orbital is None + and data.get("orb_files") is not None + and len(data["orb_files"]) > 0 + ): + numerical_orbital = data["orb_files"] + + if numerical_descriptor is None and data.get("dpks_descriptor") is not None: + numerical_descriptor = data["dpks_descriptor"] + + if mag is None and data.get("spins") is not None and len(data["spins"]) > 0: + mag = data["spins"][frame_idx] + + if move is None and data.get("move", None) is not None and len(data["move"]) > 0: + move = data["move"][frame_idx] + + # check the length of the input data + atom_numbs = sum(data["atom_numbs"]) + for key in [move, velocity, mag, angle1, angle2, sc, lambda_]: + if key is not None: + if ( + not isinstance(ndarray2list(key), (list, tuple)) + and len(key) != atom_numbs + ): + key_name = [name for name, value in locals().items() if value is key][0] + print( + f"ERROR: make_unlabeled_stru: the length of '{key_name}' ({len(key)}) should be equal to the number of atom number ({atom_numbs})." + ) + return "" + + # ATOMIC_SPECIES block + out = "ATOMIC_SPECIES\n" + if pp_file is not None: + ppfiles = process_file_input( + ndarray2list(pp_file), data["atom_names"], "pp_file" + ) + else: + warnings.warn( + "pp_file is not provided, will use empty string for pseudo potential file." 
+ ) + ppfiles = [""] * len(data["atom_names"]) + + for iele in range(len(data["atom_names"])): + if data["atom_numbs"][iele] == 0: + continue + out += data["atom_names"][iele] + " " + if mass is not None: + out += f"{mass[iele]:.3f} " + else: + out += "1 " + + ipp_file = ppfiles[iele] + if ipp_file != "": + if not link_file: + out += ipp_file + else: + out += os.path.basename(ipp_file.rstrip("/")) + if dest_dir is not None: + _link_file(dest_dir, ipp_file) + out += "\n" + out += "\n" + + # NUMERICAL_ORBITAL block + if numerical_orbital is not None: + numerical_orbital = ndarray2list(numerical_orbital) + orbfiles = process_file_input( + numerical_orbital, data["atom_names"], "numerical_orbital" + ) + orbfiles = [ + orbfiles[i] + for i in range(len(data["atom_names"])) + if data["atom_numbs"][i] != 0 + ] + out += "NUMERICAL_ORBITAL\n" + for iorb in orbfiles: + if not link_file: + out += iorb + else: + out += os.path.basename(iorb.rstrip("/")) + if dest_dir is not None: + _link_file(dest_dir, iorb) + out += "\n" + out += "\n" + + # deepks block + if numerical_descriptor is not None: + assert isinstance(numerical_descriptor, str) + if not link_file: + out += f"NUMERICAL_DESCRIPTOR\n{numerical_descriptor}\n" + else: + out += f"NUMERICAL_DESCRIPTOR\n{os.path.basename(numerical_descriptor)}\n" + if dest_dir is not None: + _link_file(dest_dir, numerical_descriptor) + out += "\n" + + # LATTICE_CONSTANT and LATTICE_VECTORS block + out += "LATTICE_CONSTANT\n" + out += str(1 / bohr2ang) + "\n\n" + + out += "LATTICE_VECTORS\n" + for ix in range(3): + for iy in range(3): + out += str(data["cells"][frame_idx][ix][iy]) + " " + out += "\n" + out += "\n" + + # ATOMIC_POSITIONS block + out += "ATOMIC_POSITIONS\n" + out += "Cartesian # Cartesian(Unit is LATTICE_CONSTANT)\n" + # ret += "\n" + natom_tot = 0 # in for loop, it is also the atom index + for iele in range(len(data["atom_names"])): + if data["atom_numbs"][iele] == 0: + continue + out += data["atom_names"][iele] + "\n" + out 
+= "0.0\n" + out += str(data["atom_numbs"][iele]) + "\n" + for iatom in range(data["atom_numbs"][iele]): + iatomtype = np.nonzero(data["atom_types"] == iele)[0][ + iatom + ] # it is the atom index + iout = f"{data['coords'][frame_idx][iatomtype, 0]:.12f} {data['coords'][frame_idx][iatomtype, 1]:.12f} {data['coords'][frame_idx][iatomtype, 2]:.12f}" + # add flags for move, velocity, mag, angle1, angle2, and sc + if move is not None: + if ( + isinstance(ndarray2list(move[iatomtype]), (list, tuple)) + and len(move[iatomtype]) == 3 + ): + iout += " " + " ".join( + ["1" if ii else "0" for ii in move[iatomtype]] + ) + elif isinstance(ndarray2list(move[iatomtype]), (int, float, bool)): + iout += " 1 1 1" if move[iatomtype] else " 0 0 0" + else: + iout += " 1 1 1" + + if ( + velocity is not None + and isinstance(ndarray2list(velocity[iatomtype]), (list, tuple)) + and len(velocity[iatomtype]) == 3 + ): + iout += " v " + " ".join([f"{ii:.12f}" for ii in velocity[iatomtype]]) + + if mag is not None: + if isinstance(ndarray2list(mag[iatomtype]), (list, tuple)) and len( + mag[iatomtype] + ) in [1, 3]: + iout += " mag " + " ".join([f"{ii:.12f}" for ii in mag[iatomtype]]) + elif isinstance(ndarray2list(mag[iatomtype]), (int, float)): + iout += " mag " + f"{mag[iatomtype]:.12f}" + + if angle1 is not None and isinstance( + ndarray2list(angle1[iatomtype]), (int, float) + ): + iout += " angle1 " + f"{angle1[iatomtype]:.12f}" + + if angle2 is not None and isinstance( + ndarray2list(angle2[iatomtype]), (int, float) + ): + iout += " angle2 " + f"{angle2[iatomtype]:.12f}" + + if sc is not None: + if isinstance(ndarray2list(sc[iatomtype]), (list, tuple)) and len( + sc[iatomtype] + ) in [1, 3]: + iout += " sc " + " ".join( + ["1" if ii else "0" for ii in sc[iatomtype]] + ) + elif isinstance(ndarray2list(sc[iatomtype]), (int, float, bool)): + iout += " sc " + "1" if sc[iatomtype] else "0" + + if lambda_ is not None: + if isinstance(ndarray2list(lambda_[iatomtype]), (list, tuple)) and len( + 
lambda_[iatomtype] + ) in [1, 3]: + iout += " lambda " + " ".join( + [f"{ii:.12f}" for ii in lambda_[iatomtype]] + ) + elif isinstance(ndarray2list(lambda_[iatomtype]), (int, float)): + iout += " lambda " + f"{lambda_[iatomtype]:.12f}" + + out += iout + "\n" + natom_tot += 1 + assert natom_tot == sum(data["atom_numbs"]) + return out diff --git a/dpdata/formats/amber/__init__.py b/dpdata/formats/amber/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/amber/mask.py b/dpdata/formats/amber/mask.py new file mode 100644 index 000000000..155e2a7be --- /dev/null +++ b/dpdata/formats/amber/mask.py @@ -0,0 +1,42 @@ +"""Amber mask.""" + +from __future__ import annotations + +try: + import parmed +except ImportError: + pass + + +def pick_by_amber_mask(param, maskstr, coords=None): + """Pick atoms by amber masks. + + Parameters + ---------- + param : str or parmed.Structure + filename of Amber param file or parmed.Structure + maskstr : str + Amber masks + coords : np.ndarray (optional) + frame coordinates, shape: N*3 + """ + parm = load_param_file(param) + if coords is not None: + parm.initialize_topology(xyz=coords) + sele = [] + if len(maskstr) > 0: + newmaskstr = maskstr.replace("@0", "!@*") + sele = [ + parm.atoms[i].idx + for i in parmed.amber.mask.AmberMask(parm, newmaskstr).Selected() + ] + return sele + + +def load_param_file(param_file): + if isinstance(param_file, str): + return parmed.load_file(param_file) + elif isinstance(param_file, parmed.Structure): + return param_file + else: + raise RuntimeError("Unsupported structure") diff --git a/dpdata/formats/amber/md.py b/dpdata/formats/amber/md.py new file mode 100644 index 000000000..54c8bb8d2 --- /dev/null +++ b/dpdata/formats/amber/md.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import os +import re + +import numpy as np + +from dpdata.formats.amber.mask import pick_by_amber_mask +from dpdata.unit import EnergyConversion +from dpdata.utils import open_file + +from 
...periodic_table import ELEMENTS + +kcalmol2eV = EnergyConversion("kcal_mol", "eV").value() +symbols = ["X"] + ELEMENTS + +energy_convert = kcalmol2eV +force_convert = energy_convert + + +def cell_lengths_angles_to_cell( + cell_lengths: np.ndarray, cell_angles: np.ndarray +) -> np.ndarray: + """Convert cell lengths and angles to cell vectors. + + Parameters + ---------- + cell_lengths + Cell lengths with shape ``(..., 3)`` where the last dimension is + ``a, b, c``. + cell_angles + Cell angles in degrees with shape ``(..., 3)`` where the last dimension + is ``alpha, beta, gamma``. + + Returns + ------- + np.ndarray + Cell vectors with shape ``(..., 3, 3)``. + """ + alpha = np.deg2rad(cell_angles[..., 0]) + beta = np.deg2rad(cell_angles[..., 1]) + gamma = np.deg2rad(cell_angles[..., 2]) + + a = cell_lengths[..., 0] + b = cell_lengths[..., 1] + c = cell_lengths[..., 2] + + if np.any(cell_lengths <= 0.0): + raise RuntimeError("Invalid AMBER cell lengths") + if np.any((cell_angles <= 0.0) | (cell_angles >= 180.0)): + raise RuntimeError("Invalid AMBER cell angles") + + cos_alpha = np.cos(alpha) + cos_beta = np.cos(beta) + cos_gamma = np.cos(gamma) + sin_gamma = np.sin(gamma) + ly = b * sin_gamma + if np.any(ly <= 1e-8): + raise RuntimeError("Invalid AMBER cell angles") + + z_factor = ( + 1 + - cos_alpha**2 + - cos_beta**2 + - cos_gamma**2 + + 2 * cos_alpha * cos_beta * cos_gamma + ) + lz2 = c**2 * z_factor / sin_gamma**2 + if np.any(lz2 <= 1e-8): + raise RuntimeError("Invalid AMBER cell angles") + + z = np.sqrt(z_factor) / sin_gamma + + shape = (*cell_lengths.shape[:-1], 3, 3) + cells = np.zeros(shape) + cells[..., 0, 0] = a + cells[..., 1, 0] = b * cos_gamma + cells[..., 1, 1] = b * sin_gamma + cells[..., 2, 0] = c * cos_beta + cells[..., 2, 1] = c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma + cells[..., 2, 2] = c * z + return cells + + +def read_amber_traj( + parm7_file, + nc_file, + mdfrc_file=None, + mden_file=None, + mdout_file=None, + 
use_element_symbols=None, + labeled=True, +): + """The amber trajectory includes: + * nc, NetCDF format, stores coordinates + * mdfrc, NetCDF format, stores forces + * mden (optional), text format, stores energies + * mdout (optional), text format, may store energies if there is no mden_file + * parm7, text format, stores types. + + Parameters + ---------- + parm7_file, nc_file, mdfrc_file, mden_file, mdout_file: + filenames + use_element_symbols : None or list or str + If use_element_symbols is a list of atom indexes, these atoms will use element symbols + instead of amber types. For example, a ligand will use C, H, O, N, and so on + instead of h1, hc, o, os, and so on. + IF use_element_symbols is str, it will be considered as Amber mask. + labeled : bool + Whether to return labeled data + """ + from scipy.io import netcdf_file + + flag_atom_type = False + flag_atom_numb = False + amber_types = [] + atomic_number = [] + with open_file(parm7_file) as f: + for line in f: + if line.startswith("%FLAG"): + flag_atom_type = line.startswith("%FLAG AMBER_ATOM_TYPE") + flag_atom_numb = (use_element_symbols is not None) and line.startswith( + "%FLAG ATOMIC_NUMBER" + ) + elif flag_atom_type or flag_atom_numb: + if line.startswith("%FORMAT"): + fmt = re.findall(r"\d+", line) + fmt0 = int(fmt[0]) + fmt1 = int(fmt[1]) + else: + for ii in range(fmt0): + start_index = ii * fmt1 + end_index = (ii + 1) * fmt1 + if end_index >= len(line): + continue + content = line[start_index:end_index].strip() + if flag_atom_type: + amber_types.append(content) + elif flag_atom_numb: + atomic_number.append(int(content)) + if use_element_symbols is not None: + if isinstance(use_element_symbols, str): + use_element_symbols = pick_by_amber_mask(parm7_file, use_element_symbols) + for ii in use_element_symbols: + amber_types[ii] = symbols[atomic_number[ii]] + + with netcdf_file(nc_file, "r") as f: + coords = np.array(f.variables["coordinates"][:]) + cell_lengths = 
kcal2ev = EnergyConversion("kcal_mol", "eV").value()

# Parser states for the line-scanning state machine in parse_sqm_out.
START = 0
READ_CHARGE = 2
READ_COORDS_START = 3
READ_COORDS = 6
READ_FORCES = 7


def parse_sqm_out(fname: FileType):
    """Read atom symbols, charges and coordinates from ambertools sqm.out file."""
    # Accumulators filled during a single pass over the file.
    atom_symbols = []
    coords = []
    charges = []
    forces = []
    energies = []

    with open_file(fname) as f:
        # `state` selects how each subsequent line is interpreted.
        state = START
        for line in f:
            if line.startswith(" Total SCF energy"):
                # Keep only the most recent SCF energy.
                energies = [float(line.strip().split()[-2])]
            elif line.startswith(" Atom Element Mulliken Charge"):
                state = READ_CHARGE
                charges = []
            elif line.startswith(" Total Mulliken Charge"):
                state = START
            elif line.startswith(" Final Structure"):
                state = READ_COORDS_START
                coords = []
            elif line.startswith("QMMM: Forces on QM atoms"):
                state = READ_FORCES
                forces = []
            elif state == READ_CHARGE:
                parts = line.strip().split()
                atom_symbols.append(parts[-2])
                charges.append(float(parts[-1]))
            elif READ_COORDS_START <= state < READ_COORDS:
                # Skip the header lines between "Final Structure" and the coords.
                state += 1
            elif state == READ_COORDS:
                coords.append([float(word) for word in line.strip().split()[-3:]])
                if len(coords) == len(charges):
                    state = START
            elif state == READ_FORCES:
                stripped = line.strip()
                if not stripped.startswith("QMMM: Atm "):
                    state = START
                    continue
                # Force components live in fixed-width columns at the line end.
                forces.append(
                    [
                        float(stripped[-60:-40]),
                        float(stripped[-40:-20]),
                        float(stripped[-20:]),
                    ]
                )
                if len(forces) == len(charges):
                    state = START

    data = {}
    atom_names, data["atom_types"], atom_numbs = np.unique(
        atom_symbols, return_inverse=True, return_counts=True
    )
    data["charges"] = np.array(charges)
    data["atom_names"] = list(atom_names)
    data["atom_numbs"] = list(atom_numbs)
    data["orig"] = np.array([0, 0, 0])
    # Large dummy cell; the system is marked non-periodic below.
    data["cells"] = np.array(
        [[[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]]]
    )
    data["nopbc"] = True
    data["coords"] = np.array([coords])

    energies = np.array(energies)
    # sqm prints gradients in kcal/mol/A; negate to get forces in eV/A.
    forces = -np.array([forces], dtype=np.float64) * kcal2ev
    if len(forces) > 0:
        data["energies"] = energies
        data["forces"] = forces

    return data
def cell_to_low_triangle(A, B, C, alpha, beta, gamma):
    """Convert cell parameters to a lower-triangular cell matrix.

    Parameters
    ----------
    A : float
        cell length A
    B : float
        cell length B
    C : float
        cell length C
    alpha : float
        radian. The angle between vector B and vector C.
    beta : float
        radian. The angle between vector A and vector C.
    gamma : float
        radian. The angle between vector A and vector B.

    Returns
    -------
    cell : np.ndarray
        The 3x3 cell matrix used by dpdata in lower-triangular form (float32).

    Raises
    ------
    RuntimeError
        If an angle is outside (5, 175) degrees, a length is not greater
        than 0.2, or the resulting cell would be numerically degenerate.
    """
    # Angles must be radians well away from 0 and pi, otherwise the
    # triangularization below is numerically unstable.
    if not np.pi * 5 / 180 < alpha < np.pi * 175 / 180:
        raise RuntimeError(
            f"alpha=={alpha}: must be a radian, and \
            must be in np.pi*5/180 < alpha < np.pi*175/180"
        )
    if not np.pi * 5 / 180 < beta < np.pi * 175 / 180:
        raise RuntimeError(
            f"beta=={beta}: must be a radian, and \
            must be in np.pi*5/180 < beta < np.pi*175/180"
        )
    if not np.pi * 5 / 180 < gamma < np.pi * 175 / 180:
        raise RuntimeError(
            f"gamma=={gamma}: must be a radian, and \
            must be in np.pi*5/180 < gamma < np.pi*175/180"
        )
    if not A > 0.2:
        raise RuntimeError(f"A=={A}, must be greater than 0.2")
    if not B > 0.2:
        raise RuntimeError(f"B=={B}, must be greater than 0.2")
    if not C > 0.2:
        raise RuntimeError(f"C=={C}, must be greater than 0.2")

    # Standard triclinic -> lower-triangular conversion.
    lx = A
    xy = B * np.cos(gamma)
    xz = C * np.cos(beta)
    ly = B * np.sin(gamma)
    if not ly > 0.1:
        # fix: the original passed the message and `format(ly)` (the builtin)
        # as two separate RuntimeError arguments, so "{}" was never filled in.
        raise RuntimeError(f"ly:=B* np.sin(gamma)=={ly}, must be greater than 0.1")
    yz = (B * C * np.cos(alpha) - xy * xz) / ly
    if not C**2 - xz**2 - yz**2 > 0.01:
        # fix: same broken message/format(...) argument pair as above.
        raise RuntimeError(
            f"lz^2:=C**2-xz**2-yz**2=={C**2 - xz**2 - yz**2}, must be greater than 0.01"
        )
    lz = np.sqrt(C**2 - xz**2 - yz**2)
    cell = np.asarray([[lx, 0, 0], [xy, ly, 0], [xz, yz, lz]]).astype("float32")
    return cell
CP2K GO! \*+") +delimiter_p2 = re.compile(r"^ \*+") +delimiter_patterns.append(delimiter_p1) +delimiter_patterns.append(delimiter_p2) +avail_patterns = [] +avail_patterns.append(re.compile(r"^ INITIAL POTENTIAL ENERGY")) +avail_patterns.append(re.compile(r"^ ENSEMBLE TYPE")) + + +class Cp2kSystems: + """deal with cp2k outputfile.""" + + def __init__(self, log_file_name, xyz_file_name, restart=False): + self.log_file_object = open(log_file_name) + self.xyz_file_object = open(xyz_file_name) + self.log_block_generator = self.get_log_block_generator() + self.xyz_block_generator = self.get_xyz_block_generator() + self.restart_flag = restart + + self.cell = None + self.print_level = None + + self.atomic_kinds = None + + if self.restart_flag: + self.handle_single_log_frame(next(self.log_block_generator)) + + def __del__(self): + self.log_file_object.close() + self.xyz_file_object.close() + + def __iter__(self): + return self + + def __next__(self): + info_dict = {} + log_info_dict = self.handle_single_log_frame(next(self.log_block_generator)) + # print(log_info_dict) + xyz_info_dict = self.handle_single_xyz_frame(next(self.xyz_block_generator)) + # eq1 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_numbs'], xyz_info_dict['atom_numbs'])] + # eq2 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_names'], xyz_info_dict['atom_names'])] + # eq3 = [v1==v2 for v1,v2 in zip(log_info_dict['atom_types'], xyz_info_dict['atom_types'])] + # assert all(eq1), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') + # assert all(eq2), (log_info_dict,xyz_info_dict,'There may be errors in the file. If it is a restart task; use restart=True') + # assert all(eq3), (log_info_dict,xyz_info_dict,'There may be errors in the file. 
If it is a restart task; use restart=True') + assert math.isclose( + log_info_dict["energies"][0], xyz_info_dict["energies"][0], abs_tol=1.0e-6 + ), ( + log_info_dict["energies"], + xyz_info_dict["energies"], + "There may be errors in the file", + ) + info_dict.update(log_info_dict) + info_dict.update(xyz_info_dict) + return info_dict + + def get_log_block_generator(self): + lines = [] + delimiter_flag = False + yield_flag = False + while True: + line = self.log_file_object.readline() + if line: + lines.append(line) + if any(p.match(line) for p in delimiter_patterns): + if delimiter_flag is True: + yield_flag = True + yield lines + lines = [] + delimiter_flag = False + else: + line = self.log_file_object.readline() + lines.append(line) + if any(p.match(line) for p in avail_patterns): + delimiter_flag = True + else: + if not yield_flag: + raise StopIteration("None of the delimiter patterns are matched") + break + if delimiter_flag is True: + raise RuntimeError("This file lacks some content, please check") + + def get_xyz_block_generator(self): + p3 = re.compile(r"^\s*(\d+)\s*") + yield_flag = False + while True: + line = self.xyz_file_object.readline() + if not line: + if not yield_flag: + raise StopIteration("None of the xyz patterns are matched") + break + if p3.match(line): + yield_flag = True + atom_num = int(p3.match(line).group(1)) + lines = [] + lines.append(line) + for ii in range(atom_num + 1): + lines.append(self.xyz_file_object.readline()) + if not lines[-1]: + raise RuntimeError( + f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" + ) + yield lines + + def handle_single_log_frame(self, lines): + info_dict = {} + energy_pattern_1 = re.compile( + r" INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" + ) + # CONSERVED QUANTITY [hartree] = -0.279168013085E+04 + energy_pattern_2 = re.compile( + r" POTENTIAL ENERGY\[hartree\]\s+=\s+(?P\S+)" + ) + energy = None + cell_length_pattern = re.compile( + r" (INITIAL ){0,1}CELL 
LNTHS\[bohr\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_angle_pattern = re.compile( + r" (INITIAL ){0,1}CELL ANGLS\[deg\]\s+=\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_A, cell_B, cell_C = ( + 0, + 0, + 0, + ) + cell_alpha, cell_beta, cell_gamma = ( + 0, + 0, + 0, + ) + cell_a_pattern = re.compile( + r" CELL\| Vector a \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_b_pattern = re.compile( + r" CELL\| Vector b \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + cell_c_pattern = re.compile( + r" CELL\| Vector c \[angstrom\]:\s+(?P\S+)\s+(?P\S+)\s+(?P\S+)" + ) + force_start_pattern = re.compile(r" ATOMIC FORCES in") + force_flag = False + force_end_pattern = re.compile(r" SUM OF ATOMIC FORCES") + force_lines = [] + cell_flag = 0 + print_level_pattern = re.compile( + r" GLOBAL\| Global print level\s+(?P\S+)" + ) + print_level_flag = 0 + atomic_kinds_pattern = re.compile(r"\s+\d+\. Atomic kind:\s+(?P\S+)") + atomic_kinds = [] + stress_sign = "STRESS" + stress_flag = 0 + stress = [] + + for line in lines: + if stress_flag == 3: + if line == "\n": + stress_flag = 0 + else: + stress.append(line.split()[1:4]) + if stress_flag == 2: + stress_flag = 3 + if stress_flag == 1: + stress_flag = 2 + if stress_sign in line: + stress_flag = 1 + if force_start_pattern.match(line): + force_flag = True + if force_end_pattern.match(line): + assert force_flag is True, ( + force_flag, + "there may be errors in this file ", + ) + force_flag = False + if force_flag is True: + force_lines.append(line) + if energy_pattern_1.match(line): + energy = ( + float(energy_pattern_1.match(line).groupdict()["number"]) * AU_TO_EV + ) + # print('1to', energy) + if energy_pattern_2.match(line): + energy = ( + float(energy_pattern_2.match(line).groupdict()["number"]) * AU_TO_EV + ) + if cell_length_pattern.match(line): + cell_A = ( + float(cell_length_pattern.match(line).groupdict()["A"]) * AU_TO_ANG + ) + cell_B = ( + float(cell_length_pattern.match(line).groupdict()["B"]) * AU_TO_ANG + ) + 
cell_C = ( + float(cell_length_pattern.match(line).groupdict()["C"]) * AU_TO_ANG + ) + cell_flag += 1 + if cell_angle_pattern.match(line): + cell_alpha = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["alpha"]) + ) + cell_beta = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["beta"]) + ) + cell_gamma = np.deg2rad( + float(cell_angle_pattern.match(line).groupdict()["gamma"]) + ) + cell_flag += 1 + if print_level_pattern.match(line): + print_level = print_level_pattern.match(line).groupdict()["print_level"] + print_level_flag += 1 + if cell_a_pattern.match(line): + cell_ax = float(cell_a_pattern.match(line).groupdict()["ax"]) + cell_ay = float(cell_a_pattern.match(line).groupdict()["ay"]) + cell_az = float(cell_a_pattern.match(line).groupdict()["az"]) + cell_flag += 1 + if cell_b_pattern.match(line): + cell_bx = float(cell_b_pattern.match(line).groupdict()["bx"]) + cell_by = float(cell_b_pattern.match(line).groupdict()["by"]) + cell_bz = float(cell_b_pattern.match(line).groupdict()["bz"]) + cell_flag += 1 + if cell_c_pattern.match(line): + cell_cx = float(cell_c_pattern.match(line).groupdict()["cx"]) + cell_cy = float(cell_c_pattern.match(line).groupdict()["cy"]) + cell_cz = float(cell_c_pattern.match(line).groupdict()["cz"]) + cell_flag += 1 + + if atomic_kinds_pattern.match(line): + akind = atomic_kinds_pattern.match(line).groupdict()["akind"] + atomic_kinds.append(akind) + if print_level_flag == 1: + self.print_level = print_level + if print_level == "LOW": + raise RuntimeError( + "please provide cp2k output with higher print level(at least MEDIUM)" + ) + + if cell_flag == 2: + self.cell = cell_to_low_triangle( + cell_A, cell_B, cell_C, cell_alpha, cell_beta, cell_gamma + ) + elif cell_flag == 5: + self.cell = np.asarray( + [ + [cell_ax, cell_ay, cell_az], + [cell_bx, cell_by, cell_bz], + [cell_cx, cell_cy, cell_cz], + ] + ).astype("float64") + if atomic_kinds: + self.atomic_kinds = atomic_kinds + # print(self.atomic_kinds) + # lx = 
cell_A + # xy = cell_B * np.cos(cell_gamma) + # xz = cell_C * np.cos(cell_beta) + # ly = cell_B* np.sin(cell_gamma) + # yz = (cell_B*cell_C*np.cos(cell_alpha)-xy*xz)/ly + # lz = np.sqrt(cell_C**2-xz**2-yz**2) + # self.cell = [[lx, 0 , 0], + # [xy, ly, 0 ], + # [xz, yz, lz]] + + element_index = -1 + element_dict = OrderedDict() + atom_types_idx_list = [] + forces_list = [] + for line in force_lines[3:]: + line_list = line.split() + # print(line_list) + if element_dict.get(line_list[1]): + element_dict[line_list[1]][1] += 1 + else: + element_index += 1 + element_dict[line_list[1]] = [element_index, 1] + atom_types_idx_list.append(element_dict[line_list[1]][0]) + forces_list.append( + [ + float(line_list[3]) * AU_TO_EV_EVERY_ANG, + float(line_list[4]) * AU_TO_EV_EVERY_ANG, + float(line_list[5]) * AU_TO_EV_EVERY_ANG, + ] + ) + # print(atom_types_idx_list) + # atom_names=list(element_dict.keys()) + atom_names = self.atomic_kinds + atom_numbs = [] + + GPa = PressureConversion("eV/angstrom^3", "GPa").value() + if stress: + stress = np.array(stress) + stress = stress.astype("float64") + stress = stress[np.newaxis, :, :] + # stress to virial conversion, default unit in cp2k is GPa + # note the stress is virial = stress * volume + virial = stress * np.linalg.det(self.cell) / GPa + virial = virial.squeeze() + else: + virial = None + for ii in element_dict.keys(): + atom_numbs.append(element_dict[ii][1]) + # print(atom_numbs) + info_dict["atom_names"] = atom_names + info_dict["atom_numbs"] = atom_numbs + info_dict["atom_types"] = np.asarray(atom_types_idx_list) + info_dict["print_level"] = self.print_level + info_dict["cells"] = np.asarray([self.cell]).astype("float64") + info_dict["energies"] = np.asarray([energy]).astype("float64") + info_dict["forces"] = np.asarray([forces_list]).astype("float64") + if virial is not None: + info_dict["virials"] = np.asarray([virial]).astype("float64") + return info_dict + + def handle_single_xyz_frame(self, lines): + info_dict = {} + 
def get_frames(fname):
    """Parse a CP2K energy/force output file into raw frame data.

    Parameters
    ----------
    fname : str
        path to the CP2K output file

    Returns
    -------
    tuple
        (atom_names, atom_numbs, atom_types, cell, coord, energy, force,
        virial); all-empty lists and a None virial when the SCF did not
        converge.
    """
    coord_flag = False
    force_flag = False
    stress_flag = False
    eV = EnergyConversion("hartree", "eV").value()
    angstrom = LengthConversion("bohr", "angstrom").value()
    GPa = PressureConversion("eV/angstrom^3", "GPa").value()
    atom_symbol_idx_list = []
    atom_symbol_list = []
    cell = []
    coord = []
    force = []
    stress = []
    # fix: initialize so an output lacking an ENERGY| line fails the
    # "cannot find energies" assert below instead of raising NameError
    energy = None

    # `with` guarantees the file is closed even if parsing raises
    # (the original leaked the handle on any exception).
    with open(fname) as fp:
        # check if output is converged; if not, return an empty system
        content = fp.read()
        if content.count("SCF run converged") == 0:
            return [], [], [], [], [], [], [], None

        # locate duplicated headers (restarted/concatenated runs)
        fp.seek(0)
        header_idx = [
            idx for idx, ii in enumerate(fp) if "Multiplication driver" in ii
        ]

        # parse only the part after the last header
        fp.seek(0)
        for idx, ii in enumerate(fp):
            if idx <= header_idx[-1]:
                continue
            if "CELL| Vector" in ii:
                cell.append(ii.split()[4:7])
            if "Atomic kind:" in ii:
                atom_symbol_list.append(ii.split()[3])

            # beginning of coords block
            # NOTE(review): both sides of this "or" read identically here --
            # they may have been distinct whitespace variants collapsed
            # upstream; confirm against real CP2K output before changing.
            if "Atom Kind Element" in ii or "Atom Kind Element" in ii:
                coord_flag = True
            # parse coords lines
            elif coord_flag:
                if ii == "\n":
                    coord_flag = len(coord) == 0  # skip empty line at the beginning
                else:
                    coord.append(ii.split()[4:7])
                    atom_symbol_idx_list.append(ii.split()[1])

            if "ENERGY|" in ii:
                energy = ii.split()[8]
            if " Atom Kind " in ii:
                force_flag = True
                force_idx = idx
            if force_flag:
                if idx > force_idx:
                    if "SUM OF ATOMIC FORCES" in ii:
                        force_flag = False
                    else:
                        force.append(ii.split()[3:6])
            # stress tensor block (optional)
            if "STRESS TENSOR [GPa" in ii:
                stress_flag = True
                stress_idx = idx
            if stress_flag:
                if idx > stress_idx + 2:
                    if ii == "\n":
                        stress_flag = False
                    else:
                        stress.append(ii.split()[1:4])

    assert coord, "cannot find coords"
    assert energy, "cannot find energies"
    assert force, "cannot find forces"

    # convert to float arrays and add a leading nframes dimension
    cell = np.array(cell, dtype="float64")[np.newaxis, :, :]
    coord = np.array(coord, dtype="float64")[np.newaxis, :, :]
    # CP2K prints 1-based kind indices; map to 0-based
    atom_symbol_idx_list = np.array(atom_symbol_idx_list).astype(int) - 1
    atom_symbol_list = np.array(atom_symbol_list)[atom_symbol_idx_list]
    force = np.array(force, dtype="float64")[np.newaxis, :, :]

    # virial is not necessary
    if stress:
        stress = np.array(stress, dtype="float64")[np.newaxis, :, :]
        # stress-to-virial conversion; CP2K prints stress in GPa and
        # virial = stress * volume
        virial = stress * np.linalg.det(cell[0]) / GPa
    else:
        virial = None

    # unit conversions: CP2K uses hartree and hartree/bohr internally
    force = force * eV / angstrom
    energy = np.array(float(energy) * eV).astype("float64")[np.newaxis]

    # preserve the first-appearance order of atom names
    tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True)
    atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")]
    # fix: replaced the original O(natoms * ntypes) nested scan with a
    # single dict lookup per atom; output is identical
    name_to_type = {name: tt for tt, name in enumerate(atom_names)}
    atom_types = np.array([name_to_type[ss] for ss in atom_symbol_list])
    atom_numbs = [int((atom_types == tt).sum()) for tt in range(len(atom_names))]

    return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force, virial
def to_system_data(folder, type_map=None, labels=True):
    """Load a deepmd/npy folder into a dpdata data dict.

    Parameters
    ----------
    folder : str
        path containing type.raw and set.* subfolders
    type_map : list, optional
        maps type indices to atom names when type_map.raw is absent
    labels : bool, default True
        if True, also load LabeledSystem data types (energies, forces, ...)

    Returns
    -------
    dict
        system data dict
    """
    # data is empty
    data = load_type(folder, type_map=type_map)
    data["orig"] = np.zeros([3])
    if os.path.isfile(os.path.join(folder, "nopbc")):
        data["nopbc"] = True
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    all_cells = []
    all_coords = []
    for ii in sets:
        cells, coords = _load_set(ii, data.get("nopbc", False))
        nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
        all_cells.append(np.reshape(cells, [nframes, 3, 3]))
        all_coords.append(np.reshape(coords, [nframes, -1, 3]))
    data["cells"] = np.concatenate(all_cells, axis=0)
    data["coords"] = np.concatenate(all_coords, axis=0)
    # allow custom dtypes registered on the System classes
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES

    # loop-invariant: atom count does not change per dtype
    natoms = data["atom_types"].shape[0]
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "cells",
            "coords",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/npy format."
            )
            continue
        shape = [
            natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:]
        ]
        all_data = []
        for ii in sets:
            tmp = _cond_load_data(os.path.join(ii, dtype.deepmd_name + ".npy"))
            if tmp is not None:
                all_data.append(np.reshape(tmp, [tmp.shape[0], *shape]))
        if len(all_data) > 0:
            data[dtype.name] = np.concatenate(all_data, axis=0)
    return data


def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True):
    """Dump a data dict to a deepmd/npy folder.

    Parameters
    ----------
    folder : str
        target folder
    data : dict
        System or LabeledSystem data
    set_size : int, default 5000
        number of frames per set.* subfolder
    comp_prec : np.dtype, default np.float32
        precision used for floating-point frame data
    remove_sets : bool, default True
        remove pre-existing set.* subfolders; if False, raise instead
    """
    os.makedirs(folder, exist_ok=True)
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    if len(sets) > 0:
        if remove_sets:
            for ii in sets:
                shutil.rmtree(ii)
        else:
            # fix: the original concatenated the folder name and the text with
            # no separator, producing "...<folder>not a clean..."
            raise RuntimeError(
                f"found {sets} in {folder}, not a clean deepmd raw dir. "
                "please firstly clean set.* then try compress"
            )
    # dump raw
    np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d")
    np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s")
    # BondOrder System
    if "bonds" in data:
        np.savetxt(
            os.path.join(folder, "bonds.raw"),
            data["bonds"],
            header="begin_atom, end_atom, bond_order",
        )
    if "formal_charges" in data:
        np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"])
    # reshape frame properties and convert prec
    nframes = data["cells"].shape[0]
    # number of set.* folders needed to hold nframes frames
    nsets = nframes // set_size
    if set_size * nsets < nframes:
        nsets += 1
    for ii in range(nsets):
        os.makedirs(os.path.join(folder, "set.%03d" % ii))  # noqa: UP031
    # refresh the nopbc marker once (the original repeated this per set)
    try:
        os.remove(os.path.join(folder, "nopbc"))
    except OSError:
        pass
    if data.get("nopbc", False):
        with open_file(os.path.join(folder, "nopbc"), "w"):
            pass
    # allow custom dtypes
    labels = "energies" in data
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if dtype.name not in data:
            continue
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/npy format."
            )
            continue
        ddata = np.reshape(data[dtype.name], [nframes, -1])
        if np.issubdtype(ddata.dtype, np.floating):
            ddata = ddata.astype(comp_prec)
        for ii in range(nsets):
            set_stt = ii * set_size
            set_end = (ii + 1) * set_size
            set_folder = os.path.join(folder, "set.%03d" % ii)  # noqa: UP031
            np.save(os.path.join(set_folder, dtype.deepmd_name), ddata[set_stt:set_end])
+ + Parameters + ---------- + f : h5py.File or h5py.Group + HDF5 file or group object + folder : str + path in the HDF5 file + type_map : list + type map + labels : bool + labels + """ + from wcmatch.glob import globfilter + + g = f[folder] if folder else f + + data = {} + # ignore empty files or groups + if "type.raw" not in g.keys(): + return data + data["atom_types"] = g["type.raw"][:] + ntypes = np.max(data["atom_types"]) + 1 + natoms = data["atom_types"].size + data["atom_numbs"] = [] + for ii in range(ntypes): + data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) + data["atom_names"] = [] + # if find type_map.raw, use it + if "type_map.raw" in g.keys(): + my_type_map = list(np.char.decode(g["type_map.raw"][:])) + # else try to use arg type_map + elif type_map is not None: + my_type_map = type_map + # in the last case, make artificial atom names + else: + my_type_map = [] + for ii in range(ntypes): + my_type_map.append("Type_%d" % ii) # noqa: UP031 + assert len(my_type_map) >= len(data["atom_numbs"]) + for ii in range(len(data["atom_numbs"])): + data["atom_names"].append(my_type_map[ii]) + + data["orig"] = np.zeros([3]) + if "nopbc" in g.keys(): + data["nopbc"] = True + sets = globfilter(g.keys(), "set.*") + + data_types = {} + # allow custom dtypes + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + for dtype in dtypes: + if dtype.name in ( + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "real_atom_types", + "real_atom_names", + "nopbc", + ): + # skip as these data contains specific rules + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + warnings.warn( + f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/hdf5 format." 
+ ) + continue + shape = [ + natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] + ] + + data_types[dtype.name] = { + "fn": dtype.deepmd_name, + "shape": shape, + "required": dtype.required + and not (dtype.name == "cells" and data.get("nopbc", False)), + } + + for dt, prop in data_types.items(): + all_data = [] + + for ii in sets: + set = g[ii] + fn = "{}.npy".format(prop["fn"]) + if fn in set.keys(): + dd = set[fn][:] + nframes = dd.shape[0] + all_data.append(np.reshape(dd, (nframes, *prop["shape"]))) + elif prop["required"]: + raise RuntimeError(f"{folder}/{ii}/{fn} not found") + + if len(all_data) > 0: + data[dt] = np.concatenate(all_data, axis=0) + if "cells" not in data: + nframes = data["coords"].shape[0] + data["cells"] = np.zeros((nframes, 3, 3)) + return data + + +def dump( + f: h5py.File | h5py.Group, + folder: str, + data: dict, + set_size=5000, + comp_prec=np.float32, +) -> None: + """Dump data to a HDF5 file. + + Parameters + ---------- + f : h5py.File or h5py.Group + HDF5 file or group object + folder : str + path in the HDF5 file + data : dict + System or LabeledSystem data + set_size : int, default: 5000 + size of a set + comp_prec : np.dtype, default: np.float32 + precision of data + """ + # if folder is None, use the root of the file + if folder: + if folder in f: + del f[folder] + g = f.create_group(folder) + else: + g = f + # ignore empty systems + if not len(data["coords"]): + return + # dump raw (array in fact) + g.create_dataset("type.raw", data=data["atom_types"]) + g.create_dataset("type_map.raw", data=np.array(data["atom_names"], dtype="S")) + # BondOrder System + if "bonds" in data: + g.create_dataset("bonds.raw", data=data["bonds"]) + if "formal_charges" in data: + g.create_dataset("formal_charges.raw", data=data["formal_charges"]) + # reshape frame properties and convert prec + nframes = data["cells"].shape[0] + + nopbc = data.get("nopbc", False) + reshaped_data = {} + + data_types = {} + + labels = "energies" 
in data + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + # allow custom dtypes + for dtype in dtypes: + if dtype.name in ( + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "real_atom_types", + "real_atom_names", + "nopbc", + ): + # skip as these data contains specific rules + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + warnings.warn( + f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/hdf5 format." + ) + continue + + data_types[dtype.name] = { + "fn": dtype.deepmd_name, + "shape": (nframes, -1), + "dump": not (dtype.name == "cells" and nopbc), + } + + for dt, prop in data_types.items(): + if dt in data: + if prop["dump"]: + ddata = np.reshape(data[dt], prop["shape"]) + if np.issubdtype(ddata.dtype, np.floating): + ddata = ddata.astype(comp_prec) + reshaped_data[dt] = ddata + + # dump frame properties: cell, coord, energy, force and virial + nsets = nframes // set_size + if set_size * nsets < nframes: + nsets += 1 + for ii in range(nsets): + set_stt = ii * set_size + set_end = (ii + 1) * set_size + set_folder = g.create_group("set.%03d" % ii) # noqa: UP031 + for dt, prop in data_types.items(): + if dt in reshaped_data: + set_folder.create_dataset( + "{}.npy".format(prop["fn"]), data=reshaped_data[dt][set_stt:set_end] + ) + + if nopbc: + g.create_dataset("nopbc", data=True) diff --git a/dpdata/formats/deepmd/mixed.py b/dpdata/formats/deepmd/mixed.py new file mode 100644 index 000000000..734b6a730 --- /dev/null +++ b/dpdata/formats/deepmd/mixed.py @@ -0,0 +1,299 @@ +from __future__ import annotations + +import copy +import math + +import numpy as np + +import dpdata +from dpdata.data_type import Axis + +from .comp import dump as comp_dump +from .comp import to_system_data as comp_to_system_data + + +def _pad_to(sys_data, target_natoms, dtypes): + """Pad system data dict so that NATOMS 
dimension becomes target_natoms. + + Virtual atoms get real_atom_types = -1, and all other per-atom data is + padded with zeros. + + Parameters + ---------- + sys_data : dict + System data dict, already in mixed-type format. + target_natoms : int + Target number of atoms after padding. + dtypes : tuple[DataType, ...] + Registered data types to iterate for generic per-atom padding. + """ + natoms = sys_data["atom_types"].shape[0] + npad = target_natoms - natoms + if npad <= 0: + return + nframes = sys_data["coords"].shape[0] + + # Pad atom_types (all MIXED_TOKEN = 0) + sys_data["atom_types"] = np.concatenate( + [sys_data["atom_types"], np.zeros(npad, dtype=int)] + ) + sys_data["atom_numbs"] = [target_natoms] + + # Pad real_atom_types with -1 (virtual atom sentinel) + sys_data["real_atom_types"] = np.concatenate( + [ + sys_data["real_atom_types"], + -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), + ], + axis=1, + ) + + # Pad coords and all other per-atom data generically + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "cells", + "real_atom_names", + "real_atom_types", + "nopbc", + } + for dtype in dtypes: + if dtype.name in reserved: + continue + if dtype.name not in sys_data: + continue + if not ( + len(dtype.shape) >= 2 + and dtype.shape[0] == Axis.NFRAMES + and Axis.NATOMS in dtype.shape + ): + continue + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + arr = sys_data[dtype.name] + pad_width = [(0, 0)] * len(arr.shape) + pad_width[axis_natoms] = (0, npad) + sys_data[dtype.name] = np.pad( + arr, pad_width, mode="constant", constant_values=0 + ) + + +def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): + """Strip virtual atoms (type -1) from a group of frames. + + Parameters + ---------- + atom_types_row : np.ndarray + 1-D array of atom type indices for the group (same for all frames). + coords : np.ndarray + Coordinates array, shape (nframes, natoms_padded, 3). 
+ extra_data : dict + Dict of {name: array} for this group, arrays already frame-sliced. + dtypes : tuple[DataType, ...] + Registered data types. + + Returns + ------- + atom_types : np.ndarray + Atom types with virtual atoms removed. + coords : np.ndarray + Coords with virtual atoms removed. + extra_data : dict + Extra data with virtual atoms removed. + """ + real_mask = atom_types_row >= 0 + if real_mask.all(): + return atom_types_row, coords, extra_data + + atom_types = atom_types_row[real_mask] + coords = coords[:, real_mask, :] + + stripped = {} + for name, arr in extra_data.items(): + for dtype in dtypes: + if dtype.name == name and Axis.NATOMS in dtype.shape: + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + idx = [slice(None)] * len(arr.shape) + idx[axis_natoms] = real_mask + arr = arr[tuple(idx)] + break + stripped[name] = arr + + return atom_types, coords, stripped + + +def to_system_data(folder, type_map=None, labels=True): + data = comp_to_system_data(folder, type_map, labels) + # data is empty + old_type_map = data["atom_names"].copy() + if type_map is not None: + assert isinstance(type_map, list) + missing_type = [i for i in old_type_map if i not in type_map] + assert not missing_type, ( + f"These types are missing in selected type_map: {missing_type} !" 
+ ) + index_map = np.array([type_map.index(i) for i in old_type_map]) + data["atom_names"] = type_map.copy() + else: + index_map = None + all_real_atom_types_concat = data.pop("real_atom_types").astype(int) + if index_map is not None: + # Preserve -1 (virtual atom sentinel) during remapping + valid = all_real_atom_types_concat >= 0 + remapped = np.full_like(all_real_atom_types_concat, -1) + remapped[valid] = index_map[all_real_atom_types_concat[valid]] + all_real_atom_types_concat = remapped + all_cells_concat = data["cells"] + all_coords_concat = data["coords"] + + # handle custom registered data types + if labels: + dtypes = dpdata.system.LabeledSystem.DTYPES + else: + dtypes = dpdata.system.System.DTYPES + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "real_atom_names", + "real_atom_types", + "cells", + "coords", + "orig", + "nopbc", + } + extra_data = {} + for dtype in dtypes: + name = dtype.name + if name in reserved: + continue + if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): + continue + if name in data: + extra_data[name] = data.pop(name) + + data_list = [] + while True: + if all_real_atom_types_concat.size == 0: + break + # temp_formula = formula(data['atom_names'], temp_atom_numbs) + temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) + ] + rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) + ] + + # Extract data for this group + group_atom_types = all_real_atom_types_concat[0] + group_coords = all_coords_concat[temp_idx] + group_extra = {} + for name in extra_data: + group_extra[name] = extra_data[name][temp_idx] + extra_data[name] = extra_data[name][rest_idx] + + # Strip virtual atoms (type -1) introduced by padding + group_atom_types, group_coords, group_extra = _strip_virtual_atoms( + group_atom_types, group_coords, group_extra, dtypes + ) + + 
def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True):
    """Dump a system in deepmd/npy/mixed format.

    If ``data`` has not been converted to the mixed type yet (no
    ``real_atom_types`` key), it is converted first on a deep copy so the
    caller's dict stays untouched.  Writing is delegated to the deepmd/comp
    dumper with ``real_atom_names`` substituted for the atom names.
    """
    if "real_atom_types" not in data:
        from dpdata import LabeledSystem, System

        # work on a deep copy so the original content is not changed
        data = copy.deepcopy(data)

        wrapper_cls = LabeledSystem if "energies" in data else System
        wrapper = wrapper_cls(data=data)
        # convert_to_mixed_type() is expected to update the wrapped dict
        # in place, adding the real_atom_* entries used below
        wrapper.convert_to_mixed_type()

    out = data.copy()
    out["atom_names"] = out.pop("real_atom_names")
    comp_dump(folder, out, set_size, comp_prec, remove_sets)
def load_type(folder, type_map=None):
    """Load atom-type information from a deepmd/raw folder.

    Reads ``type.raw`` and resolves atom names from ``type_map.raw`` when it
    exists, otherwise from the ``type_map`` argument, otherwise by generating
    artificial ``Type_i`` names.

    Parameters
    ----------
    folder : str
        folder containing ``type.raw`` (and optionally ``type_map.raw``)
    type_map : list of str, optional
        fallback mapping from type index to atom name

    Returns
    -------
    dict
        data dict with ``atom_types``, ``atom_names`` and ``atom_numbs``
    """
    atom_types = np.loadtxt(os.path.join(folder, "type.raw"), ndmin=1).astype(int)
    ntypes = np.max(atom_types) + 1

    type_map_file = os.path.join(folder, "type_map.raw")
    if os.path.isfile(type_map_file):
        # a type_map.raw shipped with the data wins over the argument
        with open_file(type_map_file) as fp:
            names = fp.read().split()
    elif type_map is not None:
        names = type_map
    else:
        # last resort: make artificial atom names
        names = ["Type_%d" % ii for ii in range(ntypes)]  # noqa: UP031

    return {
        "atom_types": atom_types,
        "atom_names": names,
        "atom_numbs": [np.count_nonzero(atom_types == ii) for ii in range(len(names))],
    }
def dump(folder, data):
    """Dump System or LabeledSystem data to a deepmd/raw folder.

    Parameters
    ----------
    folder : str
        output directory; created if it does not exist
    data : dict
        System or LabeledSystem data
    """
    os.makedirs(folder, exist_ok=True)
    nframes = data["cells"].shape[0]
    np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d")
    np.savetxt(os.path.join(folder, "type_map.raw"), data["atom_names"], fmt="%s")
    # BondOrder System
    if "bonds" in data:
        np.savetxt(
            os.path.join(folder, "bonds.raw"),
            data["bonds"],
            header="begin_atom, end_atom, bond_order",
        )
    if "formal_charges" in data:
        np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"])
    # remove any stale nopbc marker before possibly re-creating it below
    try:
        os.remove(os.path.join(folder, "nopbc"))
    except OSError:
        pass
    if data.get("nopbc", False):
        # an empty "nopbc" file marks a non-periodic system
        with open_file(os.path.join(folder, "nopbc"), "w") as fw_nopbc:
            pass
    # allow custom dtypes
    # presence of "energies" marks a LabeledSystem; pick the matching DTYPES
    labels = "energies" in data
    if labels:
        dtypes = dpdata.system.LabeledSystem.DTYPES
    else:
        dtypes = dpdata.system.System.DTYPES
    for dtype in dtypes:
        if dtype.name in (
            "atom_numbs",
            "atom_names",
            "atom_types",
            "orig",
            "real_atom_types",
            "real_atom_names",
            "nopbc",
        ):
            # skip as these data contains specific rules
            continue
        if dtype.name not in data:
            continue
        # only per-frame data (leading NFRAMES axis) can be written as .raw
        if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES):
            warnings.warn(
                f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/raw format."
            )
            continue
        # flatten per-frame data to 2-D so savetxt writes one frame per row
        ddata = np.reshape(data[dtype.name], [nframes, -1])
        np.savetxt(os.path.join(folder, f"{dtype.deepmd_name}.raw"), ddata)
def read_dftb_plus(
    fn_1: FileType, fn_2: FileType
) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]:
    """Read from DFTB+ input and output.

    Parameters
    ----------
    fn_1 : FileType
        DFTB+ input file name (GenFormat geometry)
    fn_2 : FileType
        DFTB+ output file name

    Returns
    -------
    np.ndarray
        atomic symbols
    np.ndarray
        atomic coordinates
    float
        total potential energy
    np.ndarray
        atomic forces
    """
    coord = None
    symbols = None
    forces = None
    energy = None
    # number of atoms, parsed from the GenFormat header line; previously the
    # parser hard-coded exactly 4 atoms (flag states 3-6 / 8-11)
    natoms = 0
    with open_file(fn_1) as f:
        flag = 0
        for line in f:
            if flag == 1:
                # GenFormat header: "<natoms> <format letter>"
                natoms = int(line.split()[0])
                flag += 1
            elif flag == 2:
                # element symbols, referenced 1-based by the atom lines
                components = line.split()
                flag += 1
            elif line.startswith("Geometry"):
                flag = 1
                coord = []
                symbols = []
            elif 3 <= flag < 3 + natoms:
                # atom line: "<index> <type> <x> <y> <z>"
                s = line.split()
                components_num = int(s[1])
                symbols.append(components[components_num - 1])
                coord.append([float(s[2]), float(s[3]), float(s[4])])
                flag += 1
                if flag == 3 + natoms:
                    flag = 0
    with open_file(fn_2) as f:
        flag = 0
        for line in f:
            if line.startswith("Total Forces"):
                flag = 8
                forces = []
            elif 8 <= flag < 8 + natoms:
                s = line.split()
                forces.append([float(s[1]), float(s[2]), float(s[3])])
                flag += 1
                if flag == 8 + natoms:
                    flag = 0
            elif line.startswith("Total energy:"):
                s = line.split()
                energy = float(s[2])
                flag = 0

    symbols = np.array(symbols)
    forces = np.array(forces)
    coord = np.array(coord)
    # geometry and forces must describe the same number of atoms
    assert coord.shape == forces.shape

    return symbols, coord, energy, forces
def get_fhi_aims_block(fp):
    """Collect the lines of one FHI-aims output block.

    Reads from ``fp`` until a falsy line, a line announcing the next SCF
    re-initialization, or exhaustion of the stream.  The marker line itself is
    kept as the last entry; trailing newlines are stripped.
    """
    lines = []
    for raw in fp:
        if not raw:
            # falsy line: end of readable input
            break
        lines.append(raw.rstrip("\n"))
        if "Begin self-consistency loop: Re-initialization" in raw:
            break
    return lines
def analyze_block(lines, first_blk=False, md=True):
    """Extract geometry, energy, forces and convergence from one output block.

    Parameters
    ----------
    lines : list of str
        lines of a single FHI-aims output block
    first_blk : bool, default False
        whether this is the first block of the file, whose geometry is
        printed in a different format
    md : bool, default True
        whether the output comes from an MD run; only consulted for the
        first block

    Returns
    -------
    tuple
        ``(coord, cell, energy, force, virial, is_converge)``.  ``cell`` is
        always an empty list and ``virial`` always ``None`` here (the caller
        falls back to the header cell); ``is_converge`` is False when the SCF
        did not converge or no energy was found.
    """
    coord = []
    cell = []
    energy = None
    force = []
    virial = None

    contents = "\n".join(lines)
    # BUGFIX: the pattern was previously applied to the *list* `lines`
    # instead of the joined string, so the lookup always raised and natom was
    # always 0.  A non-greedy prefix captures the full number rather than its
    # last digit.  `natom` is informational only; it is not used below.
    try:
        natom = int(re.findall(r"Number of atoms.*?([0-9]+)", contents)[0])
    except Exception:
        natom = 0

    if first_blk:
        if md:
            _tmp = re.findall(pos_patt_other, contents)[:]
            # the geometry is printed twice in the first MD block; keep the
            # second half
            for ii in _tmp[slice(int(len(_tmp) / 2), len(_tmp))]:
                coord.append([float(kk) for kk in ii[:-1]])
        else:
            _tmp = re.findall(pos_patt_first, contents)
            for ii in _tmp:
                coord.append([float(kk) for kk in ii[1:]])
    else:
        _tmp = re.findall(pos_patt_other, contents)
        for ii in _tmp:
            coord.append([float(kk) for kk in ii[:-1]])

    _tmp = re.findall(force_patt, contents)
    for ii in _tmp:
        force.append([float(kk) for kk in ii])

    is_converge = "Self-consistency cycle converged" in contents

    try:
        _eng_patt = re.compile(eng_patt)
        energy = float(_eng_patt.search(contents).group().split()[-2])
    except Exception:
        energy = None

    # a block without an energy is treated as unconverged
    if not energy:
        is_converge = False

    if energy:
        assert (force is not None) and len(coord) > 0

    return coord, cell, energy, force, virial, is_converge
def create_full_hessian(hessian_raw: list | np.ndarray, natoms: int) -> np.ndarray:
    """Rebuild the full symmetric Hessian from its packed lower triangle.

    Parameters
    ----------
    hessian_raw : list | np.ndarray
        lower-triangular elements (diagonal included) in row-major order
    natoms : int
        number of atoms; the full matrix has shape (3*natoms, 3*natoms)

    Returns
    -------
    np.ndarray
        full, symmetric Hessian matrix

    Raises
    ------
    ValueError
        if the number of packed elements does not match the lower triangle
        of a (3*natoms, 3*natoms) matrix
    """
    packed = np.array(hessian_raw)
    dim = 3 * natoms

    # a lower triangle of an n x n matrix holds n*(n+1)/2 elements
    expected_length = dim * (dim + 1) // 2
    if packed.size != expected_length:
        raise ValueError(
            f"Input length {packed.size} != expected {expected_length}"
        )

    # writing each packed element to both (i, j) and (j, i) symmetrizes the
    # matrix in a single pass
    rows, cols = np.tril_indices(dim)
    full = np.zeros((dim, dim), dtype=packed.dtype)
    full[rows, cols] = packed
    full[cols, rows] = packed
    return full
def to_system_data(file_name: FileType, has_forces=True, has_hessian=True):
    """Read Gaussian fchk file.

    Parameters
    ----------
    file_name : str
        file name
    has_forces : bool, default True
        whether to read force
        Note: Cartesian Gradient in fchk file is converted to forces by taking negative sign
    has_hessian : bool, default True
        whether to read hessian

    Returns
    -------
    data : dict
        system data, including hessian if has_hessian is True
    """
    data = {}
    natoms = 0
    atom_numbers = []
    coords_t = []
    energy_t = []
    forces_t = []
    hessian_t = []
    # Read fchk file
    # fchk array headers carry the element count as the last token; the data
    # follows on subsequent lines which are consumed with next(fp)
    with open_file(file_name) as fp:
        for line in fp:
            # open_file may yield bytes (e.g. compressed input); normalize
            if isinstance(line, bytes):
                line = line.decode(errors="ignore")
            if "Number of atoms" in line:
                natoms = int(line.split()[-1])
            elif "Atomic numbers" in line and "I" in line:
                n = int(line.split()[-1])
                atom_numbers = []
                while len(atom_numbers) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    atom_numbers += [int(x) for x in next_line.split()]
            elif "Current cartesian coordinates" in line and "R" in line:
                n = int(line.split()[-1])
                coords_raw = []
                while len(coords_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    coords_raw += [float(x) for x in next_line.split()]
                # bohr -> angstrom
                coords = np.array(coords_raw).reshape(-1, 3) * length_convert
                coords_t.append(coords)
            elif "Total Energy" in line:
                # hartree -> eV
                energy = float(line.split()[-1]) * energy_convert
                energy_t.append(energy)
            elif "Cartesian Gradient" in line:
                n = int(line.split()[-1])
                forces_raw = []
                while len(forces_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    forces_raw += [float(x) for x in next_line.split()]
                # Cartesian Gradient is the negative of forces: F = -∇E
                forces = -np.array(forces_raw).reshape(-1, 3) * force_convert
                forces_t.append(forces)
            elif "Cartesian Force Constants" in line and "R" in line:
                n = int(line.split()[-1])
                hessian_raw = []
                while len(hessian_raw) < n:
                    next_line = next(fp)
                    if isinstance(next_line, bytes):
                        next_line = next_line.decode(errors="ignore")
                    hessian_raw += [float(x) for x in next_line.split()]
                hessian_full = (
                    create_full_hessian(hessian_raw, natoms) * hessian_convert
                )
                # store as (natoms, 3, natoms, 3) to align with registered shape
                hessian_t.append(hessian_full.reshape(natoms, 3, natoms, 3))
    # Assert key data
    assert coords_t, "cannot find coords"
    assert energy_t, "cannot find energy"
    if has_forces:
        assert forces_t, "cannot find forces"
    if has_hessian:
        assert hessian_t, "cannot find hessian"
    # Assemble data
    atom_symbols = [ELEMENTS[z - 1] for z in atom_numbers]
    atom_names, atom_types, atom_numbs = np.unique(
        atom_symbols, return_inverse=True, return_counts=True
    )
    data["atom_names"] = list(atom_names)
    data["atom_numbs"] = list(atom_numbs)
    data["atom_types"] = atom_types
    data["coords"] = np.array(coords_t).reshape(-1, natoms, 3)
    data["orig"] = np.zeros(3)
    # NOTE(review): only a single dummy cell frame is written even when
    # several coordinate frames were read — confirm downstream broadcasting
    data["cells"] = np.array([np.eye(3) * 100])
    data["nopbc"] = True
    if energy_t:
        data["energies"] = np.array(energy_t)
    if has_forces and forces_t:
        data["forces"] = np.array(forces_t)
    if has_hessian and hessian_t:
        data["hessian"] = np.array(hessian_t)
    return data
def detect_multiplicity(symbols: np.ndarray) -> int:
    """Find the minimal spin multiplicity of the given molecule.

    Parameters
    ----------
    symbols : np.ndarray
        element symbols; virtual elements are not supported

    Returns
    -------
    int
        spin multiplicity
    """
    # currently only support charge=0
    # an O2 molecule is a triplet
    is_oxygen_molecule = symbols.size == 2 and np.count_nonzero(symbols == ["O"]) == 2
    if is_oxygen_molecule:
        return 3
    # total electron count, assuming electrons pair up as much as possible
    total_electrons = sum(Element(s).Z for s in symbols)
    return 1 if total_electrons % 2 == 0 else 2
If not set, fallback to normal keywords + nproc : int, default=1 + Number of CPUs to use + + Returns + ------- + str + gjf output string + """ + coordinates = sys_data["coords"][0] + atom_names = sys_data["atom_names"] + atom_numbs = sys_data["atom_numbs"] + atom_types = sys_data["atom_types"] + # get atom symbols list + symbols = [atom_names[atom_type] for atom_type in atom_types] + + # assume default charge is zero and default spin multiplicity is 1 + if "charge" in sys_data.keys(): + charge = sys_data["charge"] + + use_fragment_guesses = False + if isinstance(multiplicity, int): + mult_auto = False + elif multiplicity == "auto": + mult_auto = True + else: + raise RuntimeError('The keyword "multiplicity" is illegal.') + + if fragment_guesses: + # Initial guess generated from fragment guesses + # New feature of Gaussian 16 + use_fragment_guesses = True + if not mult_auto: + warnings.warn("Automatically set multiplicity to auto!") + mult_auto = True + + if mult_auto: + frag_numb, frag_index = _crd2frag(symbols, coordinates) + if frag_numb == 1: + use_fragment_guesses = False + mult_frags = [] + for i in range(frag_numb): + idx = frag_index == i + mult_frags.append(detect_multiplicity(np.array(symbols)[idx])) + if use_fragment_guesses: + multiplicity = sum(mult_frags) - frag_numb + 1 - charge % 2 + chargekeywords_frag = "%d %d" % (charge, multiplicity) + "".join( # noqa: UP031 + [" %d %d" % (charge, mult_frag) for mult_frag in mult_frags] # noqa: UP031 + ) + else: + multi_frags = np.array(mult_frags) + multiplicity = ( + 1 + + np.count_nonzero(multi_frags == 2) % 2 + + np.count_nonzero(multi_frags == 3) * 2 + - charge % 2 + ) + + if ( + keywords_high_multiplicity is not None + and np.count_nonzero(multi_frags == 2) >= 2 + ): + # at least 2 radicals + keywords = keywords_high_multiplicity + + if isinstance(keywords, str): + keywords = [keywords] + else: + keywords = keywords.copy() + + buff = [] + # keywords, e.g., force b3lyp/6-31g** + if use_fragment_guesses: + 
keywords[0] = f"{keywords[0]} guess=fragment={frag_numb}" + + chkkeywords = [] + if len(keywords) > 1: + chkkeywords.append(f"%chk={str(uuid.uuid1())}.chk") + + nprockeywords = f"%nproc={nproc:d}" + # use formula as title + titlekeywords = "".join( + [f"{symbol}{numb}" for symbol, numb in zip(atom_names, atom_numbs)] + ) + chargekeywords = f"{charge} {multiplicity}" + + buff = [ + *chkkeywords, + nprockeywords, + f"#{keywords[0]}", + "", + titlekeywords, + "", + (chargekeywords_frag if use_fragment_guesses else chargekeywords), + ] + + for ii, (symbol, coordinate) in enumerate(zip(symbols, coordinates)): + if use_fragment_guesses: + buff.append( + "%s(Fragment=%d) %f %f %f" % (symbol, frag_index[ii] + 1, *coordinate) # noqa: UP031 + ) + else: + buff.append("{} {:f} {:f} {:f}".format(symbol, *coordinate)) # noqa: UP031 + if not sys_data.get("nopbc", False): + # PBC condition + cell = sys_data["cells"][0] + for ii in range(3): + # use TV as atomic symbol, see https://gaussian.com/pbc/ + buff.append("TV {:f} {:f} {:f}".format(*cell[ii])) + if basis_set is not None: + # custom basis set + buff.extend(["", basis_set, ""]) + for kw in itertools.islice(keywords, 1, None): + buff.extend( + [ + "\n--link1--", + *chkkeywords, + nprockeywords, + f"#{kw}", + "", + titlekeywords, + "", + chargekeywords, + "", + ] + ) + buff.append("\n") + return "\n".join(buff) + + +def read_gaussian_input(inp: str): + """Read Gaussian input. 
+ + Parameters + ---------- + inp : str + Gaussian input str + + Returns + ------- + dict + system data + """ + flag = 0 + coords = [] + elements = [] + cells = [] + for line in inp.split("\n"): + if not line.strip(): + # empty line + flag += 1 + elif flag == 0: + # keywords + if line.startswith("#"): + # setting + keywords = line.split() + elif line.startswith("%"): + pass + elif flag == 1: + # title + pass + elif flag == 2: + # multi and coords + s = line.split() + if len(s) == 2: + pass + elif len(s) == 4: + if s[0] == "TV": + cells.append(list(map(float, s[1:4]))) + else: + # element + elements.append(re.sub("\\(.*?\\)|\\{.*?}|\\[.*?]", "", s[0])) + coords.append(list(map(float, s[1:4]))) + elif flag == 3: + # end + break + atom_names, atom_types, atom_numbs = np.unique( + elements, return_inverse=True, return_counts=True + ) + if len(cells): + nopbc = False + else: + nopbc = True + cells = np.array([np.eye(3)]) * 100 + return { + "atom_names": list(atom_names), + "atom_numbs": list(atom_numbs), + "atom_types": atom_types, + "cells": np.array(cells).reshape(1, 3, 3), + "nopbc": nopbc, + "coords": np.array(coords).reshape(1, -1, 3), + "orig": np.zeros(3), + } diff --git a/dpdata/formats/gaussian/log.py b/dpdata/formats/gaussian/log.py new file mode 100644 index 000000000..a94141901 --- /dev/null +++ b/dpdata/formats/gaussian/log.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from ...periodic_table import ELEMENTS +from ...unit import EnergyConversion, ForceConversion, LengthConversion + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + +symbols = ["X"] + ELEMENTS + + +def to_system_data(file_name: FileType, md=False): + """Read Gaussian log file. 
+ + Parameters + ---------- + file_name : str + file name + md : bool, default False + whether to read multiple frames + + Returns + ------- + data : dict + system data + + Raises + ------ + RuntimeError + if the input orientation is not found + """ + data = {} + # read from log lines + flag = 0 + energy_t = [] + coords_t = [] + atom_symbols = [] + forces_t = [] + cells_t = [] + nopbc = True + coords = None + + with open_file(file_name) as fp: + for line in fp: + if line.startswith(" SCF Done"): + # energies + energy = float(line.split()[4]) + elif line.startswith( + " Center Atomic Forces (Hartrees/Bohr)" + ): + flag = 1 + forces = [] + elif line.startswith( + " Input orientation:" + ) or line.startswith(" Z-Matrix orientation:"): + flag = 5 + coords = [] + atom_symbols = [] + cells = [] + + if 1 <= flag <= 3 or 5 <= flag <= 9: + flag += 1 + elif flag == 4: + # forces + if line.startswith(" -------"): + if coords is None: + raise RuntimeError( + "Input orientation is not found. Using Gaussian keyword " + "`Geom=PrintInputOrient` to always print the input orientation. " + "See https://gaussian.com/geom/ for more details." 
+ ) + forces_t.append(forces) + energy_t.append(energy) + coords_t.append(coords) + if cells: + nopbc = False + cells_t.append(cells) + else: + cells_t.append( + [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] + ) + flag = 0 + coords = None + else: + s = line.split() + if line[14:16] == "-2": + # PBC + pass + else: + forces.append( + [float(line[23:38]), float(line[38:53]), float(line[53:68])] + ) + elif flag == 10: + # atom_symbols and coords + if line.startswith(" -------"): + flag = 0 + else: + s = line.split() + if int(s[1]) == -2: + # PBC cells, see https://gaussian.com/pbc/ + cells.append([float(x) for x in s[3:6]]) + else: + coords.append([float(x) for x in s[3:6]]) + atom_symbols.append(symbols[int(s[1])]) + + assert coords_t, "cannot find coords" + assert energy_t, "cannot find energies" + assert forces_t, "cannot find forces" + + atom_names, data["atom_types"], atom_numbs = np.unique( + atom_symbols, return_inverse=True, return_counts=True + ) + data["atom_names"] = list(atom_names) + data["atom_numbs"] = list(atom_numbs) + if not md: + forces_t = forces_t[-1:] + energy_t = energy_t[-1:] + coords_t = coords_t[-1:] + cells_t = cells_t[-1:] + data["forces"] = np.array(forces_t) * force_convert + data["energies"] = np.array(energy_t) * energy_convert + data["coords"] = np.array(coords_t) + data["orig"] = np.array([0, 0, 0]) + data["cells"] = np.array(cells_t) + data["nopbc"] = nopbc + return data diff --git a/dpdata/formats/gromacs/__init__.py b/dpdata/formats/gromacs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/gromacs/gro.py b/dpdata/formats/gromacs/gro.py new file mode 100644 index 000000000..0c61544fd --- /dev/null +++ b/dpdata/formats/gromacs/gro.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from 
...unit import LengthConversion + +nm2ang = LengthConversion("nm", "angstrom").value() +ang2nm = LengthConversion("angstrom", "nm").value() +cell_idx_gmx2dp = [0, 4, 8, 1, 2, 3, 5, 6, 7] + + +def _format_atom_name(atom_name): + patt = re.compile("[a-zA-Z]*") + match = re.search(patt, atom_name) + fmt_name = match.group().capitalize() + return fmt_name + + +def _get_line(line, fmt_atom_name=True): + atom_name = line[10:15].split()[0] + if fmt_atom_name: + atom_name = _format_atom_name(atom_name) + atom_idx = int(line[15:20].split()[0]) + posis = [float(line[ii : ii + 8]) for ii in range(20, 44, 8)] + posis = np.array(posis) * nm2ang + return atom_name, atom_idx, posis + + +def _get_cell(line): + cell = np.zeros([3, 3]) + lengths = [float(ii) for ii in line.split()] + if len(lengths) >= 3: + for dd in range(3): + cell[dd][dd] = lengths[dd] + else: + raise RuntimeError("wrong box format: ", line) + if len(lengths) == 9: + cell[0][1] = lengths[3] + cell[0][2] = lengths[4] + cell[1][0] = lengths[5] + cell[1][2] = lengths[6] + cell[2][0] = lengths[7] + cell[2][1] = lengths[8] + cell = cell * nm2ang + return cell + + +def file_to_system_data(fname: FileType, format_atom_name=True, **kwargs): + system = {"coords": [], "cells": []} + with open_file(fname) as fp: + frame = 0 + while True: + flag = fp.readline() + if not flag: + break + else: + frame += 1 + names = [] + idxs = [] + posis = [] + natoms = int(fp.readline()) + for ii in range(natoms): + n, i, p = _get_line(fp.readline(), fmt_atom_name=format_atom_name) + names.append(n) + idxs.append(i) + posis.append(p) + cell = _get_cell(fp.readline()) + posis = np.array(posis) + if frame == 1: + system["orig"] = np.zeros(3) + system["atom_names"] = list(set(names)) + system["atom_numbs"] = [ + names.count(ii) for ii in system["atom_names"] + ] + system["atom_types"] = [ + system["atom_names"].index(ii) for ii in names + ] + system["atom_types"] = np.array(system["atom_types"], dtype=int) + system["coords"].append(posis) + 
system["cells"].append(cell) + system["coords"] = np.array(system["coords"]) + system["cells"] = np.array(system["cells"]) + return system + + +def from_system_data(system, f_idx=0, **kwargs): + resname = kwargs.get("resname", "MOL") + shift = kwargs.get("shift", 0) + ret = "" + ret += " molecule" + "\n" + n_atoms = sum(system["atom_numbs"]) + ret += " " + str(n_atoms) + "\n" + for i in range(n_atoms): + atom_type = system["atom_types"][i] + atom_name = system["atom_names"][atom_type] + coords = system["coords"][f_idx] * ang2nm + ret += "{:>5d}{:<5s}{:>5s}{:5d}{:8.3f}{:8.3f}{:8.3f}\n".format( + 1, resname, atom_name, i + shift + 1, *tuple(coords[i]) + ) + cell = (system["cells"][f_idx].flatten() * ang2nm)[cell_idx_gmx2dp] + ret += " " + " ".join([f"{x:.3f}" for x in cell]) + + return ret diff --git a/dpdata/formats/lammps/__init__.py b/dpdata/formats/lammps/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/lammps/dump.py b/dpdata/formats/lammps/dump.py new file mode 100644 index 000000000..89e75e4de --- /dev/null +++ b/dpdata/formats/lammps/dump.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +lib_path = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(lib_path) +import warnings + +import lmp + + +class UnwrapWarning(UserWarning): + pass + + +warnings.simplefilter("once", UnwrapWarning) + + +def _get_block(lines, key): + for idx in range(len(lines)): + if ("ITEM: " + key) in lines[idx]: + break + idx_s = idx + 1 + for idx in range(idx_s, len(lines)): + if ("ITEM: ") in lines[idx]: + break + idx_e = idx + if idx_e == len(lines) - 1: + idx_e += 1 + return lines[idx_s:idx_e], lines[idx_s - 1] + + +def get_atype(lines, type_idx_zero=False): + blk, head = _get_block(lines, "ATOMS") + keys = head.split() + id_idx = 
keys.index("id") - 2 + tidx = keys.index("type") - 2 + atype = [] + for ii in blk: + atype.append([int(ii.split()[id_idx]), int(ii.split()[tidx])]) + atype.sort() + atype = np.array(atype, dtype=int) + if type_idx_zero: + return atype[:, 1] - 1 + else: + return atype[:, 1] + + +def get_natoms(lines): + blk, head = _get_block(lines, "NUMBER OF ATOMS") + return int(blk[0]) + + +def get_natomtypes(lines): + atype = get_atype(lines) + return max(atype) + + +def get_natoms_vec(lines): + atype = get_atype(lines) + natoms_vec = [] + natomtypes = get_natomtypes(lines) + for ii in range(natomtypes): + natoms_vec.append(sum(atype == ii + 1)) + assert sum(natoms_vec) == get_natoms(lines) + return natoms_vec + + +def get_coordtype_and_scalefactor(keys): + # 4 types in total,with different scaling factor + key_pc = ["x", "y", "z"] # plain cartesian, sf = 1 + key_uc = ["xu", "yu", "zu"] # unwraped cartesian, sf = 1 + key_s = ["xs", "ys", "zs"] # scaled by lattice parameter, sf = lattice parameter + key_su = ["xsu", "ysu", "zsu"] # scaled and unfolded,sf = lattice parameter + lmp_coor_type = [key_pc, key_uc, key_s, key_su] + sf = [0, 0, 1, 1] + uw = [0, 1, 0, 1] # unwraped or not + for k in range(4): + if all(i in keys for i in lmp_coor_type[k]): + return lmp_coor_type[k], sf[k], uw[k] + + +def safe_get_posi(lines, cell, orig=np.zeros(3), unwrap=False): + blk, head = _get_block(lines, "ATOMS") + keys = head.split() + coord_tp_and_sf = get_coordtype_and_scalefactor(keys) + assert coord_tp_and_sf is not None, "Dump file does not contain atomic coordinates!" 
+ coordtype, sf, uw = coord_tp_and_sf + id_idx = keys.index("id") - 2 + xidx = keys.index(coordtype[0]) - 2 + yidx = keys.index(coordtype[1]) - 2 + zidx = keys.index(coordtype[2]) - 2 + posis = [] + for ii in blk: + words = ii.split() + posis.append( + [ + float(words[id_idx]), + float(words[xidx]), + float(words[yidx]), + float(words[zidx]), + ] + ) + posis.sort() + posis = np.array(posis)[:, 1:4] + if not sf: + posis = (posis - orig) @ np.linalg.inv( + cell + ) # Convert to scaled coordinates for unscaled coordinates + if uw and unwrap: + return ( + posis @ cell + ) # convert scaled coordinates back to Cartesien coordinates unwrap at the periodic boundaries + else: + if uw and not unwrap: + warnings.warn( + message="Your dump file contains unwrapped coordinates, but you did not specify unwrapping (unwrap = True). The default is wrapping at periodic boundaries (unwrap = False).\n", + category=UnwrapWarning, + ) + return ( + (posis % 1) @ cell + ) # Convert scaled coordinates back to Cartesien coordinates with wraping at periodic boundary conditions + + +def get_dumpbox(lines): + blk, h = _get_block(lines, "BOX BOUNDS") + bounds = np.zeros([3, 2]) + tilt = np.zeros([3]) + load_tilt = "xy xz yz" in h + for dd in range(3): + info = [float(jj) for jj in blk[dd].split()] + bounds[dd][0] = info[0] + bounds[dd][1] = info[1] + if load_tilt: + tilt[dd] = info[2] + return bounds, tilt + + +def dumpbox2box(bounds, tilt): + xy = tilt[0] + xz = tilt[1] + yz = tilt[2] + xlo = bounds[0][0] - min(0.0, xy, xz, xy + xz) + xhi = bounds[0][1] - max(0.0, xy, xz, xy + xz) + ylo = bounds[1][0] - min(0.0, yz) + yhi = bounds[1][1] - max(0.0, yz) + zlo = bounds[2][0] + zhi = bounds[2][1] + info = [[xlo, xhi], [ylo, yhi], [zlo, zhi]] + return lmp.lmpbox2box(info, tilt) + + +def box2dumpbox(orig, box): + lohi, tilt = lmp.box2lmpbox(orig, box) + xy = tilt[0] + xz = tilt[1] + yz = tilt[2] + bounds = np.zeros([3, 2]) + bounds[0][0] = lohi[0][0] + min(0.0, xy, xz, xy + xz) + bounds[0][1] = 
lohi[0][1] + max(0.0, xy, xz, xy + xz) + bounds[1][0] = lohi[1][0] + min(0.0, yz) + bounds[1][1] = lohi[1][1] + max(0.0, yz) + bounds[2][0] = lohi[2][0] + bounds[2][1] = lohi[2][1] + return bounds, tilt + + +def load_file(fname: FileType, begin=0, step=1): + lines = [] + buff = [] + cc = -1 + with open_file(fname) as fp: + while True: + line = fp.readline().rstrip("\n") + if not line: + if cc >= begin and (cc - begin) % step == 0: + lines += buff + buff = [] + cc += 1 + return lines + if "ITEM: TIMESTEP" in line: + if cc >= begin and (cc - begin) % step == 0: + lines += buff + buff = [] + cc += 1 + if cc >= begin and (cc - begin) % step == 0: + buff.append(line) + + +def get_spin_keys(inputfile): + """ + Read input file and get the keys for spin info in dump. + + Parameters + ---------- + inputfile : str + Path to the input file. + + Returns + ------- + list or None + List of spin info keys if found, None otherwise. + """ + if inputfile is None: + return None + + if not os.path.isfile(inputfile): + warnings.warn(f"Input file {inputfile} not found.") + return None + + with open(inputfile) as f: + for line in f.readlines(): + ls = line.split() + if ( + len(ls) > 7 + and ls[0] == "compute" + and all(key in ls for key in ["sp", "spx", "spy", "spz"]) + ): + compute_name = ls[1] + return [ + f"c_{compute_name}[{ls.index(key) - 3}]" + for key in ["sp", "spx", "spy", "spz"] + ] + + return None + + +def get_spin(lines, spin_keys): + """ + Get the spin info from the dump file. + + Parameters + ---------- + lines : list + The content of the dump file. + spin_keys : list + The keys for spin info in dump file. 
+ the spin info is stored in sp, spx, spy, spz or spin_keys, which is the spin norm and the spin vector + 1 1 0.00141160 5.64868599 0.01005602 1.54706291 0.00000000 0.00000000 1.00000000 -1.40772100 -2.03739417 -1522.64797384 -0.00397809 -0.00190426 -0.00743976 + """ + blk, head = _get_block(lines, "ATOMS") + heads = head.split() + + if spin_keys is not None and all(i in heads for i in spin_keys): + key = spin_keys + else: + return None + + try: + idx_id = heads.index("id") - 2 + idx_sp, idx_spx, idx_spy, idx_spz = (heads.index(k) - 2 for k in key) + + norm = [] + vec = [] + atom_ids = [] + for line in blk: + words = line.split() + norm.append([float(words[idx_sp])]) + vec.append( + [float(words[idx_spx]), float(words[idx_spy]), float(words[idx_spz])] + ) + atom_ids.append(int(words[idx_id])) + + spin = np.array(norm) * np.array(vec) + atom_ids, spin = zip(*sorted(zip(atom_ids, spin))) + return np.array(spin) + except (ValueError, IndexError) as e: + warnings.warn(f"Error processing spin data: {str(e)}") + return None + + +def system_data( + lines, type_map=None, type_idx_zero=True, unwrap=False, input_file=None +): + array_lines = split_traj(lines) + lines = array_lines[0] + system = {} + system["atom_numbs"] = get_natoms_vec(lines) + system["atom_names"] = [] + if type_map is None: + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append("TYPE_%d" % ii) # noqa: UP031 + else: + assert len(type_map) >= len(system["atom_numbs"]) + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append(type_map[ii]) + bounds, tilt = get_dumpbox(lines) + orig, cell = dumpbox2box(bounds, tilt) + system["orig"] = np.array(orig) - np.array(orig) + system["cells"] = [np.array(cell)] + system["atom_types"] = get_atype(lines, type_idx_zero=type_idx_zero) + system["coords"] = [safe_get_posi(lines, cell, np.array(orig), unwrap)] + spin_keys = get_spin_keys(input_file) + spin = get_spin(lines, spin_keys) + has_spin = False + if spin is not None: + 
system["spins"] = [spin] + has_spin = True + for ii in range(1, len(array_lines)): + bounds, tilt = get_dumpbox(array_lines[ii]) + orig, cell = dumpbox2box(bounds, tilt) + system["cells"].append(cell) + atype = get_atype(array_lines[ii], type_idx_zero=type_idx_zero) + # map atom type; a[as[a][as[as[b]]]] = b[as[b][as^{-1}[b]]] = b[id] + idx = np.argsort(atype, kind="stable")[ + np.argsort(np.argsort(system["atom_types"], kind="stable"), kind="stable") + ] + system["coords"].append( + safe_get_posi(array_lines[ii], cell, np.array(orig), unwrap)[idx] + ) + if has_spin: + spin = get_spin(array_lines[ii], spin_keys) + if spin is not None: + system["spins"].append(spin[idx]) + else: + warnings.warn( + f"Warning: spin info is not found in frame {ii}, remove spin info." + ) + system.pop("spins") + has_spin = False + if has_spin: + system["spins"] = np.array(system["spins"]) + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + return system + + +def split_traj(dump_lines): + marks = [] + for idx, ii in enumerate(dump_lines): + if "ITEM: TIMESTEP" in ii: + marks.append(idx) + if len(marks) == 0: + return None + elif len(marks) == 1: + return [dump_lines] + else: + block_size = marks[1] - marks[0] + ret = [] + for ii in marks: + ret.append(dump_lines[ii : ii + block_size]) + # for ii in range(len(marks)-1): + # assert(marks[ii+1] - marks[ii] == block_size) + return ret + return None + + +def from_system_data(system, f_idx=0, timestep=0): + """Convert system data to LAMMPS dump format string. + + Parameters + ---------- + system : dict + System data dictionary containing atoms, coordinates, cell, etc. 
+ f_idx : int, optional + Frame index to dump (default: 0) + timestep : int, optional + Timestep number for the dump (default: 0) + + Returns + ------- + str + LAMMPS dump format string + """ + ret = "" + + # Get basic system info + natoms = sum(system["atom_numbs"]) + coords = system["coords"][f_idx] + cell = system["cells"][f_idx] + atom_types = system["atom_types"] + orig = system.get("orig", np.zeros(3)) + + # Convert cell to dump format (bounds and tilt) + bounds, tilt = box2dumpbox(orig, cell) + + # Write timestep + ret += "ITEM: TIMESTEP\n" + ret += f"{timestep}\n" + + # Write number of atoms + ret += "ITEM: NUMBER OF ATOMS\n" + ret += f"{natoms}\n" + + # Write box bounds + ret += "ITEM: BOX BOUNDS xy xz yz pp pp pp\n" + ret += f"{bounds[0][0]:.10f} {bounds[0][1]:.10f} {tilt[0]:.10f}\n" + ret += f"{bounds[1][0]:.10f} {bounds[1][1]:.10f} {tilt[1]:.10f}\n" + ret += f"{bounds[2][0]:.10f} {bounds[2][1]:.10f} {tilt[2]:.10f}\n" + + # Write atoms header + ret += "ITEM: ATOMS id type x y z\n" + + # Write atom data + for ii in range(natoms): + atom_id = ii + 1 # LAMMPS uses 1-based indexing + atom_type = atom_types[ii] + 1 # LAMMPS uses 1-based type indexing + x, y, z = coords[ii] + ret += f"{atom_id} {atom_type} {x:.10f} {y:.10f} {z:.10f}\n" + + return ret + + +if __name__ == "__main__": + # fname = 'dump.hti' + # lines = open(fname).read().split('\n') + # # print(get_natoms(lines)) + # # print(get_natomtypes(lines)) + # # print(get_natoms_vec(lines)) + # posi = get_posi(lines) + # dbox1, tilt1 = box2dumpbox(orig, box) + # print(dbox - dbox1) + # print(tilt - tilt1) + # print(orig) + # print(box) + # np.savetxt('tmp.out', posi - orig, fmt='%.6f') + # print(system_data(lines)) + lines = load_file("conf_unfold.dump", begin=0, step=1) + al = split_traj(lines) + s = system_data(lines, ["O", "H"]) + # l = np.linalg.norm(s['cells'][1],axis=1) + # p = s['coords'][0] + l + # np.savetxt('p',p,fmt='%1.10f') diff --git a/dpdata/formats/lammps/lmp.py 
b/dpdata/formats/lammps/lmp.py new file mode 100644 index 000000000..c9d60ec53 --- /dev/null +++ b/dpdata/formats/lammps/lmp.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import numpy as np + +from dpdata.periodic_table import ELEMENTS, Element + +ptr_float_fmt = "%15.10f" +ptr_int_fmt = "%6d" +ptr_key_fmt = "%15s" + +# Mapping of LAMMPS atom styles to their column layouts +# Format: (atom_id_col, atom_type_col, x_col, y_col, z_col, has_molecule_id, has_charge, charge_col) +ATOM_STYLE_COLUMNS = { + "atomic": (0, 1, 2, 3, 4, False, False, None), + "angle": (0, 2, 3, 4, 5, True, False, None), + "bond": (0, 2, 3, 4, 5, True, False, None), + "charge": (0, 1, 3, 4, 5, False, True, 2), + "full": (0, 2, 4, 5, 6, True, True, 3), + "molecular": (0, 2, 3, 4, 5, True, False, None), + "dipole": (0, 1, 3, 4, 5, False, True, 2), + "sphere": (0, 1, 4, 5, 6, False, False, None), +} + + +def detect_atom_style(lines: list[str]) -> str | None: + """Detect LAMMPS atom style from data file content. 
+ + Parameters + ---------- + lines : list + Lines from LAMMPS data file + + Returns + ------- + str or None + Detected atom style, or None if not detected + """ + # Look for atom style in comments after "Atoms" section header + atom_lines = get_atoms(lines) + if not atom_lines: + return None + + # Find the "Atoms" line + for idx, line in enumerate(lines): + if "Atoms" in line: + # Check if there's a comment with atom style after "Atoms" + if "#" in line: + comment_part = line.split("#")[1].strip().lower() + for style in ATOM_STYLE_COLUMNS: + if style in comment_part: + return style + break + + # If no explicit style found, try to infer from first data line + if atom_lines: + first_line = atom_lines[0].split() + num_cols = len(first_line) + + # Try to match based on number of columns and content patterns + # This is a heuristic approach + if num_cols == 5: + # Could be atomic style: atom-ID atom-type x y z + return "atomic" + elif num_cols == 6: + # Could be charge or bond/molecular style + # Try to determine if column 2 (index 2) looks like a charge (float) or type (int) + try: + val = float(first_line[2]) + # If it's a small float, likely a charge + if abs(val) < 10 and val != int(val): + return "charge" + else: + # Likely molecule ID (integer), so bond/molecular style + return "bond" + except ValueError: + return "atomic" # fallback + elif num_cols == 7: + # Could be full style: atom-ID molecule-ID atom-type charge x y z + return "full" + elif num_cols >= 8: + # Could be dipole or sphere style + # For now, default to dipole if we have enough columns + return "dipole" + + return None # Unable to detect + + +def _get_block(lines, keys): + for idx in range(len(lines)): + if keys in lines[idx]: + break + if idx == len(lines) - 1: + return None + idx_s = idx + 2 + idx = idx_s + ret = [] + while True: + if idx == len(lines) or len(lines[idx].split()) == 0: + break + else: + ret.append(lines[idx]) + idx += 1 + return ret + + +def lmpbox2box(lohi, tilt): + xy = tilt[0] 
+ xz = tilt[1] + yz = tilt[2] + orig = np.array([lohi[0][0], lohi[1][0], lohi[2][0]]) + lens = [] + for dd in range(3): + lens.append(lohi[dd][1] - lohi[dd][0]) + xx = [lens[0], 0, 0] + yy = [xy, lens[1], 0] + zz = [xz, yz, lens[2]] + return orig, np.array([xx, yy, zz]) + + +def box2lmpbox(orig, box): + lohi = np.zeros([3, 2]) + for dd in range(3): + lohi[dd][0] = orig[dd] + tilt = np.zeros(3) + tilt[0] = box[1][0] + tilt[1] = box[2][0] + tilt[2] = box[2][1] + lens = np.zeros(3) + lens[0] = box[0][0] + lens[1] = box[1][1] + lens[2] = box[2][2] + for dd in range(3): + lohi[dd][1] = lohi[dd][0] + lens[dd] + return lohi, tilt + + +def get_atoms(lines): + return _get_block(lines, "Atoms") + + +def get_natoms(lines): + for ii in lines: + if "atoms" in ii: + return int(ii.split()[0]) + return None + + +def get_natomtypes(lines): + for ii in lines: + if "atom types" in ii: + return int(ii.split()[0]) + return None + + +def _atom_info_mol(line): + vec = line.split() + # idx, mole_type, atom_type, charge, x, y, z + return ( + int(vec[0]), + int(vec[1]), + int(vec[2]), + float(vec[3]), + float(vec[4]), + float(vec[5]), + float(vec[6]), + ) + + +def _atom_info_atom(line): + vec = line.split() + # idx, atom_type, x, y, z + return int(vec[0]), int(vec[1]), float(vec[2]), float(vec[3]), float(vec[4]) + + +def _atom_info_style(line: str, atom_style: str = "atomic") -> dict[str, int | float]: + """Parse atom information based on the specified atom style. + + Parameters + ---------- + line : str + The atom line from LAMMPS data file + atom_style : str + The LAMMPS atom style (atomic, full, charge, etc.) + + Returns + ------- + dict + Dictionary containing parsed atom information with keys: + 'atom_id', 'atom_type', 'x', 'y', 'z', 'molecule_id' (if present), 'charge' (if present) + """ + if atom_style not in ATOM_STYLE_COLUMNS: + raise ValueError( + f"Unsupported atom style: {atom_style}. 
Supported styles: {list(ATOM_STYLE_COLUMNS.keys())}" + ) + + vec = line.split() + columns = ATOM_STYLE_COLUMNS[atom_style] + + result = { + "atom_id": int(vec[columns[0]]), + "atom_type": int(vec[columns[1]]), + "x": float(vec[columns[2]]), + "y": float(vec[columns[3]]), + "z": float(vec[columns[4]]), + } + + # Add molecule ID if present + if columns[5]: # has_molecule_id + result["molecule_id"] = int( + vec[1] + ) # molecule ID is always in column 1 when present + + # Add charge if present + if columns[6]: # has_charge + result["charge"] = float(vec[columns[7]]) # charge_col + + return result + + +def get_natoms_vec(lines: list[str], atom_style: str = "atomic") -> list[int]: + """Get number of atoms for each atom type. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + list + Number of atoms for each atom type + """ + atype = get_atype(lines, atom_style=atom_style) + natoms_vec = [] + natomtypes = get_natomtypes(lines) + for ii in range(natomtypes): + natoms_vec.append(sum(atype == ii + 1)) + assert sum(natoms_vec) == get_natoms(lines) + return natoms_vec + + +def get_atype( + lines: list[str], type_idx_zero: bool = False, atom_style: str = "atomic" +) -> np.ndarray: + """Get atom types from LAMMPS data file. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray + Array of atom types + """ + alines = get_atoms(lines) + atype = [] + for ii in alines: + atom_info = _atom_info_style(ii, atom_style) + at = atom_info["atom_type"] + if type_idx_zero: + atype.append(at - 1) + else: + atype.append(at) + return np.array(atype, dtype=int) + + +def get_posi(lines: list[str], atom_style: str = "atomic") -> np.ndarray: + """Get atomic positions from LAMMPS data file. 
+ + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray + Array of atomic positions + """ + atom_lines = get_atoms(lines) + posis = [] + for ii in atom_lines: + atom_info = _atom_info_style(ii, atom_style) + posis.append([atom_info["x"], atom_info["y"], atom_info["z"]]) + return np.array(posis) + + +def get_charges(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: + """Get atomic charges from LAMMPS data file if the atom style supports charges. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + atom_style : str + The LAMMPS atom style + + Returns + ------- + np.ndarray or None + Array of atomic charges if atom style has charges, None otherwise + """ + if atom_style not in ATOM_STYLE_COLUMNS: + raise ValueError(f"Unsupported atom style: {atom_style}") + + # Check if this atom style has charges + if not ATOM_STYLE_COLUMNS[atom_style][6]: # has_charge + return None + + atom_lines = get_atoms(lines) + charges = [] + for ii in atom_lines: + atom_info = _atom_info_style(ii, atom_style) + charges.append(atom_info["charge"]) + return np.array(charges) + + +def get_spins(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: + atom_lines = get_atoms(lines) + if len(atom_lines[0].split()) < 8: + return None + spins_ori = [] + spins_norm = [] + for ii in atom_lines: + iis = ii.split() + spins_ori.append([float(jj) for jj in iis[5:8]]) + spins_norm.append([float(iis[-1])]) + return np.array(spins_ori) * np.array(spins_norm) + + +def get_lmpbox(lines): + box_info = [] + tilt = np.zeros(3) + for ii in lines: + if "xlo" in ii and "xhi" in ii: + box_info.append([float(ii.split()[0]), float(ii.split()[1])]) + break + for ii in lines: + if "ylo" in ii and "yhi" in ii: + box_info.append([float(ii.split()[0]), float(ii.split()[1])]) + break + for ii in lines: + if "zlo" in ii and "zhi" in ii: + box_info.append([float(ii.split()[0]), 
float(ii.split()[1])]) + break + for ii in lines: + if "xy" in ii and "xz" in ii and "yz" in ii: + tilt = np.array([float(jj) for jj in ii.split()[0:3]]) + return box_info, tilt + + +def system_data( + lines: list[str], + type_map: list[str] | None = None, + type_idx_zero: bool = True, + atom_style: str = "atomic", +) -> dict: + """Parse LAMMPS data file to system data format. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_map : list, optional + Mapping from atom types to element names + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style (atomic, full, charge, etc.) + + Returns + ------- + dict + System data dictionary + """ + system = {} + system["atom_numbs"] = get_natoms_vec(lines, atom_style=atom_style) + system["atom_names"] = [] + if type_map is None: + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append("Type_%d" % ii) # noqa: UP031 + else: + assert len(type_map) >= len(system["atom_numbs"]) + for ii in range(len(system["atom_numbs"])): + system["atom_names"].append(type_map[ii]) + lohi, tilt = get_lmpbox(lines) + orig, cell = lmpbox2box(lohi, tilt) + system["orig"] = np.array(orig) + system["cells"] = [np.array(cell)] + natoms = sum(system["atom_numbs"]) + system["atom_types"] = get_atype( + lines, type_idx_zero=type_idx_zero, atom_style=atom_style + ) + system["coords"] = [get_posi(lines, atom_style=atom_style)] + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + + # Add charges if the atom style supports them + charges = get_charges(lines, atom_style=atom_style) + if charges is not None: + system["charges"] = np.array([charges]) + + spins = get_spins(lines, atom_style=atom_style) + if spins is not None: + system["spins"] = np.array([spins]) + + return system + + +def to_system_data( + lines: list[str], + type_map: list[str] | None = None, + type_idx_zero: bool = True, + atom_style: str = 
"atomic", +) -> dict: + """Parse LAMMPS data file to system data format. + + Parameters + ---------- + lines : list + Lines from LAMMPS data file + type_map : list, optional + Mapping from atom types to element names + type_idx_zero : bool + Whether to use zero-based indexing for atom types + atom_style : str + The LAMMPS atom style. If "auto", attempts to detect automatically + from file. Default is "atomic". + + Returns + ------- + dict + System data dictionary + """ + # Attempt automatic detection if requested + if atom_style == "auto": + detected_style = detect_atom_style(lines) + if detected_style: + atom_style = detected_style + else: + atom_style = "atomic" # fallback to default + + return system_data( + lines, type_map=type_map, type_idx_zero=type_idx_zero, atom_style=atom_style + ) + + +def rotate_to_lower_triangle( + cell: np.ndarray, coord: np.ndarray +) -> tuple[np.ndarray, np.ndarray]: + """Rotate the cell to lower triangular and ensure the diagonal elements are non-negative. + + Args: + cell (np.ndarray): The original cell matrix. + coord (np.ndarray): The coordinates of the atoms. + + Returns + ------- + tuple[np.ndarray, np.ndarray]: The rotated cell and adjusted coordinates. + """ + q, _ = np.linalg.qr(cell.T) + cell = np.matmul(cell, q) + coord = np.matmul(coord, q) + + # Ensure the diagonal elements of the cell are non-negative + rot = np.eye(3) + if cell[0][0] < 0: + rot[0][0] = -1 + if cell[1][1] < 0: + rot[1][1] = -1 + if cell[2][2] < 0: + rot[2][2] = -1 + cell = np.matmul(cell, rot) + coord = np.matmul(coord, rot) + return cell, coord + + +def _get_lammps_masses(system) -> np.ndarray | None: + """Get masses for the LAMMPS ``Masses`` section. + + Prefer explicitly stored masses when available. Otherwise, infer masses from + ``atom_names`` when all names are valid chemical element symbols. 
+ + Parameters + ---------- + system : dict + System data dictionary + + Returns + ------- + np.ndarray or None + Per-type masses aligned with ``atom_names``. Returns ``None`` when the + masses cannot be determined safely. + + Raises + ------ + ValueError + If explicit ``system["masses"]`` is present but does not match the + length of ``atom_names``. + """ + atom_names = system["atom_names"] + masses = system.get("masses") + if masses is not None: + masses = np.asarray(masses, dtype=float) + if masses.ndim != 1 or len(masses) != len(atom_names): + raise ValueError( + 'Explicit system["masses"] must be a 1D array with the same ' + 'length as system["atom_names"] to write the LAMMPS Masses ' + "section." + ) + return masses + + if not all(name in ELEMENTS for name in atom_names): + return None + + return np.array([Element(name).mass for name in atom_names], dtype=float) + + +def from_system_data(system, f_idx=0): + ret = "" + ret += "\n" + natoms = sum(system["atom_numbs"]) + ntypes = len(system["atom_numbs"]) + cell, coord = rotate_to_lower_triangle( + system["cells"][f_idx], system["coords"][f_idx] + ) + ret += "%d atoms\n" % natoms # noqa: UP031 + ret += "%d atom types\n" % ntypes # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " xlo xhi\n") % ( + 0, + cell[0][0], + ) # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " ylo yhi\n") % ( + 0, + cell[1][1], + ) # noqa: UP031 + ret += (ptr_float_fmt + " " + ptr_float_fmt + " zlo zhi\n") % ( + 0, + cell[2][2], + ) # noqa: UP031 + ret += ( + ptr_float_fmt + " " + ptr_float_fmt + " " + ptr_float_fmt + " xy xz yz\n" + ) % ( + cell[1][0], + cell[2][0], + cell[2][1], + ) # noqa: UP031 + ret += "\n" + + masses = _get_lammps_masses(system) + if masses is not None: + ret += "Masses\n" + ret += "\n" + mass_fmt = ptr_int_fmt + " " + ptr_float_fmt + " # %s\n" # noqa: UP031 + for ii, (mass, atom_name) in enumerate(zip(masses, system["atom_names"])): + ret += mass_fmt % (ii + 1, mass, atom_name) + ret += 
"\n" + + ret += "Atoms # atomic\n" + ret += "\n" + coord_fmt = ( + ptr_int_fmt + + " " + + ptr_int_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + "\n" + ) # noqa: UP031 + + if "spins" in system: + coord_fmt = ( + coord_fmt.strip("\n") + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + " " + + ptr_float_fmt + + "\n" + ) # noqa: UP031 + spins_norm = np.linalg.norm(system["spins"][f_idx], axis=1) + for ii in range(natoms): + if "spins" in system: + if spins_norm[ii] != 0: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + system["spins"][f_idx][ii][0] / spins_norm[ii], + system["spins"][f_idx][ii][1] / spins_norm[ii], + system["spins"][f_idx][ii][2] / spins_norm[ii], + spins_norm[ii], + ) # noqa: UP031 + else: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + system["spins"][f_idx][ii][0], + system["spins"][f_idx][ii][1], + system["spins"][f_idx][ii][2] + 1, + spins_norm[ii], + ) # noqa: UP031 + else: + ret += coord_fmt % ( + ii + 1, + system["atom_types"][ii] + 1, + coord[ii][0] - system["orig"][0], + coord[ii][1] - system["orig"][1], + coord[ii][2] - system["orig"][2], + ) # noqa: UP031 + return ret + + +if __name__ == "__main__": + fname = "water-SPCE.data" + lines = open(fname).read().split("\n") + bonds, tilt = get_lmpbox(lines) + # print(bonds, tilt) + orig, box = lmpbox2box(bonds, tilt) + # print(orig, box) + bonds1, tilt1 = box2lmpbox(orig, box) + # print(bonds1, tilt1) + print(bonds1 - bonds) + print(tilt1 - tilt) + print(box) + print(get_atype(lines)) + print(get_posi(lines)) diff --git a/dpdata/formats/lmdb/__init__.py b/dpdata/formats/lmdb/__init__.py new file mode 100644 index 000000000..53a3e8f0e --- /dev/null +++ 
b/dpdata/formats/lmdb/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .format import LMDBFormat + +__all__ = ["LMDBFormat"] diff --git a/dpdata/formats/lmdb/format.py b/dpdata/formats/lmdb/format.py new file mode 100644 index 000000000..9b518be6b --- /dev/null +++ b/dpdata/formats/lmdb/format.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import os + +import lmdb +import msgpack +import msgpack_numpy as m +import numpy as np + +from dpdata.format import Format + +m.patch() + + +class LMDBError(Exception): + """Base class for LMDB errors.""" + + +class LMDBMetadataError(LMDBError): + """Metadata not found in LMDB.""" + + +class LMDBFrameError(LMDBError): + """Frame data not found in LMDB.""" + + +class LMDBFormat(Format): + """ + Class for handling the LMDB format, which stores atomic configurations in a + Lightning Memory-Mapped Database (LMDB). + + This format is optimized for machine learning workflows where fast, random + access to a large number of frames is required. All frames from multiple + systems (with potentially different numbers of atoms) are stored in a + single LMDB database file. + + Both single systems and multiple systems are supported via the standard + ``dpdata`` APIs. 
+ + Examples + -------- + **Saving a single LabeledSystem** + + >>> import dpdata + >>> system = dpdata.LabeledSystem("path/to/input.vasp", fmt="vasp/outcar") + >>> system.to("lmdb", "my_single_system.lmdb") + + **Loading a single LabeledSystem** + + >>> loaded_system = dpdata.LabeledSystem("my_single_system.lmdb", fmt="lmdb") + + **Saving multiple systems to a single LMDB database** + + >>> import dpdata + >>> system_1 = dpdata.LabeledSystem("path/to/system1/OUTCAR", fmt="vasp/outcar") + >>> system_2 = dpdata.LabeledSystem("path/to/system2/OUTCAR", fmt="vasp/outcar") + >>> multi_systems_obj = dpdata.MultiSystems(system_1, system_2) + >>> multi_systems_obj.to("lmdb", "my_multi_system_db.lmdb") + + **Loading multiple systems from a single LMDB database** + + >>> import dpdata + >>> loaded_multi_systems = dpdata.MultiSystems.from_file("my_multi_system_db.lmdb", fmt="lmdb") + """ + + def to_multi_systems( + self, formulas, directory, map_size=1000000000, frame_idx_fmt="012d", **kwargs + ): + """Implement MultiSystems.to for LMDB format. + + Parameters + ---------- + formulas : list[str] + list of formulas + directory : str + directory of system + map_size : int, optional + Maximum size of the LMDB database in bytes. Default is 1GB. + frame_idx_fmt : str, optional + The format string used to encode the frame index as a key. Default is "012d". 
+ **kwargs : dict + other parameters + + Yields + ------ + tuple + (self, formula) to be used by to_system + """ + self._frame_idx_fmt = frame_idx_fmt + self._global_frame_idx = 0 + self._system_info = [] + os.makedirs(directory, exist_ok=True) + with lmdb.open(directory, map_size=map_size) as env: + with env.begin(write=True) as txn: + self._txn = txn + for ff in formulas: + yield (self, ff) + # Finalize metadata + metadata = { + "nframes": self._global_frame_idx, + "system_info": self._system_info, + "frame_idx_fmt": self._frame_idx_fmt, + } + txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) + self._txn = None + + def _dump_to_txn(self, data, txn, formula, dtypes): + from dpdata.data_type import Axis + + nframes = data["coords"].shape[0] + + # Identify symbolic shapes and frame-dependent keys + data_shapes = {} + frame_dependent_keys = [] + for dt in dtypes: + if dt.name in data: + if dt.shape is not None: + data_shapes[dt.name] = [ + s.value if isinstance(s, Axis) else s for s in dt.shape + ] + if Axis.NFRAMES in dt.shape: + frame_dependent_keys.append(dt.name) + else: + data_shapes[dt.name] = None + + # Record system info + # natoms needs to be extracted from data + if "atom_numbs" in data: + natoms_list = data["atom_numbs"] + else: + # Fallback for systems without atom_numbs (should not happen in valid dpdata systems) + natoms_list = [] + + self._system_info.append( + { + "formula": formula, + "natoms": natoms_list, + "nframes": nframes, + "start_idx": self._global_frame_idx, + "data_shapes": data_shapes, + "frame_dependent_keys": frame_dependent_keys, + } + ) + + for i in range(nframes): + frame_data = {} + for key, val in data.items(): + if key in frame_dependent_keys: + frame_data[key] = val[i] + else: + frame_data[key] = val + + key = f"{self._global_frame_idx:{self._frame_idx_fmt}}".encode("ascii") + value = msgpack.packb(frame_data, use_bin_type=True) + txn.put(key, value) + self._global_frame_idx += 1 + + def to_labeled_system(self, 
data, file_name, **kwargs): + """Save a single LabeledSystem to an LMDB database.""" + from dpdata.system import LabeledSystem + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, LabeledSystem.DTYPES) + else: + # Single system call: use to_multi_systems logic + # Infer formula from data if possible, or use default + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_labeled_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def to_system(self, data, file_name, **kwargs): + """Save a single System to an LMDB database.""" + from dpdata.system import System + + if isinstance(file_name, tuple) and file_name[0] is self: + txn, formula = self._txn, file_name[1] + self._dump_to_txn(data, txn, formula, System.DTYPES) + else: + # Single system call + formula = kwargs.get("formula", "unknown") + gen = self.to_multi_systems([formula], file_name, **kwargs) + handle = next(gen) + self.to_system(data, handle, **kwargs) + try: + next(gen) + except StopIteration: + pass + + def from_multi_systems(self, file_name, map_size=1000000000, **kwargs): + """Load multiple systems from a single LMDB database. + + Parameters + ---------- + file_name : str + The path to the LMDB database directory. + map_size : int, optional + Maximum size of the LMDB database in bytes. 
+ **kwargs : dict + other parameters + + Yields + ------ + dict + data dictionary for each system + """ + from dpdata.data_type import Axis, DataType + from dpdata.system import LabeledSystem, System + + with lmdb.open(file_name, readonly=True) as env: + with env.begin() as txn: + metadata_packed = txn.get(b"__metadata__") + if metadata_packed is None: + raise LMDBMetadataError("LMDB database does not contain metadata.") + metadata = msgpack.unpackb(metadata_packed, raw=False) + frame_idx_fmt = metadata.get("frame_idx_fmt", "012d") + + for sys_info in metadata["system_info"]: + system_frames = [] + start_idx = sys_info["start_idx"] + nframes = sys_info["nframes"] + data_shapes = sys_info.get("data_shapes", {}) + frame_dependent_keys = sys_info.get("frame_dependent_keys", []) + + for i in range(start_idx, start_idx + nframes): + key = f"{i:{frame_idx_fmt}}".encode("ascii") + value = txn.get(key) + if value is None: + raise LMDBFrameError(f"Frame data not found for key: {key}") + frame_data = msgpack.unpackb(value, raw=False) + system_frames.append(frame_data) + + # Aggregate data for one system + first_frame = system_frames[0] + is_labeled = "energies" in first_frame + cls = LabeledSystem if is_labeled else System + + # Auto-register unknown data types + existing_dt_names = [dt.name for dt in cls.DTYPES] + new_dts = [] + axis_map = {a.value: a for a in Axis} + for key, val in first_frame.items(): + if key not in existing_dt_names and key in data_shapes: + shape_raw = data_shapes[key] + if shape_raw is not None: + shape = tuple([axis_map.get(s, s) for s in shape_raw]) + else: + shape = None + + v_arr = np.array(val) + new_dts.append( + DataType(key, type(v_arr), shape=shape, required=False) + ) + + if new_dts: + cls.register_data_type(*new_dts) + + agg_data = {} + for key, val in first_frame.items(): + if key in frame_dependent_keys: + agg_data[key] = np.array([d[key] for d in system_frames]) + else: + agg_data[key] = val + + yield agg_data + + def 
from_labeled_system(self, file_name, **kwargs): + """Load data for a single LabeledSystem from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) + + def from_system(self, file_name, **kwargs): + """Load data for a single System from an LMDB database.""" + if isinstance(file_name, dict): + return file_name + # from_multi_systems returns a generator of dicts + gen = self.from_multi_systems(file_name, **kwargs) + return next(gen) diff --git a/dpdata/formats/openmx/__init__.py b/dpdata/formats/openmx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/openmx/omx.py b/dpdata/formats/openmx/omx.py new file mode 100644 index 000000000..16368eb2f --- /dev/null +++ b/dpdata/formats/openmx/omx.py @@ -0,0 +1,200 @@ +#!/usr/bin/python3 +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +from ...unit import ( + EnergyConversion, + ForceConversion, + LengthConversion, + PressureConversion, +) + +ry2ev = EnergyConversion("rydberg", "eV").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + +import warnings +from collections import OrderedDict + + +def load_atom(lines): + atom_names = [] + atom_names_mode = False + for line in lines: + if "" in line: + atom_names_mode = False + elif atom_names_mode: + parts = line.split() + atom_names.append(parts[1]) + atom_names_original = atom_names + atom_names = list(OrderedDict.fromkeys(set(atom_names))) + atom_names = sorted( + atom_names, key=atom_names_original.index + ) # Unique ordering of atomic 
species + ntypes = len(atom_names) + atom_numbs = [0] * ntypes + atom_types = [] + atom_types_mode = False + for line in lines: + if "" in line: + atom_types_mode = False + elif atom_types_mode: + parts = line.split() + for i, atom_name in enumerate(atom_names): + if parts[1] == atom_name: + atom_numbs[i] += 1 + atom_types.append(i) + atom_types = np.array(atom_types) + return atom_names, atom_types, atom_numbs + + +def load_cells(lines): + cells = [] + for line in lines: + if "Cell_Vectors=" in line: + part = line.split("Cell_Vectors=")[1] + parts = part.split() + values = list(map(float, parts[:9])) + cell = [values[0:3], values[3:6], values[6:9]] + cells.append(cell) + # Checking SCF converged or not + for token in line.split(): + if token.startswith("scf_conv="): + scf_conv = int(token.split("=")[1]) + if scf_conv == 0: + warnings.warn("SCF not converged!", stacklevel=2) + cells = np.array(cells) + return cells + + +# load atom_names, atom_numbs, atom_types, cells +def load_param_file(fname: FileType, mdname: FileType): + with open_file(fname) as dat_file: + lines = dat_file.readlines() + atom_names, atom_types, atom_numbs = load_atom(lines) + + with open_file(mdname) as md_file: + lines = md_file.readlines() + cells = load_cells(lines) + return atom_names, atom_numbs, atom_types, cells + + +def load_coords(lines, atom_names, natoms): + cnt = 0 + coord, coords = [], [] + for line in lines: + if "time=" in line: + continue + for atom_name in atom_names: + atom_name += " " + if atom_name in line: + cnt += 1 + parts = line.split() + for_line = [float(parts[1]), float(parts[2]), float(parts[3])] + coord.append(for_line) + if cnt == natoms: + coords.append(coord) + cnt = 0 + coord = [] + coords = np.array(coords) + return coords + + +def load_data(mdname: FileType, atom_names, natoms): + with open_file(mdname) as md_file: + lines = md_file.readlines() + coords = load_coords(lines, atom_names, natoms) + steps = [str(i) for i in range(1, coords.shape[0] + 1)] + return 
coords, steps + + +def to_system_data(fname: FileType, mdname: FileType): + data = {} + ( + data["atom_names"], + data["atom_numbs"], + data["atom_types"], + data["cells"], + ) = load_param_file(fname, mdname) + data["coords"], steps = load_data( + mdname, + data["atom_names"], + np.sum(data["atom_numbs"]), + ) + data["orig"] = np.zeros(3) + return data, steps + + +def load_energy(lines): + energy = [] + for line in lines: + if "time=" in line: + parts = line.split() + ene_line = float(parts[4]) # Hartree + energy.append(ene_line) + continue + energy = energy_convert * np.array(energy) # Hartree -> eV + return energy + + +def load_force(lines, atom_names, atom_numbs): + cnt = 0 + field, fields = [], [] + for line in lines: + if "time=" in line: + continue + for atom_name in atom_names: + atom_name += " " + if atom_name in line: + cnt += 1 + parts = line.split() + for_line = [float(parts[4]), float(parts[5]), float(parts[6])] + field.append(for_line) + if cnt == np.sum(atom_numbs): + fields.append(field) + cnt = 0 + field = [] + force = force_convert * np.array(fields) + return force + + +# load energy, force +def to_system_label(fname, mdname): + atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) + with open_file(mdname) as md_file: + lines = md_file.readlines() + energy = load_energy(lines) + force = load_force(lines, atom_names, atom_numbs) + return energy, force + + +if __name__ == "__main__": + file_name = "Au111Surface" + fname = f"{file_name}.dat" + mdname = f"{file_name}.md" + atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) + coords, steps = load_data(mdname, atom_names, np.sum(atom_numbs)) + data, steps = to_system_data(fname, mdname) + energy, force = to_system_label(fname, mdname) + print(atom_names) + print(atom_numbs) + print(atom_types) +# print(cells.shape) +# print(coords.shape) +# print(len(energy)) +# print(force.shape) diff --git a/dpdata/formats/orca/__init__.py b/dpdata/formats/orca/__init__.py 
new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/orca/output.py b/dpdata/formats/orca/output.py new file mode 100644 index 000000000..a0915162b --- /dev/null +++ b/dpdata/formats/orca/output.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + + +def read_orca_sp_output( + fn: FileType, +) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]: + """Read from ORCA output. + + Note that both the energy and the gradient should be printed. + + Parameters + ---------- + fn : str + file name + + Returns + ------- + np.ndarray + atomic symbols + np.ndarray + atomic coordinates + float + total potential energy + np.ndarray + atomic forces + """ + coord = None + symbols = None + forces = None + energy = None + with open_file(fn) as f: + flag = 0 + for line in f: + if flag in (1, 3, 4): + flag += 1 + elif flag == 2: + s = line.split() + if not len(s): + flag = 0 + else: + symbols.append(s[0].capitalize()) + coord.append([float(s[1]), float(s[2]), float(s[3])]) + elif flag == 5: + s = line.split() + if not len(s): + flag = 0 + else: + forces.append([float(s[3]), float(s[4]), float(s[5])]) + elif line.startswith("CARTESIAN COORDINATES (ANGSTROEM)"): + # coord + flag = 1 + coord = [] + symbols = [] + elif line.startswith("CARTESIAN GRADIENT"): + flag = 3 + forces = [] + elif line.startswith("FINAL SINGLE POINT ENERGY"): + energy = float(line.split()[-1]) + symbols = np.array(symbols) + forces = -np.array(forces) + coord = np.array(coord) + assert coord.shape == forces.shape + + return symbols, coord, energy, forces diff --git a/dpdata/formats/psi4/__init__.py b/dpdata/formats/psi4/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/psi4/input.py b/dpdata/formats/psi4/input.py new file mode 100644 index 000000000..3959cb753 --- /dev/null +++ 
b/dpdata/formats/psi4/input.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import numpy as np + +# Angston is used in Psi4 by default +template = """molecule {{ +{atoms:s} +{charge:d} {multiplicity:d} +}} +set basis {basis:s} +set gradient_write on +G, wfn = gradient("WB97M-D3BJ", return_wfn=True) +wfn.energy() +wfn.gradient().print_out() +""" + + +def write_psi4_input( + types: np.ndarray, + coords: np.ndarray, + method: str, + basis: str, + charge: int = 0, + multiplicity: int = 1, +) -> str: + """Write Psi4 input file. + + Parameters + ---------- + types : np.ndarray + atomic symbols + coords : np.ndarray + atomic coordinates + method : str + computational method + basis : str + basis set; see https://psicode.org/psi4manual/master/basissets_tables.html + charge : int, default=0 + charge of system + multiplicity : int, default=1 + multiplicity of system + + Returns + ------- + str + content of Psi4 input file + """ + return template.format( + atoms="\n".join( + [ + "{:s} {:16.9f} {:16.9f} {:16.9f}".format(*ii) + for ii in zip(types, *coords.T) + ] + ), + charge=charge, + multiplicity=multiplicity, + method=method, + basis=basis, + ) diff --git a/dpdata/formats/psi4/output.py b/dpdata/formats/psi4/output.py new file mode 100644 index 000000000..c3594ffb4 --- /dev/null +++ b/dpdata/formats/psi4/output.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.unit import LengthConversion +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + + +def read_psi4_output(fn: FileType) -> tuple[str, np.ndarray, float, np.ndarray]: + """Read from Psi4 output. + + Note that both the energy and the gradient should be printed. 
+ + Parameters + ---------- + fn : str + file name + + Returns + ------- + str + atomic symbols + np.ndarray + atomic coordinates + float + total potential energy + np.ndarray + atomic forces + """ + coord = None + symbols = None + forces = None + energy = None + length_unit = None + with open_file(fn) as f: + flag = 0 + for line in f: + if flag in (1, 3, 4, 5, 6): + flag += 1 + elif flag == 2: + s = line.split() + if not len(s): + flag = 0 + else: + symbols.append(s[0].capitalize()) + coord.append([float(s[1]), float(s[2]), float(s[3])]) + elif flag == 7: + s = line.split() + if not len(s): + flag = 0 + else: + forces.append([float(s[1]), float(s[2]), float(s[3])]) + elif line.startswith( + " Center X Y Z Mass" + ): + # coord + flag = 1 + coord = [] + symbols = [] + elif line.startswith(" Geometry (in "): + # remove ), + length_unit = line.split()[2][:-2].lower() + elif line.startswith(" ## Total Gradient"): + flag = 3 + forces = [] + elif line.startswith(" Total Energy ="): + energy = float(line.split()[-1]) + assert length_unit is not None + length_convert = LengthConversion(length_unit, "angstrom").value() + symbols = np.array(symbols) + forces = -np.array(forces) + coord = np.array(coord) * length_convert + assert coord.shape == forces.shape + + return symbols, coord, energy, forces diff --git a/dpdata/formats/pwmat/__init__.py b/dpdata/formats/pwmat/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/pwmat/atomconfig.py b/dpdata/formats/pwmat/atomconfig.py new file mode 100644 index 000000000..11677b0ef --- /dev/null +++ b/dpdata/formats/pwmat/atomconfig.py @@ -0,0 +1,95 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +from ...periodic_table import ELEMENTS + + +def _to_system_data_lower(lines): + system = {} + natoms = int(lines[0].split()[0]) + cell = [] + for idx, ii in enumerate(lines): + if "lattice" in ii or "Lattice" in ii or "LATTICE" in ii: + for kk in range(idx + 1, idx + 1 + 3): 
+ vector = [float(jj) for jj in lines[kk].split()[0:3]] + cell.append(vector) + system["cells"] = np.array([cell]) + coord = [] + atomic_number = [] + atom_numbs = [] + for idx, ii in enumerate(lines): + if "Position" in ii or "POSITION" in ii or "position" in ii: + for kk in range(idx + 1, idx + 1 + natoms): + min = kk + for jj in range(kk + 1, idx + 1 + natoms): + if int(lines[jj].split()[0]) < int(lines[min].split()[0]): + min = jj + lines[min], lines[kk] = lines[kk], lines[min] + for gg in range(idx + 1, idx + 1 + natoms): + tmpv = [float(jj) for jj in lines[gg].split()[1:4]] + tmpv = np.matmul(np.array(tmpv), system["cells"][0]) + coord.append(tmpv) + tmpn = int(lines[gg].split()[0]) + atomic_number.append(tmpn) + for ii in np.unique(sorted(atomic_number)): + atom_numbs.append(atomic_number.count(ii)) + system["atom_numbs"] = [int(ii) for ii in atom_numbs] + system["coords"] = np.array([coord]) + system["orig"] = np.zeros(3) + atom_types = [] + for idx, ii in enumerate(system["atom_numbs"]): + for jj in range(ii): + atom_types.append(idx) + system["atom_types"] = np.array(atom_types, dtype=int) + system["atom_names"] = [ELEMENTS[ii - 1] for ii in np.unique(sorted(atomic_number))] + return system + + +def to_system_data(lines): + return _to_system_data_lower(lines) + + +def from_system_data(system, f_idx=0, skip_zeros=True): + ret = "" + natoms = sum(system["atom_numbs"]) + ret += "%d" % natoms # noqa: UP031 + ret += "\n" + ret += "LATTICE" + ret += "\n" + for ii in system["cells"][f_idx]: + for jj in ii: + ret += f"{jj:.16e} " + ret += "\n" + ret += "POSITION" + ret += "\n" + atom_numbs = system["atom_numbs"] + atom_names = system["atom_names"] + atype = system["atom_types"] + posis = system["coords"][f_idx] + # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] + # sort_idx = np.argsort(atype, kind = 'mergesort') + sort_idx = np.lexsort((np.arange(len(atype)), atype)) + atype = atype[sort_idx] + posis = posis[sort_idx] + symbal = [] + for ii, jj in 
zip(atom_numbs, atom_names): + for kk in range(ii): + symbal.append(jj) + atomic_numbers = [] + for ii in symbal: + atomic_numbers.append(ELEMENTS.index(ii) + 1) + posi_list = [] + for jj, ii in zip(atomic_numbers, posis): + ii = np.matmul(ii, np.linalg.inv(system["cells"][0])) + posi_list.append("%d %15.10f %15.10f %15.10f 1 1 1" % (jj, ii[0], ii[1], ii[2])) # noqa: UP031 + for kk in range(len(posi_list)): + min = kk + for jj in range(kk, len(posi_list)): + if int(posi_list[jj].split()[0]) < int(posi_list[min].split()[0]): + min = jj + posi_list[min], posi_list[kk] = posi_list[kk], posi_list[min] + posi_list.append("") + ret += "\n".join(posi_list) + return ret diff --git a/dpdata/formats/pwmat/movement.py b/dpdata/formats/pwmat/movement.py new file mode 100644 index 000000000..a0f28e64b --- /dev/null +++ b/dpdata/formats/pwmat/movement.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import warnings + +import numpy as np + +from ...periodic_table import ELEMENTS + + +def system_info(lines, type_idx_zero=False): + atom_names = [] + atom_numbs = [] + nelm = 0 + natoms = int(lines[0].split()[0]) + iteration = float(lines[0].split("Etot")[0].split("=")[1].split(",")[0]) + # print(iteration) + if iteration > 0: + nelm = 40 + else: + nelm = 100 + atomic_number = [] + for idx, ii in enumerate(lines): + if ("Position" in ii) and ("nonperiodic_Position" not in ii): + for kk in range(idx + 1, idx + 1 + natoms): + min = kk + for jj in range(kk + 1, idx + 1 + natoms): + if int(lines[jj].split()[0]) < int(lines[min].split()[0]): + min = jj + lines[min], lines[kk] = lines[kk], lines[min] + for gg in range(idx + 1, idx + 1 + natoms): + tmpn = int(lines[gg].split()[0]) + atomic_number.append(tmpn) + for ii in np.unique(sorted(atomic_number)): + atom_numbs.append(atomic_number.count(ii)) + atom_types = [] + for idx, ii in enumerate(atom_numbs): + for jj in range(ii): + if type_idx_zero: + atom_types.append(idx) + else: + atom_types.append(idx + 1) + for ii in 
def get_movement_block(fp):
    """Return the next frame block from a MOVEMENT-style line source.

    Lines are accumulated (with trailing newlines stripped) until a
    separator line containing ``------------`` is reached; the separator
    itself is kept as the last element.  A falsy (empty) line or exhaustion
    of ``fp`` also terminates the block, so plain lists of lines work too.
    """
    blk = []
    for ii in fp:
        if not ii:
            return blk
        blk.append(ii.rstrip("\n"))
        if "------------" in ii:
            return blk
    return blk


# we assume that the force is printed ...
def get_frames(fname, begin=0, step=1, convergence_check=True):
    """Read sampled frames from a MOVEMENT trajectory file.

    Parameters
    ----------
    fname : str
        Path of the MOVEMENT file.
    begin : int
        Index of the first frame to collect.
    step : int
        Collect every ``step``-th frame starting from ``begin``.
    convergence_check : bool
        When True, frames whose SCF loop reached the iteration limit are
        dropped; either way their 1-based frame numbers are reported in a
        warning.

    Returns
    -------
    tuple
        ``(atom_names, atom_numbs, atom_types, cells, coords, energies,
        forces, virials)``; ``virials`` is None when the file carries no
        stress information.
    """
    all_coords = []
    all_cells = []
    all_energies = []
    all_forces = []
    all_virials = []
    rec_failed = []

    # context manager guarantees the handle is closed even if parsing
    # raises (the original used a bare open()/close() pair and leaked the
    # handle on error)
    with open(fname) as fp:
        blk = get_movement_block(fp)

        atom_names, atom_numbs, atom_types, nelm = system_info(
            blk, type_idx_zero=True
        )
        ntot = sum(atom_numbs)

        cc = 0
        while len(blk) > 0:
            if cc >= begin and (cc - begin) % step == 0:
                coord, cell, energy, force, virial, is_converge = analyze_block(
                    blk, ntot, nelm
                )
                if len(coord) == 0:
                    break
                if is_converge or not convergence_check:
                    all_coords.append(coord)
                    all_cells.append(cell)
                    all_energies.append(energy)
                    all_forces.append(force)
                    if virial is not None:
                        all_virials.append(virial)
                if not is_converge:
                    rec_failed.append(cc + 1)

            blk = get_movement_block(fp)
            cc += 1

    if len(rec_failed) > 0:
        prt = (
            "so they are not collected."
            if convergence_check
            else "but they are still collected due to the requirement for ignoring convergence checks."
        )
        warnings.warn(
            f"The following structures were unconverged: {rec_failed}; " + prt
        )

    all_virials = np.array(all_virials) if len(all_virials) > 0 else None
    return (
        atom_names,
        atom_numbs,
        atom_types,
        np.array(all_cells),
        np.array(all_coords),
        np.array(all_energies),
        np.array(all_forces),
        all_virials,
    )


def _sort_atom_lines(lines, start, natoms):
    """Reorder ``lines[start:start+natoms]`` in place by the atom index in
    column 0.

    Replaces the original O(n^2) selection sort (which also shadowed the
    builtin ``min``) with an O(n log n) keyed sort; ordering is identical
    for the well-formed case of distinct atom indices.
    """
    lines[start : start + natoms] = sorted(
        lines[start : start + natoms], key=lambda ln: int(ln.split()[0])
    )


def analyze_block(lines, ntot, nelm):
    """Parse one frame block of a MOVEMENT file.

    Parameters
    ----------
    lines : list of str
        One block as returned by ``get_movement_block``; atom data lines
        may be reordered in place.
    ntot : int
        Total number of atoms in the system.
    nelm : int
        SCF iteration limit; reaching it marks the frame unconverged.

    Returns
    -------
    tuple
        ``(coord, cell, energy, force, virial, is_converge)`` where
        ``virial`` is None unless a stress-carrying lattice section was
        found.
    """
    coord = []
    cell = []
    energy = None
    force = []
    virial = None
    is_converge = True
    for idx, ii in enumerate(lines):
        if "Iteration" in ii:
            sc_index = int(ii.split("SCF =")[1])
            if sc_index >= nelm:
                is_converge = False
            # use Ep, not Etot = Ep + Ek
            energy = float(ii.split("Etot,Ep,Ek (eV)")[1].split()[2])
        elif "----------" in ii:
            # end-of-block separator: everything mandatory must be present
            assert (force is not None) and len(coord) > 0 and len(cell) > 0
            return coord, cell, energy, force, virial, is_converge
        elif "Lattice vector" in ii:
            if "stress" in lines[idx + 1]:
                # lattice rows carry stress components in columns 5:8
                tmp_v = []
                for dd in range(3):
                    tmp_l = lines[idx + 1 + dd]
                    cell.append([float(ss) for ss in tmp_l.split()[0:3]])
                    tmp_v.append([float(stress) for stress in tmp_l.split()[5:8]])
                virial = np.array(tmp_v)
                volume = np.linalg.det(np.array(cell))
                # same scaling as the original code; presumably converts
                # the printed stress to eV via the cell volume — TODO confirm
                # the 160.2 * 10.0 factor against the MOVEMENT spec
                virial = virial * 160.2 * 10.0 / volume
            else:
                for dd in range(3):
                    tmp_l = lines[idx + 1 + dd]
                    cell.append([float(ss) for ss in tmp_l.split()[0:3]])
        elif ("Position" in ii) and ("nonperiodic_Position" not in ii):
            _sort_atom_lines(lines, idx + 1, ntot)
            for gg in range(idx + 1, idx + 1 + ntot):
                frac = [float(jj) for jj in lines[gg].split()[1:4]]
                # fractional -> cartesian
                coord.append(np.matmul(np.array(frac), np.array(cell)))
        elif "Force" in ii:
            _sort_atom_lines(lines, idx + 1, ntot)
            for gg in range(idx + 1, idx + 1 + ntot):
                # forces in MOVEMENT file are dE/dR, lacking a minus sign
                info = [-float(ss) for ss in lines[gg].split()]
                force.append(info[1:4])
    return coord, cell, energy, force, virial, is_converge
from collections import Counter

import numpy as np


def to_system_data(file_name, protect_layer=9):
    """Read a molecule file via pymatgen and convert it to dpdata's data dict.

    The molecule is treated as non-periodic; a fixed 10 Angstrom cubic box
    is attached as the cell.  ``protect_layer`` is accepted for interface
    compatibility but is not used here.
    """
    from pymatgen.core import Molecule

    molecule = Molecule.from_file(file_name)
    symbols = [str(site.species.elements[0]) for site in molecule.sites]
    counts = Counter(symbols)
    names = list(counts.keys())
    numbs = list(counts.values())
    types = [names.index(sym) for sym in symbols]
    natoms = np.sum(numbs)  # kept for parity with the original; unused below

    cart = np.copy(molecule.cart_coords)

    system = {}
    system["atom_names"] = names
    system["atom_numbs"] = numbs
    system["atom_types"] = np.array(types, dtype=int)
    system["orig"] = np.array([0, 0, 0])
    system["coords"] = np.array([cart])
    system["cells"] = np.array([10.0 * np.eye(3)])
    return system


def from_system_data(structure) -> dict:
    """Convert one pymatgen structure to dpdata's datadict."""
    symbols = [site.specie.symbol for site in structure]
    atom_names = list(structure.symbol_set)
    atom_numbs = [symbols.count(name) for name in atom_names]
    atom_types = np.array([atom_names.index(sym) for sym in symbols]).astype(int)

    # only fully periodic or fully non-periodic structures are representable
    flags = structure.pbc
    if all(flags):
        pbc = True
    elif not any(flags):
        pbc = False
    else:
        raise ValueError(f"Partial pbc condition {structure.pbc} is not supported")

    return {
        "atom_names": atom_names,
        "atom_numbs": atom_numbs,
        "atom_types": atom_types,
        "coords": np.array([structure.cart_coords]),
        "cells": np.array([structure.lattice.matrix]),
        "orig": np.zeros(3),
        "nopbc": not pbc,
    }
import os

import numpy as np

# NOTE(review): the original module additionally imports ``open_file`` from
# dpdata.utils and the unit constants ``ry2ev``, ``kbar2evperang3`` and
# ``bohr2ang`` (length_convert) from ``.traj``; they are required by
# get_cell (celldm branch), get_energy, get_force, get_stress and get_frame
# and must remain importable in the installed package.

# card keywords that terminate a free-form data block in a pw.x input
_QE_BLOCK_KEYWORDS = [
    "ATOMIC_SPECIES",
    "ATOMIC_POSITIONS",
    "K_POINTS",
    "ADDITIONAL_K_POINTS",
    "CELL_PARAMETERS",
    "CONSTRAINTS",
    "OCCUPATIONS",
    "ATOMIC_VELOCITIES",
    "ATOMIC_FORCES",
    "SOLVENTS",
    "HUBBARD",
]


def get_block(lines, keyword, skip=0):
    """Return the data lines that follow the first line containing ``keyword``.

    ``skip`` extra lines are ignored after the keyword line; leading blank
    lines are skipped; collection stops at a blank line, the next card
    keyword, or end of input.

    Bounds are now checked *before* indexing: the original tested
    ``blk_idx != len(lines)`` after evaluating ``lines[blk_idx]`` and so
    raised IndexError whenever a block ran to the end of the input.
    """
    ret = []
    for idx, ii in enumerate(lines):
        if keyword not in ii:
            continue
        blk_idx = idx + 1 + skip
        # skip leading blank lines, staying in bounds
        while blk_idx < len(lines) and len(lines[blk_idx].split()) == 0:
            blk_idx += 1
        # collect until blank line, next card keyword, or end of input
        while (
            blk_idx < len(lines)
            and len(lines[blk_idx].split()) != 0
            and lines[blk_idx].split()[0] not in _QE_BLOCK_KEYWORDS
        ):
            ret.append(lines[blk_idx])
            blk_idx += 1
        break
    return ret


def get_cell(lines):
    """Return the cell matrix (Angstrom) from a pw.x input.

    Supports ``ibrav == 0`` (explicit CELL_PARAMETERS in Angstrom) and the
    simple cubic ``ibrav == 1`` (via ``a`` or ``celldm(1)``); anything else
    raises RuntimeError.
    """
    ret = []
    for idx, ii in enumerate(lines):
        if "ibrav" in ii:
            break
    blk = lines[idx : idx + 2]
    ibrav = int(blk[0].replace(",", "").split("=")[-1])
    if ibrav == 0:
        for iline in lines:
            if "CELL_PARAMETERS" in iline and "angstrom" not in iline.lower():
                raise RuntimeError(
                    "CELL_PARAMETERS must be written in Angstrom. Other units are not supported yet."
                )
        blk = get_block(lines, "CELL_PARAMETERS")
        for ii in blk:
            ret.append([float(jj) for jj in ii.split()[0:3]])
        ret = np.array(ret)
    elif ibrav == 1:
        a = None
        for iline in lines:
            line = iline.replace("=", " ").replace(",", "").split()
            if len(line) >= 2 and "a" == line[0]:
                a = float(line[1])
            if len(line) >= 2 and "celldm(1)" == line[0]:
                # celldm(1) is given in Bohr
                a = float(line[1]) * bohr2ang
        # NOTE(review): ``not a`` also fires for a == 0.0, which would be a
        # zero-volume cell anyway
        if not a:
            raise RuntimeError("parameter 'a' or 'celldm(1)' cannot be found.")
        ret = np.array([[a, 0.0, 0.0], [0.0, a, 0.0], [0.0, 0.0, a]])
    else:
        raise RuntimeError("ibrav > 1 not supported yet.")
    return ret


def get_coords(lines, cell):
    """Parse ATOMIC_POSITIONS into names, counts, types and cartesian coords.

    Accepts positions in Angstrom or crystal (fractional) units; fractional
    coordinates are converted with ``cell``.  Atom-name order of first
    appearance is preserved.
    """
    coord = []
    atom_symbol_list = []
    for iline in lines:
        if "ATOMIC_POSITIONS" in iline and (
            "angstrom" not in iline.lower() and "crystal" not in iline.lower()
        ):
            raise RuntimeError(
                "ATOMIC_POSITIONS must be written in Angstrom or crystal. Other units are not supported yet."
            )
        if "ATOMIC_POSITIONS" in iline and "angstrom" in iline.lower():
            blk = get_block(lines, "ATOMIC_POSITIONS")
            for ii in blk:
                coord.append([float(jj) for jj in ii.split()[1:4]])
                atom_symbol_list.append(ii.split()[0])
            coord = np.array(coord)
        elif "ATOMIC_POSITIONS" in iline and "crystal" in iline.lower():
            blk = get_block(lines, "ATOMIC_POSITIONS")
            for ii in blk:
                coord.append([float(jj) for jj in ii.split()[1:4]])
                atom_symbol_list.append(ii.split()[0])
            coord = np.array(coord)
            # fractional -> cartesian
            coord = np.matmul(coord, cell)
    atom_symbol_list = np.array(atom_symbol_list)
    tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True)
    atom_types = []
    atom_numbs = []
    # preserve the atom_name order of first appearance
    atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")]
    for jj in atom_symbol_list:
        for idx, ii in enumerate(atom_names):
            if jj == ii:
                atom_types.append(idx)
    for idx in range(len(atom_names)):
        atom_numbs.append(atom_types.count(idx))
    atom_types = np.array(atom_types)

    return list(atom_names), atom_numbs, atom_types, coord


def get_energy(lines):
    """Return the last converged total energy (eV), or None if absent."""
    energy = None
    for ii in lines:
        if "! total energy" in ii:
            energy = ry2ev * float(ii.split("=")[1].split()[0])
    return energy


def get_force(lines, natoms):
    """Return per-atom forces (eV/Angstrom) from a pw.x output."""
    blk = get_block(lines, "Forces acting on atoms", skip=1)
    ret = []
    blk = blk[0 : sum(natoms)]
    for ii in blk:
        ret.append([float(jj) for jj in ii.split("=")[1].split()])
    ret = np.array(ret)
    ret *= ry2ev / bohr2ang
    return ret


def get_stress(lines):
    """Return the stress tensor (eV/Angstrom^3), or None if not printed."""
    blk = get_block(lines, "total stress")
    if len(blk) == 0:
        return None
    ret = []
    for ii in blk:
        ret.append([float(jj) for jj in ii.split()[3:6]])
    ret = np.array(ret)
    ret *= kbar2evperang3
    return ret


def get_frame(fname):
    """Parse one labeled frame from a pw.x scf input/output pair.

    ``fname`` is either the output path (the input path is derived by the
    'out' -> 'in' name convention) or a ``[input, output]`` pair.
    """
    if isinstance(fname, str):
        path_out = fname
        outname = os.path.basename(path_out)
        # the name of the input file is assumed to be different from the
        # output by 'in' and 'out'
        inname = outname.replace("out", "in")
        path_in = os.path.join(os.path.dirname(path_out), inname)
    elif isinstance(fname, list) and len(fname) == 2:
        path_in = fname[0]
        path_out = fname[1]
    else:
        raise RuntimeError("invalid input")
    with open_file(path_out) as fp:
        outlines = fp.read().split("\n")
    with open_file(path_in) as fp:
        inlines = fp.read().split("\n")
    cell = get_cell(inlines)
    atom_names, natoms, types, coords = get_coords(inlines, cell)
    energy = get_energy(outlines)
    force = get_force(outlines, natoms)
    stress = get_stress(outlines)
    if stress is not None:
        # stress -> virial: scale by the cell volume
        stress = (stress * np.linalg.det(cell))[np.newaxis, :, :]
    return (
        atom_names,
        natoms,
        types,
        cell[np.newaxis, :, :],
        coords[np.newaxis, :, :],
        np.array(energy)[np.newaxis],
        force[np.newaxis, :, :],
        stress,
    )
__future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +import numpy as np + +from dpdata.utils import open_file + +if TYPE_CHECKING: + from dpdata.utils import FileType + +import os + +from ...unit import ( + EnergyConversion, + ForceConversion, + LengthConversion, + PressureConversion, +) + +ry2ev = EnergyConversion("rydberg", "eV").value() +kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() +gpa2evperbohr = PressureConversion("GPa", "eV/bohr^3").value() + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + + +def load_key(lines, key): + for ii in lines: + if key in ii: + words = ii.split(",") + for jj in words: + if key in jj: + return jj.split("=")[1] + return None + + +def load_block(lines, key, nlines): + for idx, ii in enumerate(lines): + if key in ii: + break + return lines[idx + 1 : idx + 1 + nlines] + + +def convert_celldm(ibrav, celldm): + if ibrav == 1: + return celldm[0] * np.eye(3) + elif ibrav == 2: + return celldm[0] * 0.5 * np.array([[-1, 0, 1], [0, 1, 1], [-1, 1, 0]]) + elif ibrav == 3: + return celldm[0] * 0.5 * np.array([[1, 1, 1], [-1, 1, 1], [-1, -1, 1]]) + elif ibrav == -3: + return celldm[0] * 0.5 * np.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]]) + else: + warnings.warn( + "unsupported ibrav " + + str(ibrav) + + " if no .cel file, the cell convertion may be wrong. 
" + ) + return np.eye(3) + # raise RuntimeError('unsupported ibrav ' + str(ibrav)) + + +def load_cell_parameters(lines): + blk = load_block(lines, "CELL_PARAMETERS", 3) + ret = [] + for ii in blk: + ret.append([float(jj) for jj in ii.split()[0:3]]) + return np.array(ret) + + +def load_atom_names(lines, ntypes): + blk = load_block(lines, "ATOMIC_SPECIES", ntypes) + return [ii.split()[0] for ii in blk] + + +def load_celldm(lines): + celldm = np.zeros(6) + for ii in range(6): + key = "celldm(%d)" % (ii + 1) # noqa: UP031 + val = load_key(lines, key) + if val is not None: + celldm[ii] = float(val) + return celldm + + +def load_atom_types(lines, natoms, atom_names): + blk = load_block(lines, "ATOMIC_POSITIONS", natoms) + ret = [] + for ii in blk: + ret.append(atom_names.index(ii.split()[0])) + return np.array(ret, dtype=int) + + +def load_param_file(fname: FileType): + with open_file(fname) as fp: + lines = fp.read().split("\n") + natoms = int(load_key(lines, "nat")) + ntypes = int(load_key(lines, "ntyp")) + atom_names = load_atom_names(lines, ntypes) + atom_types = load_atom_types(lines, natoms, atom_names) + atom_numbs = [] + for ii in range(ntypes): + atom_numbs.append(np.sum(atom_types == ii)) + ibrav = int(load_key(lines, "ibrav")) + celldm = load_celldm(lines) + if ibrav == 0: + cell = load_cell_parameters(lines) + else: + cell = convert_celldm(ibrav, celldm) + cell = cell * length_convert + # print(atom_names) + # print(atom_numbs) + # print(atom_types) + # print(cell) + return atom_names, atom_numbs, atom_types, cell + + +def _load_pos_block(fp, natoms): + head = fp.readline() + if not head: + # print('get None') + return None, None + else: + ss = head.split()[0] + blk = [] + for ii in range(natoms): + newline = fp.readline() + if not newline: + return None, None + blk.append([float(jj) for jj in newline.split()]) + return blk, ss + + +def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0): + coords = [] + steps = [] + cc = 0 + with 
def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0):
    """Load every ``step``-th frame (starting at ``begin``) from a cp.x
    trajectory file (.pos/.cel/.for/.str layout: one tag line then
    ``natoms`` numeric rows per frame).

    Returns
    -------
    tuple
        ``(frames, steps)`` where ``frames`` is an array scaled by
        ``convert`` and ``steps`` the per-frame step tags (strings).
    """
    coords = []
    steps = []
    cc = 0
    with open_file(fname) as fp:
        while True:
            blk, ss = _load_pos_block(fp, natoms)
            if blk is None:
                # EOF or truncated frame terminates the scan
                break
            else:
                if cc >= begin and (cc - begin) % step == 0:
                    coords.append(blk)
                    steps.append(ss)
            cc += 1
    coords = convert * np.array(coords)
    return coords, steps


def load_energy(fname, begin=0, step=1):
    """Load sampled energies (eV) from a .evp file.

    Returns
    -------
    tuple or None
        ``(energies, steps)`` where energies come from column 5; None when
        the file contains no data line (only comments).
    """
    data = np.loadtxt(fname, ndmin=2)
    steps = []
    for ii in data[begin::step, 0]:
        steps.append("%d" % ii)  # noqa: UP031
    # determine the column count from the first non-comment line so the
    # flat array can be reshaped into rows
    with open_file(fname) as fp:
        while True:
            line = fp.readline()
            if not line:
                return None
            if line.split()[0][0] != "#":
                nw = len(line.split())
                break
    data = np.reshape(data, [-1, nw])
    return energy_convert * data[begin::step, 5], steps


def to_system_data(input_name, prefix, begin=0, step=1):
    """Assemble an unlabeled dpdata system from a cp.x input file plus the
    ``prefix``-named trajectory files.

    ``prefix + ".pos"`` supplies coordinates; ``prefix + ".cel"`` the
    per-frame cells (falling back to the input-file cell when absent);
    ``prefix + ".str"`` the stress (optional, converted to virials).

    Raises
    ------
    RuntimeError
        When the step tags of .pos disagree with .cel or .str.
    """
    data = {}
    data["atom_names"], data["atom_numbs"], data["atom_types"], cell = load_param_file(
        input_name
    )
    data["coords"], csteps = load_data(
        prefix + ".pos",
        np.sum(data["atom_numbs"]),
        begin=begin,
        step=step,
        convert=length_convert,
    )
    data["orig"] = np.zeros(3)
    try:
        data["cells"], tmp_steps = load_data(
            prefix + ".cel", 3, begin=begin, step=step, convert=length_convert
        )
        # cp.x writes cell vectors as columns; transpose to rows
        data["cells"] = np.transpose(data["cells"], (0, 2, 1))
        if csteps != tmp_steps:
            # sentinel None guarantees the scan below finds a mismatch
            csteps.append(None)
            tmp_steps.append(None)
            for int_id in range(len(csteps)):
                if csteps[int_id] != tmp_steps[int_id]:
                    break
            step_id = begin + int_id * step
            # NOTE: raised inside the try but not caught below — only
            # FileNotFoundError is handled, so this propagates as intended
            raise RuntimeError(
                f"the step key between files are not consistent. "
                f"The difference locates at step: {step_id}, "
                f".pos is {csteps[int_id]}, .cel is {tmp_steps[int_id]}"
            )
    except FileNotFoundError:
        # no .cel file: replicate the static cell from the input file
        data["cells"] = np.tile(cell, (data["coords"].shape[0], 1, 1))

    # handle virial
    stress_fname = prefix + ".str"
    if os.path.exists(stress_fname):
        # 1. Read stress tensor (in GPa) for each structure
        stress, vsteps = load_data(stress_fname, 3, begin=begin, step=step, convert=1.0)
        if csteps != vsteps:
            csteps.append(None)
            vsteps.append(None)
            for int_id in range(len(csteps)):
                if csteps[int_id] != vsteps[int_id]:
                    break
            step_id = begin + int_id * step
            raise RuntimeError(
                f"the step key between files are not consistent. "
                f"The difference locates at step: {step_id}, "
                f".pos is {csteps[int_id]}, .str is {vsteps[int_id]}"
            )
        # 2. Calculate volume from cell. revert unit to bohr before taking det
        volumes = np.linalg.det(data["cells"] / length_convert).reshape(-1)
        # 3. Calculate virials for each structure, shape [nf x 3 x 3]
        data["virials"] = gpa2evperbohr * volumes[:, None, None] * stress

    return data, csteps


def to_system_label(input_name, prefix, begin=0, step=1):
    """Load the labels (energies and forces) matching ``to_system_data``'s
    sampling; asserts that .evp and .for agree on the step tags."""
    atom_names, atom_numbs, atom_types, cell = load_param_file(input_name)
    energy, esteps = load_energy(prefix + ".evp", begin=begin, step=step)
    force, fsteps = load_data(
        prefix + ".for",
        np.sum(atom_numbs),
        begin=begin,
        step=step,
        convert=force_convert,
    )
    assert esteps == fsteps, "the step key between files are not consistent "
    return energy, force, esteps


if __name__ == "__main__":
    # ad-hoc smoke test against local 'nacl' files
    # NOTE(review): load_data returns a (array, steps) tuple, so the
    # .shape accesses below raise AttributeError — this demo looks stale
    prefix = "nacl"
    atom_names, atom_numbs, atom_types, cell = load_param_file(prefix + ".in")
    coords = load_data(prefix + ".pos", np.sum(atom_numbs))
    cells = load_data(prefix + ".cel", 3)
    print(atom_names)
    print(atom_numbs)
    print(atom_types)
    print(cells)
    print(coords.shape)
    print(cells.shape)
import os
import time
from copy import deepcopy


def get_explicit_valence(atom, verbose=False):
    """Return the explicit valence of ``atom`` as the integer sum of its
    bond orders.

    rdkit's own explicit-valence bookkeeping (``GetValence`` on modern
    rdkit, ``GetExplicitValence`` on older releases or when rdkit is
    unavailable) is consulted only to warn about inconsistencies; the
    bond-order sum is what is returned.
    """
    bond_order_sum = int(sum(bond.GetBondTypeAsDouble() for bond in atom.GetBonds()))
    try:
        try:
            from rdkit import Chem

            reported = atom.GetValence(Chem.ValenceType.EXPLICIT)
            valence_method = "GetValence(Chem.ValenceType.EXPLICIT)"
        except (ImportError, AttributeError, TypeError):
            # older rdkit (or rdkit absent): fall back to the legacy getter
            reported = atom.GetExplicitValence()
            valence_method = "GetExplicitValence()"
        if reported != bond_order_sum and verbose:
            print(
                f"Explicit valence given by {valence_method} and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx() + 1}, using sum of bond order."
            )
        return bond_order_sum
    except Exception:
        # any failure in the diagnostic path: the bond-order sum still holds
        return bond_order_sum


def regularize_formal_charges(mol, sanitize=True, verbose=False):
    """Regularize formal charges of atoms."""
    from rdkit import Chem

    assert isinstance(mol, Chem.rdchem.Mol)
    for atom in mol.GetAtoms():
        assign_formal_charge_for_atom(atom, verbose)
    if not sanitize:
        return mol
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # sanitization rejected the charge assignment
        return None
    return mol


def assign_formal_charge_for_atom(atom, verbose=False):
    """Assign formal charge according to the 8-electron rule for elements
    B, C, N, O, S, P and As."""
    from rdkit import Chem

    assert isinstance(atom, Chem.rdchem.Atom)
    valence = get_explicit_valence(atom, verbose)
    symbol = atom.GetSymbol()
    if symbol == "B":
        atom.SetFormalCharge(3 - valence)
    elif symbol == "C":
        atom.SetFormalCharge(valence - 4)
        if valence == 3:
            print(
                f"Detect a valence of 3 on #C{atom.GetIdx() + 1}, the formal charge of this atom will be assigned to -1"
            )
        elif valence > 4:
            raise ValueError(f"#C{atom.GetIdx() + 1} has a valence larger than 4")
    elif symbol == "N":
        if valence > 4:
            raise ValueError(f"#N{atom.GetIdx() + 1} has a valence larger than 4")
        atom.SetFormalCharge(valence - 3)
    elif symbol == "O":
        atom.SetFormalCharge(valence - 2)
    elif symbol == "S":
        if valence == 1:
            atom.SetFormalCharge(-1)
        elif valence == 3:
            atom.SetFormalCharge(1)
        elif valence > 6:
            raise ValueError(f"#S{atom.GetIdx() + 1} has a valence larger than 6")
        else:
            atom.SetFormalCharge(0)
    elif symbol in ("P", "As"):
        if valence == 5:
            atom.SetFormalCharge(0)
        elif valence > 5:
            raise ValueError(
                f"#{atom.GetSymbol()}{atom.GetIdx() + 1} has a valence larger than 5"
            )
        else:
            atom.SetFormalCharge(valence - 3)
def print_bonds(mol):
    """Debug helper: print every bond as 'Sym1idx Sym2idx bondtype'."""
    for bond in mol.GetBonds():
        begin_atom = bond.GetBeginAtom()
        end_atom = bond.GetEndAtom()
        print(
            f"{begin_atom.GetSymbol()}{begin_atom.GetIdx() + 1} {end_atom.GetSymbol()}{end_atom.GetIdx() + 1} {bond.GetBondType()}"
        )


def print_atoms(mol):
    """Debug helper: print every atom as 'Symidx formal_charge valence'."""
    for atom in mol.GetAtoms():
        print(
            f"{atom.GetSymbol()}{atom.GetIdx() + 1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}"
        )


def is_terminal_oxygen(O_atom):
    """True when the oxygen has exactly one neighbor."""
    return len(O_atom.GetNeighbors()) == 1


def get_terminal_oxygens(atom):
    """Return the terminal O (or S) neighbors of ``atom``."""
    terminal_oxygens = []
    for nei in atom.GetNeighbors():
        if nei.GetSymbol() == "O" or nei.GetSymbol() == "S":
            if is_terminal_oxygen(nei):
                terminal_oxygens.append(nei)
    return terminal_oxygens


def is_terminal_NR2(N_atom):
    """True when the nitrogen has exactly three neighbors (an NR2 group)."""
    return len(N_atom.GetNeighbors()) == 3


def get_terminal_NR2s(atom):
    """Return the NR2-type nitrogen neighbors of ``atom``, sorted by their
    hydrogen count (fewest H first)."""
    terminal_NR2s = []
    for nei in atom.GetNeighbors():
        if nei.GetSymbol() == "N":
            if is_terminal_NR2(nei):
                terminal_NR2s.append(nei)
    terminal_NR2s.sort(
        key=lambda N_atom: len(
            [atom for atom in N_atom.GetNeighbors() if atom.GetSymbol() == "H"]
        )
    )
    return terminal_NR2s


def sanitize_phosphate_Patom(P_atom, verbose=True):
    """Fix a phosphate group in place: one P=O plus P-O(-1) singles."""
    from rdkit import Chem

    if P_atom.GetSymbol() == "P":
        terminal_oxygens = get_terminal_oxygens(P_atom)
        mol = P_atom.GetOwningMol()
        if len(terminal_oxygens) > 1:
            if verbose:
                print("Phospate group detected, sanitizing it...")
            # set one P=O and two P-O
            bond1 = mol.GetBondBetweenAtoms(
                P_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.DOUBLE)
            for ii in range(1, len(terminal_oxygens)):
                bond = mol.GetBondBetweenAtoms(
                    P_atom.GetIdx(), terminal_oxygens[ii].GetIdx()
                )
                bond.SetBondType(Chem.rdchem.BondType.SINGLE)
                terminal_oxygens[ii].SetFormalCharge(-1)


def sanitize_phosphate(mol):
    """Apply ``sanitize_phosphate_Patom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_phosphate_Patom(atom)
    return mol


def sanitize_sulfate_Satom(S_atom, verbose=True):
    """Fix a sulfate group in place: one S-O(-1) plus two S=O doubles."""
    from rdkit import Chem

    if S_atom.GetSymbol() == "S":
        terminal_oxygens = get_terminal_oxygens(S_atom)
        mol = S_atom.GetOwningMol()
        if len(terminal_oxygens) == 3:
            if verbose:
                print("Sulfate group detected, sanitizing it...")
            # set one S-O and two S=O
            bond1 = mol.GetBondBetweenAtoms(
                S_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)
            for ii in range(1, len(terminal_oxygens)):
                bond = mol.GetBondBetweenAtoms(
                    S_atom.GetIdx(), terminal_oxygens[ii].GetIdx()
                )
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)


def sanitize_sulfate(mol):
    """Apply ``sanitize_sulfate_Satom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_sulfate_Satom(atom)
    return mol


def sanitize_carboxyl_Catom(C_atom, verbose=True):
    """Fix a carboxyl group in place: one C-O(-1) plus one C=O."""
    from rdkit import Chem

    if C_atom.GetSymbol() == "C":
        terminal_oxygens = get_terminal_oxygens(C_atom)
        mol = C_atom.GetOwningMol()
        if len(terminal_oxygens) == 2:
            if verbose:
                print("Carbonxyl group detected, sanitizing it...")
            # set one C-O and one C=O
            bond1 = mol.GetBondBetweenAtoms(
                C_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(
                C_atom.GetIdx(), terminal_oxygens[1].GetIdx()
            )
            bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_oxygens[1].SetFormalCharge(0)


def sanitize_carboxyl(mol):
    """Apply ``sanitize_carboxyl_Catom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_carboxyl_Catom(atom)
    return mol


def sanitize_guanidine_Catom(C_atom, verbose=True):
    """Fix a guanidyl group in place: two C-N singles plus one C=N(+1)."""
    from rdkit import Chem

    if C_atom.GetSymbol() == "C":
        terminal_NR2s = get_terminal_NR2s(C_atom)
        mol = C_atom.GetOwningMol()
        if len(terminal_NR2s) == 3:
            if verbose:
                print("Guanidyl group detected, sanitizing it...")
            # set two C-N and one C=N+
            bond1 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[0].GetIdx())
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_NR2s[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[1].GetIdx())
            bond2.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_NR2s[1].SetFormalCharge(0)

            bond3 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[2].GetIdx())
            bond3.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_NR2s[2].SetFormalCharge(1)


def sanitize_guanidine(mol):
    """Apply ``sanitize_guanidine_Catom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_guanidine_Catom(atom)
    return mol


def sanitize_nitro_Natom(N_atom, verbose=True):
    """Fix a nitro group in place: one N-O(-1) plus one N=O."""
    from rdkit import Chem

    if N_atom.GetSymbol() == "N":
        terminal_oxygens = get_terminal_oxygens(N_atom)
        mol = N_atom.GetOwningMol()
        if len(terminal_oxygens) == 2:
            if verbose:
                print("Nitro group detected, sanitizing it...")
            # set one N-O and one N=O
            bond1 = mol.GetBondBetweenAtoms(
                N_atom.GetIdx(), terminal_oxygens[0].GetIdx()
            )
            bond1.SetBondType(Chem.rdchem.BondType.SINGLE)
            terminal_oxygens[0].SetFormalCharge(-1)

            bond2 = mol.GetBondBetweenAtoms(
                N_atom.GetIdx(), terminal_oxygens[1].GetIdx()
            )
            bond2.SetBondType(Chem.rdchem.BondType.DOUBLE)
            terminal_oxygens[1].SetFormalCharge(0)


def sanitize_nitro(mol):
    """Apply ``sanitize_nitro_Natom`` to every atom; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_nitro_Natom(atom)
    return mol


def is_terminal_nitrogen(N_atom):
    """True for a nitrogen with exactly one neighbor."""
    if N_atom.GetSymbol() == "N" and len(N_atom.GetNeighbors()) == 1:
        return True
    else:
        return False


def sanitize_nitrine_Natom(atom, verbose=True):
    """Fix an azide-like (X-N=N=N) group around the central nitrogen in
    place: both N=N bonds become DOUBLE, terminal N gets -1, center +1."""
    from rdkit import Chem

    if atom.GetSymbol() == "N" and len(atom.GetNeighbors()) == 2:
        mol = atom.GetOwningMol()
        nei1, nei2 = atom.GetNeighbors()[0], atom.GetNeighbors()[1]
        if nei1.GetSymbol() == "N" and nei2.GetSymbol() == "N":
            if is_terminal_nitrogen(nei1):
                N_terminal = nei1
                N_non_terminal = nei2
            elif is_terminal_nitrogen(nei2):
                N_terminal = nei2
                N_non_terminal = nei1
            else:
                N_terminal = None
                N_non_terminal = None
            if (N_terminal is not None) and (N_non_terminal is not None):
                # set X-N=[N+]=[N-]
                if verbose:
                    print("Detecting nitrine group, fixing it...")
                bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_terminal.GetIdx())
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
                N_terminal.SetFormalCharge(-1)

                bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_non_terminal.GetIdx())
                bond.SetBondType(Chem.rdchem.BondType.DOUBLE)
                atom.SetFormalCharge(1)


def contain_hetero_aromatic(mol):
    """True when any aromatic atom is not a carbon."""
    flag = False
    for atom in mol.GetAtoms():
        if atom.GetSymbol() != "C" and atom.GetIsAromatic():
            flag = True
            break
    return flag


# for carbon with explicit valence > 4
def regularize_carbon_bond_order(atom, verbose=True):
    """Demote all but one double bond on an over-valent carbon to singles."""
    from rdkit import Chem

    if atom.GetSymbol() == "C" and get_explicit_valence(atom) > 4:
        if verbose:
            print("Detecting carbon with explicit valence > 4, fixing it...")
        mol = atom.GetOwningMol()
        double_bond_idx = -1
        # keep the first double bond found, single-ize the rest
        for nei in atom.GetNeighbors():
            bond = mol.GetBondBetweenAtoms(atom.GetIdx(), nei.GetIdx())
            if bond.GetBondTypeAsDouble() == 2:
                double_bond_idx = bond.GetIdx()
                break
        if double_bond_idx != -1:
            for bond in atom.GetBonds():
                if bond.GetIdx() != double_bond_idx:
                    bond.SetBondType(Chem.rdchem.BondType.SINGLE)


# for nitrogen with explicit valence > 4
def regularize_nitrogen_bond_order(atom, verbose=True):
    """Convert N=O bonds on an over-valent nitrogen to N-O(-1) singles."""
    from rdkit import Chem

    mol = atom.GetOwningMol()
    if atom.GetSymbol() == "N" and get_explicit_valence(atom) > 4:
        O_atoms = get_terminal_oxygens(atom)
        for O_atom in O_atoms:
            bond = mol.GetBondBetweenAtoms(atom.GetIdx(), O_atom.GetIdx())
            if bond.GetBondTypeAsDouble() == 2:
                bond.SetBondType(Chem.rdchem.BondType.SINGLE)
                O_atom.SetFormalCharge(-1)


def sanitize_mol(mol, verbose=False):
    """Run every functional-group fixer over every atom, in a fixed order,
    mutating ``mol`` in place; returns ``mol``."""
    for atom in mol.GetAtoms():
        sanitize_carboxyl_Catom(atom, verbose)
        sanitize_guanidine_Catom(atom, verbose)
        sanitize_phosphate_Patom(atom, verbose)
        sanitize_sulfate_Satom(atom, verbose)
        sanitize_nitro_Natom(atom, verbose)
        sanitize_nitrine_Natom(atom, verbose)
        regularize_carbon_bond_order(atom, verbose)
        regularize_nitrogen_bond_order(atom, verbose)
    return mol
# copy from FEprep
def mol_edit_log(mol, i, j):
    """Append the edited bond (i_j) to the mol's 'edit' property log."""
    if not mol.HasProp("edit"):
        mol.SetProp("edit", "%d_%d" % (i, j))  # noqa: UP031
    else:
        edited = mol.GetProp("edit")
        mol.SetProp("edit", edited + ",%d_%d" % (i, j))  # noqa: UP031


def kekulize_aromatic_heterocycles(mol_in, assign_formal_charge=True, sanitize=True):
    """Manually kekulize aromatic heterocycles that rdkit cannot sanitize.

    Walks each hetero-aromatic ring and rewrites AROMATIC bonds into
    alternating SINGLE/DOUBLE bonds, starting from existing double bonds,
    ring-fusion bonds and hetero atoms (in that order); all-carbon aromatic
    rings are restored to AROMATIC afterwards.  Returns the edited mol (or
    ``mol_in`` unchanged when no heterocycle is present).

    Raises
    ------
    RuntimeError
        When ``sanitize`` is True and the kekulized mol still fails
        ``Chem.SanitizeMol``.
    """
    from rdkit import Chem
    from rdkit.Chem.rdchem import BondType

    mol = Chem.RWMol(mol_in)
    rings = Chem.rdmolops.GetSymmSSSR(mol)
    rings = [list(i) for i in list(rings)]
    rings.sort(key=lambda r: len(r))

    def search_and_assign_ring(
        mol, ring, hetero, start, forward=True, start_switch=True
    ):
        # Walk the ring from ``start`` assigning alternating SINGLE/DOUBLE
        # to still-AROMATIC bonds; stop at a non-aromatic bond or at a
        # hetero atom about to receive a double bond.
        # NOTE(review): this returns (n_edit, n_double) but every caller
        # unpacks it as ``d, e`` and subtracts ``d`` from n_targetDouble and
        # ``e`` from n_targetEdit — the two counters appear swapped; confirm
        # against upstream before relying on the target bookkeeping.
        j = start
        switch = start_switch
        lring = len(ring)
        delta = 1 if forward else -1
        n_edit = 0
        n_double = 0
        while not ((j in hetero) & (not switch)):
            btype = BondType.SINGLE if switch else BondType.DOUBLE
            bond = mol.GetBondBetweenAtoms(ring[j], ring[(j + delta) % lring])
            if bond.GetBondType() == BondType.AROMATIC:
                bond.SetBondType(btype)
                mol_edit_log(mol, ring[j], ring[(j + delta) % lring])
                if btype == BondType.DOUBLE:
                    n_double += 1
                n_edit += 1
            else:
                break
            j = (j + delta) % lring
            switch = not switch
        return n_edit, n_double

    def print_bondtypes(mol, rings):
        # debug dump of every ring's atoms and bond types
        for ring in rings:
            lring = len(ring)
            btype = []
            for i in range(lring):
                btype.append(
                    mol.GetBondBetweenAtoms(
                        ring[i], ring[(i + 1) % lring]
                    ).GetBondType()
                )
            atoms = [mol.GetAtomWithIdx(i).GetSymbol() for i in ring]
            print(ring)
            print(atoms)
            print(btype)

    def hetero_priority(idx, mol):
        # Lower value = handled earlier: divalent O/S (0), trivalent
        # N/P/As/B (1), divalent N/P/As/B (2).
        # NOTE(review): implicitly returns None for any other combination,
        # which would break the sort below — presumably unreachable for
        # aromatic ring atoms; confirm.
        atom = mol.GetAtomWithIdx(idx)
        sym = atom.GetSymbol()
        valence = len(atom.GetBonds())

        if (sym in ["O", "S"]) & (valence == 2):
            return 0
        elif sym in ["N", "P", "As", "B"]:
            if valence == 3:
                return 1
            elif valence == 2:
                return 2

    # save carbon/hetero aromatic rings
    CAr = []
    HAr = []
    for ring in rings:
        lring = len(ring)
        bAllAr = True
        bAllC = True
        for i in range(lring):
            atom = mol.GetAtomWithIdx(ring[i])
            if atom.GetSymbol() != "C":
                bAllC = False

            bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])
            if bond.GetBondType() != BondType.AROMATIC:
                bAllAr = False
        if bAllAr and bAllC:
            CAr.append(ring)
        elif bAllAr and not bAllC:
            HAr.append(ring)

    if len(HAr) == 0:
        # no heterocycles: nothing to do
        return mol_in
    else:
        # edit heterocycles
        for ring in HAr:
            lring = len(ring)
            cring = len(CAr)
            hetero = []          # ring-local indices of hetero atoms
            hasDouble = []       # ring-local indices already carrying a double bond
            fuseCAr = []         # per-position index of the fused carbon ring, or -1
            fuseDouble = []      # ring-local indices whose backward bond is a fusion bond
            for i in range(lring):
                fuseCAr.append(-1)
                for j in range(cring):
                    if ring[i] in CAr[j]:
                        fuseCAr[i] = j
                        break
                if i > 1:
                    if (fuseCAr[i] == fuseCAr[i - 1]) & (fuseCAr[i] >= 0):
                        fuseDouble.append(i)
                atom = mol.GetAtomWithIdx(ring[i])
                if atom.GetSymbol() != "C":
                    hetero.append(i)
                atom_bonds = atom.GetBonds()
                btype = [bond.GetBondType() for bond in atom_bonds]
                if BondType.DOUBLE in btype:
                    hasDouble.append(i)
                bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])

            # wrap-around fusion between last and first ring position
            if (fuseCAr[0] == fuseCAr[lring - 1]) & (fuseCAr[0] >= 0):
                fuseDouble.append(0)

            if (len(hetero) > 0) | (len(hasDouble) > 0):
                n_targetDouble = lring // 2
                n_targetEdit = lring
                hetero_prior = {i: hetero_priority(ring[i], mol) for i in hetero}
                hetero.sort(key=lambda i: hetero_prior[i])
                # 1) propagate from pre-existing double bonds
                for i in hasDouble:
                    d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True)
                    d2, e2 = search_and_assign_ring(mol, ring, hetero, i, forward=False)
                    n_targetDouble -= d1 + d2 + 1
                    n_targetEdit -= e1 + e2
                # 2) force fusion bonds to DOUBLE and propagate outwards
                for i in fuseDouble:
                    bond = mol.GetBondBetweenAtoms(ring[i], ring[(i - 1) % lring])
                    if bond.GetBondType() == BondType.AROMATIC:
                        bond.SetBondType(BondType.DOUBLE)
                        mol_edit_log(mol, ring[i], ring[(i - 1) % lring])
                    d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True)
                    d2, e2 = search_and_assign_ring(
                        mol, ring, hetero, (i - 1) % lring, forward=False
                    )
                    n_targetDouble -= d1 + d2 + 1
                    n_targetEdit -= e1 + e2 + 1
                # 3) finish from hetero atoms, respecting already-fixed bonds
                for i in hetero:
                    atom = mol.GetAtomWithIdx(ring[i])
                    if (hetero_prior[i] == 2) | (n_targetDouble * 2 >= n_targetEdit):
                        forward_btype = mol.GetBondBetweenAtoms(
                            ring[i], ring[(i + 1) % lring]
                        ).GetBondType()
                        backward_btype = mol.GetBondBetweenAtoms(
                            ring[i], ring[(i - 1) % lring]
                        ).GetBondType()
                        if forward_btype != BondType.AROMATIC:
                            switch = forward_btype == BondType.DOUBLE
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=False, start_switch=switch
                            )
                            d2 = e2 = 0
                        elif backward_btype != BondType.AROMATIC:
                            switch = backward_btype == BondType.DOUBLE
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=True, start_switch=switch
                            )
                            d2 = e2 = 0
                        else:
                            d1, e1 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=True, start_switch=True
                            )
                            d2, e2 = search_and_assign_ring(
                                mol, ring, hetero, i, forward=False, start_switch=False
                            )
                        n_targetDouble -= d1 + d2
                        n_targetEdit -= e1 + e2
                    else:
                        d1, e1 = search_and_assign_ring(
                            mol, ring, hetero, i, forward=True, start_switch=True
                        )
                        d2, e2 = search_and_assign_ring(
                            mol, ring, hetero, i, forward=False, start_switch=True
                        )
                        n_targetDouble -= d1 + d2
                        n_targetEdit -= e1 + e2

        # restore pure-carbon aromatic rings
        for ring in CAr:
            lring = len(ring)
            for i in range(lring):
                bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring])
                bond.SetBondType(BondType.AROMATIC)
        print("Manual kekulization for aromatic heterocycles:")
        print_bondtypes(mol, rings)

        # rebuild atoms to drop stale aromaticity/valence caches
        atoms = mol.GetAtoms()
        for i in range(len(atoms)):
            mol.ReplaceAtom(i, Chem.Atom(atoms[i].GetSymbol()))
        mol_edited = mol.GetMol()
        # charge assignment
        if assign_formal_charge:
            mol_edited = regularize_formal_charges(mol_edited, sanitize=False)
        if not sanitize:
            return mol_edited
        else:
            try:
                Chem.SanitizeMol(mol_edited)
                return mol_edited
            except Exception as e:
                raise RuntimeError(
                    f"Manual kekulization for aromatic heterocycles failed, below are errors:\n\t{e}"
                )


def convert_by_obabel(
    mol, cache_dir=os.path.join(os.getcwd(), ".cache"), obabel_path="obabel"
):
    """Round-trip ``mol`` through openbabel via .mol files in ``cache_dir``.

    NOTE(review): the ``cache_dir`` default is evaluated once at import
    time with ``os.getcwd()``; processes that chdir later still cache in
    the original working directory.  ``obabel_path`` is accepted but the
    python openbabel bindings are used instead of the CLI.
    """
    from openbabel import openbabel
    from rdkit import Chem

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    if mol.HasProp("_Name"):
        name = mol.GetProp("_Name")
    else:
        name = f"mol{int(time.time())}"
    mol_file_in = os.path.join(cache_dir, f"{name}.mol")
    mol_file_out = os.path.join(cache_dir, f"{name}_obabel.mol")
    Chem.MolToMolFile(mol, mol_file_in, kekulize=False)
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("mol", "mol")
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, mol_file_in)
    obConversion.WriteFile(mol, mol_file_out)
    mol_obabel = Chem.MolFromMolFile(mol_file_out, removeHs=False, sanitize=False)
    return mol_obabel


def super_sanitize_mol(mol, name=None, verbose=True):
    """Two-stage sanitization: the in-house fixers first, then an
    openbabel round-trip plus manual kekulization as a fallback.

    Returns the sanitized mol, or None when both stages fail.
    """
    from rdkit import Chem

    if name is None:
        if mol.HasProp("_Name"):
            name = mol.GetProp("_Name")
        else:
            name = "mol"
    try:
        if verbose:
            print("=====Stage 1: use Hermite procedure=====")
        # use our procedure
        mol = sanitize_mol(mol, verbose)
        mol = regularize_formal_charges(mol, sanitize=False)
        # sanitize a copy so the un-sanitized mol survives for stage 2
        mol_copy = deepcopy(mol)
        Chem.SanitizeMol(mol_copy)
        if verbose:
            print(name, "Success.")
        return mol_copy
    except Exception as e:  # noqa: F841  # NOTE(review): stage-1 error is discarded
        try:
            if verbose:
                print(
                    "Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel"
                )
                print("=====Stage 2: re-try with obabel=====")
            mol = convert_by_obabel(mol)
            mol = sanitize_mol(mol, verbose)
            mol = kekulize_aromatic_heterocycles(
                mol, assign_formal_charge=False, sanitize=False
            )  # aromatic heterocycles
            mol = regularize_formal_charges(mol, sanitize=False)
            mol_copy = deepcopy(mol)
            Chem.SanitizeMol(mol_copy)
            if verbose:
                print(name, "Success.")
            return mol_copy
        except Exception as e:
            if verbose:
                print(e)
                print(name, "Failed!")
            return None
__init__(self, level="medium", raise_errors=True, verbose=False): + """Set up sanitizer. + --------. + + Parameters + ---------- + level : 'low', 'medium' or 'high'. + `low` - use rdkit.Chem.SanitizeMol() to sanitize + `medium` - before using rdkit, assign formal charges of each atom first, which requires + the rightness of bond order information + `high` - try to regularize bond order of nitro, phosphate, sulfate, nitrine, guanidine, + pyridine-oxide function groups and aromatic heterocycles. If failed, the program + will call obabel to pre-process the mol object and re-try the procedure. + raise_errors : bool, default=True + If True, raise SanitizeError when failed. + verbose : bool, default=False + If True, print error information when failed. + """ + self._check_level(level) + self.level = level + self.raise_errors = raise_errors + self.verbose = verbose + + def _check_level(self, level): + if level not in ["low", "medium", "high"]: + raise ValueError( + f"Invalid level '{level}', please set to 'low', 'medium' or 'high'" + ) + + def _handle_exception(self, error_info): + if self.raise_errors: + raise SanitizeError(error_info) + elif self.verbose: + print(error_info) + + def sanitize(self, mol): + """Sanitize mol according to `self.level`. If failed, return None.""" + from rdkit import Chem + + if self.level == "low": + try: + Chem.SanitizeMol(mol) + return mol + except Exception as e: + error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'medium' or 'high'. The error occurs:\n\t{e}" + self._handle_exception(error_info) + return None + elif self.level == "medium": + try: + mol = regularize_formal_charges(mol, sanitize=False) + Chem.SanitizeMol(mol) + return mol + except Exception as e: + error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'high'. 
The error occurs:\n\t{e}" + self._handle_exception(error_info) + return None + elif self.level == "high": + mol = super_sanitize_mol(mol, verbose=self.verbose) + error_info = "Sanitization Failed. Please check your molecule file." + if mol is None: + self._handle_exception(error_info) + return mol + + +class SanitizeError(Exception): + def __init__(self, content="Sanitization Failed."): + self.content = content + + def __str__(self): + return self.content + + def __repr__(self): + return self.__str__() diff --git a/dpdata/formats/rdkit/utils.py b/dpdata/formats/rdkit/utils.py new file mode 100644 index 000000000..efeef6070 --- /dev/null +++ b/dpdata/formats/rdkit/utils.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import numpy as np + + +def mol_to_system_data(mol): + from rdkit import Chem + + if not isinstance(mol, Chem.rdchem.Mol): + raise TypeError(f"rdkit.Chem.Mol required, not {type(mol)}") + + num_confs = mol.GetNumConformers() + if num_confs: + atom_symbols = [at.GetSymbol() for at in mol.GetAtoms()] + atom_names, atom_types, atom_numbs = np.unique( + atom_symbols, return_inverse=True, return_counts=True + ) + coords = np.array([conf.GetPositions() for conf in mol.GetConformers()]) + bonds = np.array( + [ + [ + bond.GetBeginAtomIdx(), + bond.GetEndAtomIdx(), + bond.GetBondTypeAsDouble(), + ] + for bond in mol.GetBonds() + ] + ) + formal_charges = np.array( + [at.GetFormalCharge() for at in mol.GetAtoms()], dtype=np.int32 + ) + data = {} + data["atom_numbs"] = list(atom_numbs) + data["atom_names"] = list(atom_names) + data["atom_types"] = atom_types + data["cells"] = np.array( + [ + [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] + for _ in range(num_confs) + ] + ) + data["coords"] = coords + data["bonds"] = bonds + data["formal_charges"] = formal_charges + data["orig"] = np.array([0.0, 0.0, 0.0]) + # other properties + if mol.HasProp("_Name"): + data["_name"] = mol.GetProp("_Name") + return data + else: + raise ValueError("The 
moleclue does not contain 3-D conformers") + + +def system_data_to_mol(data): + from rdkit import Chem + + mol_ed = Chem.RWMol() + atom_symbols = [data["atom_names"][i] for i in data["atom_types"]] + # add atoms + for atom_type in data["atom_types"]: + symbol = data["atom_names"][atom_type] + atom = Chem.Atom(symbol) + mol_ed.AddAtom(atom) + # add bonds + for bond_info in data["bonds"]: + if bond_info[2] == 1: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.SINGLE) + elif bond_info[2] == 2: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.DOUBLE) + elif bond_info[2] == 3: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.TRIPLE) + elif bond_info[2] == 1.5: + mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.AROMATIC) + # set conformers + for frame_idx in range(data["coords"].shape[0]): + conf = Chem.rdchem.Conformer(len(data["atom_types"])) + for atom_idx in range(len(data["atom_types"])): + conf.SetAtomPosition(atom_idx, data["coords"][frame_idx][atom_idx]) + mol_ed.AddConformer(conf, assignId=True) + mol = mol_ed.GetMol() + # set formal charges + for idx, atom in enumerate(mol.GetAtoms()): + atom.SetFormalCharge(int(data["formal_charges"][idx])) + # set mol name + if "_name" in list(data.keys()): + mol.SetProp("_Name", data["_name"]) + # sanitize + Chem.SanitizeMol(mol_ed) + return mol + + +def check_same_atom(atom_1, atom_2): + if atom_1.GetIdx() != atom_2.GetIdx(): + return False + elif atom_1.GetSymbol() != atom_2.GetSymbol(): + return False + else: + return True + + +def check_same_molecule(mol_1, mol_2): + flag = True + for bond_1, bond_2 in zip(mol_1.GetBonds(), mol_2.GetBonds()): + begin_atom_1, end_atom_1 = bond_1.GetBeginAtom(), bond_1.GetEndAtom() + begin_atom_2, end_atom_2 = bond_2.GetBeginAtom(), bond_2.GetEndAtom() + if not check_same_atom(begin_atom_1, begin_atom_2): + flag = False + break + elif not check_same_atom(end_atom_1, end_atom_2): + flag = False + break + return 
flag + + +def check_molecule_list(mols): + flag = True + for mol in mols[1:]: + if not check_same_molecule(mol, mols[0]): + flag = False + break + return flag + + +def combine_molecules(mols): + if check_molecule_list(mols): + for mol in mols[1:]: + for conf in mol.GetConformers(): + mols[0].AddConformer(conf, assignId=True) + return mols[0] + else: + raise ValueError("molecules are not of the same topology.") diff --git a/dpdata/formats/siesta/__init__.py b/dpdata/formats/siesta/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/siesta/aiMD_output.py b/dpdata/formats/siesta/aiMD_output.py new file mode 100644 index 000000000..daa4f6a25 --- /dev/null +++ b/dpdata/formats/siesta/aiMD_output.py @@ -0,0 +1,187 @@ +# !/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +ev2ev = 1 +ang2ang = 1 + + +#############################read output##################################### +def get_single_line_tail(fin, keyword, num=1): + file = open(fin) + part_res = [] + for value in file: + if keyword in value: + temp = len(value.split()) - num + part_res.append(float(value.split()[temp])) + + file.close() + return part_res + + +## atomnum: number of atoms, row numbers +## begin_column: begin column num +## read_column_num: read column num +## column_num: the column number in nxet reading line +def extract_keyword( + fout, + keyword, + down_line_num, + begin_column, + read_column_num, + is_repeated_read, + column_num, +): + file = open(fout) + ret = [] + part_ret = [] + flag = 0 + idx = 0 + extr_frame = 0 + length = obtain_nframe(fout) + # for (num,value) in enumerate(file): + for value in file: + if keyword in value: + flag = 1 + continue + if flag == 1: + if idx < down_line_num: + idx += 1 + else: + flag = 0 + part_ret.append(np.array(ret)) + ret = [] + extr_frame += 1 + if extr_frame == length: + file.close() + return part_ret + ## is_repeated_read = 0: only read 1 time for SCF + ## is_repeated_read = 1: read all for 
aiMD --> get all frames + if is_repeated_read: + idx = 0 + continue + + for i in range(begin_column, read_column_num): + if len(value.split()) == column_num: + if not value.split()[i].isalpha(): + ret.append(float(value.strip().split()[i])) + else: + ret.append(value.strip().split()[i]) + continue + file.close() + return part_ret + + +def obtain_nframe(fname): + fp = open(fname) + flag = False + idx = 0 + temp = 0 + for ii in fp: + if "siesta: Stress tensor (static) (eV/Ang**3):" in ii: + flag = True + continue + if flag: + if "siesta: Pressure (static):" not in ii: + if len(ii.split()) == 3: + temp += 1 + if temp == 3: + idx += 1 + # print(idx) + flag = False + temp = 0 + fp.close() + return idx + + +def get_atom_types(fout, atomnums): + covert_type = extract_keyword( + fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4, 0, 6 + )[0] + atomtype = [] + # print(covert_type) + for i in range(0, len(covert_type)): + atomtype.append(int(covert_type[i]) - 1) + return atomtype + + +def get_atom_name(fout): + file = open(fout) + ret = [] + for value in file: + if "Species number:" in value: + for j in range(len(value.split())): + if value.split()[j] == "Label:": + ret.append(value.split()[j + 1]) + break + file.close() + return ret + + +def get_atom_numbs(atomtypes): + atom_numbs = [] + for i in set(atomtypes): + atom_numbs.append(atomtypes.count(i)) + return atom_numbs + + +def get_virial(fout, cell): + viri = extract_keyword( + fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 0, 3, 1, 3 + ) + vols = [] + length = obtain_nframe(fout) + for ii in range(length): + vols.append(np.linalg.det(cell[ii].reshape([3, 3]))) + for jj in range(len(viri[ii])): + ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa + # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) + viri[ii][jj] *= vols[ii] + return viri + + +def covert_dimension(arr, num): + arr = np.array(arr) + frames = len(arr) + ret = np.zeros((frames, num, 3)) + for i in range(frames): + ret[i] = 
arr[i].reshape(num, 3) + return ret + + +def get_aiMD_frame(fname): + NumberOfSpecies = int( + get_single_line_tail(fname, "redata: Number of Atomic Species")[0] + ) + atom_names = get_atom_name(fname) + tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) + + atom_types = get_atom_types(fname, tot_natoms) + atom_numbs = get_atom_numbs(atom_types) + assert max(atom_types) + 1 == NumberOfSpecies + + cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3, 1, 3) + coord = extract_keyword( + fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3, 1, 6 + ) + energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") + force = extract_keyword( + fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4, 1, 4 + ) + virial = get_virial(fname, cell) + + cells = covert_dimension(np.array(cell), 3) + coords = covert_dimension(np.array(coord), tot_natoms) + forces = covert_dimension(np.array(force), tot_natoms) + virials = covert_dimension(np.array(virial), 3) + return ( + atom_names, + atom_numbs, + np.array(atom_types), + cells, + coords, + np.array(energy), + forces, + virials, + ) diff --git a/dpdata/formats/siesta/output.py b/dpdata/formats/siesta/output.py new file mode 100644 index 000000000..0c944d5b5 --- /dev/null +++ b/dpdata/formats/siesta/output.py @@ -0,0 +1,142 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + +ev2ev = 1 +ang2ang = 1 + + +#############################read output##################################### +def get_single_line_tail(fin, keyword, num=1): + file = open(fin) + res = [] + for value in file: + if keyword in value: + temp = len(value.split()) - num + res.append(float(value.split()[temp])) + file.close() + return res + return res + + +## atomnum: number of atoms, row numbers +## begin_column: begin column num +## column_num: read column num +def extract_keyword(fout, keyword, down_line_num, begin_column, column_num): + file = open(fout) + ret = [] + flag = 0 + idx = 
0 + # for (num,value) in enumerate(file): + for value in file: + if keyword in value: + flag = 1 + continue + if flag == 1: + if idx < down_line_num: + idx += 1 + else: + flag = 0 + continue + if len(value.split()) >= column_num: + for i in range(begin_column, column_num): + if not value.split()[i].isalpha(): + ret.append(float(value.strip().split()[i])) + else: + ret.append(value.strip().split()[i]) + ## compatible siesta-4.0.2 and siesta-4.1-b4 + else: + flag = 0 + idx = 0 + file.close() + return ret + + +def get_atom_types(fout, atomnums): + covert_type = extract_keyword( + fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4 + ) + atomtype = [] + for i in range(0, len(covert_type)): + atomtype.append(int(covert_type[i]) - 1) + return atomtype + + +def get_atom_name(fout): + file = open(fout) + ret = [] + for value in file: + if "Species number:" in value: + for j in range(len(value.split())): + if value.split()[j] == "Label:": + ret.append(value.split()[j + 1]) + break + file.close() + return ret + + +def get_atom_numbs(atomtypes): + atom_numbs = [] + for i in set(atomtypes): + atom_numbs.append(atomtypes.count(i)) + return atom_numbs + + +def get_virial(fout, cells): + vols = [] + for ii in cells: + ### calucate vol + vols.append(np.linalg.det(ii.reshape([3, 3]))) + ret = extract_keyword(fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 1, 4) + ret = np.array([ret]) + for idx, ii in enumerate(ret): + ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa + # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) + ii *= vols[idx] + return ret + + +def obtain_frame(fname): + NumberOfSpecies = int( + get_single_line_tail(fname, "redata: Number of Atomic Species")[0] + ) + atom_names = get_atom_name(fname) + tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) + atom_types = get_atom_types(fname, tot_natoms) + atom_numbs = get_atom_numbs(atom_types) + assert max(atom_types) + 1 == NumberOfSpecies + cell = extract_keyword(fname, 
"outcell: Unit cell vectors (Ang):", 3, 0, 3) + coord = extract_keyword( + fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3 + ) + energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") + force = extract_keyword(fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4) + virial = get_virial(fname, np.array([cell])) + + cell = np.array(cell).reshape(3, 3) + coord = np.array(coord).reshape(tot_natoms, 3) + force = np.array(force).reshape(tot_natoms, 3) + virial = np.array(virial).reshape(3, 3) + + # data = {} + # data['orig'] = np.array([0, 0, 0]) + # data['atom_names'] = atom_names + # data['atom_numbs'] = atom_numbs + # data['atom_types'] = np.array(atom_types) + # data['cells'] = np.array([cell]) + # data['coords'] = np.array([coord]) + # data['energies'] = np.array([energy]) + # data['forces'] = np.array([force]) + # data['virials'] = virial + # return data + return ( + atom_names, + atom_numbs, + np.array(atom_types), + np.array([cell]), + np.array([coord]), + np.array(energy), + np.array([force]), + np.array([virial]), + ) diff --git a/dpdata/formats/vasp/__init__.py b/dpdata/formats/vasp/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/vasp/outcar.py b/dpdata/formats/vasp/outcar.py new file mode 100644 index 000000000..a16fd6f9f --- /dev/null +++ b/dpdata/formats/vasp/outcar.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import re +import warnings + +import numpy as np + + +def atom_name_from_potcar_string(instr: str) -> str: + """Get atom name from a potcar element name. + + e.g. 
Sn_d -> Sn + + Parameters + ---------- + instr : str + input potcar element name + + Returns + ------- + name: str + name of atoms + """ + if "_" in instr: + # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 + return instr.split("_")[0] + else: + return instr + + +def system_info( + lines: list[str], + type_idx_zero: bool = False, +) -> tuple[list[str], list[int], np.ndarray, int | None, int | None]: + """Get system information from lines of an OUTCAR file. + + Parameters + ---------- + lines : list[str] + the lines of the OUTCAR file + type_idx_zero : bool + if true atom types starts from 0 otherwise from 1. + + Returns + ------- + atom_names: list[str] + name of atoms + atom_numbs: list[int] + number of atoms that have a certain name. same length as atom_names + atom_types: np.ndarray + type of each atom, the array has same length as number of atoms + nelm: optional[int] + the value of NELM parameter + nwrite: optional[int] + the value of NWRITE parameter + """ + atom_names = [] + atom_names_potcar = [] + atom_numbs = None + nelm = None + nwrite = None + for ii in lines: + if "TITEL" in ii: + # get atom names from POTCAR info, tested only for PAW_PBE ... + # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 + _ii = ii.split()[3] + atom_names.append(atom_name_from_potcar_string(_ii)) + elif "POTCAR:" in ii: + # get atom names from POTCAR info, tested only for PAW_PBE ... 
+ # for case like : POTCAR: PAW_PBE Ti 08Apr2002 + _ii = ii.split()[2] + atom_names_potcar.append(atom_name_from_potcar_string(_ii)) + # a stricker check for "NELM"; compatible with distingct formats in different versions(6 and older, newers_expect-to-work) of vasp + elif nelm is None: + m = re.search(r"NELM\s*=\s*(\d+)", ii) + if m: + nelm = int(m.group(1)) + elif nwrite is None: + m = re.search(r"NWRITE\s*=\s*(\d+)", ii) + if m: + nwrite = int(m.group(1)) + if "ions per type" in ii: + atom_numbs_ = [int(s) for s in ii.split()[4:]] + if atom_numbs is None: + atom_numbs = atom_numbs_ + else: + assert atom_numbs == atom_numbs_, "in consistent numb atoms in OUTCAR" + if len(atom_names) == 0: + # try to use atom_names_potcar + if len(atom_names_potcar) == 0: + raise ValueError("cannot get atom names from potcar") + nnames = len(atom_names_potcar) + # the names are repeated. check if it is the case + assert atom_names_potcar[: nnames // 2] == atom_names_potcar[nnames // 2 :] + atom_names = atom_names_potcar[: nnames // 2] + assert nelm is not None, "cannot find maximum steps for each SC iteration" + assert atom_numbs is not None, "cannot find ion type info in OUTCAR" + if len(atom_numbs) != len(atom_names): + raise RuntimeError( + f"The number of the atom numbers per each type ({len(atom_numbs)}) " + f"does not match that of the atom types ({len(atom_names)}) detected " + f"from the OUTCAR. This issue may be cause by a bug in vasp <= 6.3. " + f"Please try to convert data from vasprun.xml instead." 
+ ) + atom_names = atom_names[: len(atom_numbs)] + atom_types = [] + for idx, ii in enumerate(atom_numbs): + for jj in range(ii): + if type_idx_zero: + atom_types.append(idx) + else: + atom_types.append(idx + 1) + return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm, nwrite + + +def get_outcar_block(fp, ml=False): + blk = [] + energy_token = ["free energy TOTEN", "free energy ML TOTEN"] + ml_index = int(ml) + for ii in fp: + if not ii: + return blk + blk.append(ii.rstrip("\n")) + if energy_token[ml_index] in ii: + return blk + return blk + + +def check_outputs(coord, cell, force): + if len(force) == 0: + raise ValueError("cannot find forces in OUTCAR block") + if len(coord) == 0: + raise ValueError("cannot find coordinates in OUTCAR block") + if len(cell) == 0: + raise ValueError("cannot find cell in OUTCAR block") + return True + + +# we assume that the force is printed ... +def get_frames(fname, begin=0, step=1, ml=False, convergence_check=True): + with open(fname) as fp: + return _get_frames_lower( + fp, + fname, + begin=begin, + step=step, + ml=ml, + convergence_check=convergence_check, + ) + + +def _get_frames_lower(fp, fname, begin=0, step=1, ml=False, convergence_check=True): + blk = get_outcar_block(fp) + + atom_names, atom_numbs, atom_types, nelm, nwrite = system_info( + blk, type_idx_zero=True + ) + ntot = sum(atom_numbs) + + all_coords = [] + all_cells = [] + all_energies = [] + all_forces = [] + all_virials = [] + + cc = 0 + rec_failed = [] + while len(blk) > 0: + if cc >= begin and (cc - begin) % step == 0: + coord, cell, energy, force, virial, is_converge = analyze_block( + blk, ntot, nelm, ml + ) + if energy is None: + break + if nwrite == 0: + has_label = len(force) > 0 and len(coord) > 0 and len(cell) > 0 + if not has_label: + warnings.warn("cannot find labels in the frame, ingore") + else: + has_label = check_outputs(coord, cell, force) + if (is_converge or not convergence_check) and has_label: + all_coords.append(coord) + 
all_cells.append(cell) + all_energies.append(energy) + all_forces.append(force) + if virial is not None: + all_virials.append(virial) + if not is_converge: + rec_failed.append(cc + 1) + + blk = get_outcar_block(fp, ml) + cc += 1 + + if len(rec_failed) > 0: + prt = ( + "so they are not collected." + if convergence_check + else "but they are still collected due to the requirement for ignoring convergence checks." + ) + warnings.warn( + f"The following structures were unconverged: {rec_failed}; " + prt + ) + + if len(all_virials) == 0: + all_virials = None + else: + all_virials = np.array(all_virials) + return ( + atom_names, + atom_numbs, + atom_types, + np.array(all_cells), + np.array(all_coords), + np.array(all_energies), + np.array(all_forces), + all_virials, + ) + + +def analyze_block(lines, ntot, nelm, ml=False): + coord = [] + cell = [] + energy = None + force = [] + virial = None + is_converge = True + sc_index = 0 + # select different searching tokens based on the ml label + energy_token = ["free energy TOTEN", "free energy ML TOTEN"] + energy_index = [4, 5] + virial_token = ["FORCE on cell =-STRESS in cart. coord. 
units", "ML FORCE"] + virial_index = [14, 4] + cell_token = ["VOLUME and BASIS", "ML FORCE"] + cell_index = [5, 12] + ml_index = int(ml) + for idx, ii in enumerate(lines): + # if set ml == True, is_converged will always be True + if ("Iteration" in ii) and (not ml): + sc_index = int(ii.split()[3][:-1]) + if sc_index >= nelm: + is_converge = False + elif energy_token[ml_index] in ii: + energy = float(ii.split()[energy_index[ml_index]]) + return coord, cell, energy, force, virial, is_converge + elif cell_token[ml_index] in ii: + for dd in range(3): + tmp_l = lines[idx + cell_index[ml_index] + dd] + cell.append([float(ss) for ss in tmp_l.replace("-", " -").split()[0:3]]) + elif virial_token[ml_index] in ii: + in_kB_index = virial_index[ml_index] + while idx + in_kB_index < len(lines) and ( + not lines[idx + in_kB_index].split()[0:2] == ["in", "kB"] + ): + in_kB_index += 1 + assert idx + in_kB_index < len(lines), ( + 'ERROR: "in kB" is not found in OUTCAR. Unable to extract virial.' + ) + tmp_v = [float(ss) for ss in lines[idx + in_kB_index].split()[2:8]] + virial = np.zeros([3, 3]) + virial[0][0] = tmp_v[0] + virial[1][1] = tmp_v[1] + virial[2][2] = tmp_v[2] + virial[0][1] = tmp_v[3] + virial[1][0] = tmp_v[3] + virial[1][2] = tmp_v[4] + virial[2][1] = tmp_v[4] + virial[0][2] = tmp_v[5] + virial[2][0] = tmp_v[5] + elif "TOTAL-FORCE" in ii and (("ML" in ii) == ml): + for jj in range(idx + 2, idx + 2 + ntot): + tmp_l = lines[jj] + info = [float(ss) for ss in tmp_l.split()] + coord.append(info[:3]) + force.append(info[3:6]) + return coord, cell, energy, force, virial, is_converge diff --git a/dpdata/formats/vasp/poscar.py b/dpdata/formats/vasp/poscar.py new file mode 100644 index 000000000..78b8dbbeb --- /dev/null +++ b/dpdata/formats/vasp/poscar.py @@ -0,0 +1,134 @@ +#!/usr/bin/python3 +from __future__ import annotations + +import numpy as np + + +def _to_system_data_lower(lines, cartesian=True, selective_dynamics=False): + def move_flag_mapper(flag): + if flag == "T": + 
return True + elif flag == "F": + return False + else: + raise RuntimeError(f"Invalid move flag: {flag}") + + """Treat as cartesian poscar.""" + system = {} + system["atom_names"] = [str(ii) for ii in lines[5].split()] + system["atom_numbs"] = [int(ii) for ii in lines[6].split()] + scale = float(lines[1]) + cell = [] + move_flags = [] + for ii in range(2, 5): + boxv = [float(jj) for jj in lines[ii].split()] + boxv = np.array(boxv) * scale + cell.append(boxv) + system["cells"] = [np.array(cell)] + natoms = sum(system["atom_numbs"]) + coord = [] + for ii in range(8, 8 + natoms): + tmp = lines[ii].split() + tmpv = [float(jj) for jj in tmp[:3]] + if cartesian: + tmpv = np.array(tmpv) * scale + else: + tmpv = np.matmul(np.array(tmpv), system["cells"][0]) + coord.append(tmpv) + if selective_dynamics: + if len(tmp) == 6: + move_flags.append(list(map(move_flag_mapper, tmp[3:]))) + else: + raise RuntimeError( + f"Invalid move flags, should be 6 columns, got {tmp}" + ) + + system["coords"] = [np.array(coord)] + system["orig"] = np.zeros(3) + atom_types = [] + for idx, ii in enumerate(system["atom_numbs"]): + for jj in range(ii): + atom_types.append(idx) + system["atom_types"] = np.array(atom_types, dtype=int) + system["cells"] = np.array(system["cells"]) + system["coords"] = np.array(system["coords"]) + if move_flags: + move_flags = np.array(move_flags, dtype=bool) + move_flags = move_flags.reshape((1, natoms, 3)) + system["move"] = np.array(move_flags, dtype=bool) + return system + + +def to_system_data(lines): + # remove the line that has 'selective dynamics' + selective_dynamics = False + if lines[7][0] == "S" or lines[7][0] == "s": + selective_dynamics = True + lines.pop(7) + is_cartesian = lines[7][0] in ["C", "c", "K", "k"] + if not is_cartesian: + if lines[7][0] not in ["d", "D"]: + raise RuntimeError( + "seem not to be a valid POSCAR of vasp 5.x, may be a POSCAR of vasp 4.x?" 
+ ) + return _to_system_data_lower(lines, is_cartesian, selective_dynamics) + + +def from_system_data(system, f_idx=0, skip_zeros=True): + ret = "" + for ii, name in zip(system["atom_numbs"], system["atom_names"]): + if ii == 0: + continue + ret += "%s%d " % (name, ii) # noqa: UP031 + ret += "\n" + ret += "1.0\n" + for ii in system["cells"][f_idx]: + for jj in ii: + ret += f"{jj:.16e} " + ret += "\n" + for idx, ii in enumerate(system["atom_names"]): + if system["atom_numbs"][idx] == 0: + continue + ret += f"{ii} " + ret += "\n" + for ii in system["atom_numbs"]: + if ii == 0: + continue + ret += "%d " % ii # noqa: UP031 + ret += "\n" + move = system.get("move", None) + if move is not None and len(move) > 0: + ret += "Selective Dynamics\n" + + # should use Cartesian for VESTA software + ret += "Cartesian\n" + atype = system["atom_types"] + posis = system["coords"][f_idx] + # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] + # sort_idx = np.argsort(atype, kind = 'mergesort') + sort_idx = np.lexsort((np.arange(len(atype)), atype)) + atype = atype[sort_idx] + posis = posis[sort_idx] + if move is not None and len(move) > 0: + move = move[f_idx][sort_idx] + + if isinstance(move, np.ndarray): + move = move.tolist() + + posi_list = [] + for idx in range(len(posis)): + ii_posi = posis[idx] + line = f"{ii_posi[0]:15.10f} {ii_posi[1]:15.10f} {ii_posi[2]:15.10f}" + if move is not None and len(move) > 0: + move_flags = move[idx] + if not isinstance(move_flags, list) or len(move_flags) != 3: + raise RuntimeError( + f"Invalid move flags: {move_flags}, should be a list of 3 bools" + ) + line += " " + " ".join("T" if flag else "F" for flag in move_flags) + + posi_list.append(line) + + posi_list.append("") + ret += "\n".join(posi_list) + return ret diff --git a/dpdata/formats/vasp/xml.py b/dpdata/formats/vasp/xml.py new file mode 100755 index 000000000..1b407c254 --- /dev/null +++ b/dpdata/formats/vasp/xml.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +from __future__ import 
annotations + +import xml.etree.ElementTree as ET +from typing import Any + +import numpy as np + + +def check_name(item, name): + assert item.attrib["name"] == name, ( + "item attrib '{}' dose not math required '{}'".format(item.attrib["name"], name) + ) + + +def get_varray(varray): + array = [] + for vv in varray.findall("v"): + array.append([float(ii) for ii in vv.text.split()]) + return np.array(array) + + +def analyze_atominfo(atominfo_xml): + check_name(atominfo_xml.find("array"), "atoms") + eles = [] + types = [] + visited = set() + for ii in atominfo_xml.find("array").find("set"): + atom_type = int(ii.findall("c")[1].text) + if atom_type not in visited: + eles.append(ii.findall("c")[0].text.strip()) + visited.add(atom_type) + types.append(atom_type) + return eles, types + + +def analyze_calculation( + cc: Any, + nelm: int | None, +) -> tuple[np.ndarray, np.ndarray, float, np.ndarray, np.ndarray | None, bool | None]: + """Analyze a calculation block. + + Parameters + ---------- + cc : xml.etree.ElementTree.Element + The xml element for a ion step calculation + nelm : Optional[int] + The number nelm, if it is not None, convergence check is performed. + + Returns + ------- + posi : np.ndarray + The positions + cell : np.ndarray + The cell + ener : float + The energy + force : np.ndarray + The forces + str : Optional[np.ndarray] + The stress + is_converged: Optional[bool] + If the scf calculation is converged. Only return boolean when + nelm is not None. Otherwise return None. 
+ + """ + structure_xml = cc.find("structure") + check_name(structure_xml.find("crystal").find("varray"), "basis") + check_name(structure_xml.find("varray"), "positions") + cell = get_varray(structure_xml.find("crystal").find("varray")) + posi = get_varray(structure_xml.find("varray")) + strs = None + is_converged = None + if nelm is not None: + niter = len(cc.findall(".//scstep")) + is_converged = niter < nelm + for vv in cc.findall("varray"): + if vv.attrib["name"] == "forces": + forc = get_varray(vv) + elif vv.attrib["name"] == "stress": + strs = get_varray(vv) + for ii in cc.find("energy").findall("i"): + if ii.attrib["name"] == "e_fr_energy": + ener = float(ii.text) + return posi, cell, ener, forc, strs, is_converged + + +def formulate_config(eles, types, posi, cell, ener, forc, strs_): + strs = strs_ / 1602 + natoms = len(types) + ntypes = len(eles) + ret = "" + ret += "#N %d %d\n" % (natoms, ntypes - 1) # noqa: UP031 + ret += "#C " + for ii in eles: + ret += " " + ii + ret += "\n" + ret += "##\n" + ret += f"#X {cell[0][0]:13.8f} {cell[0][1]:13.8f} {cell[0][2]:13.8f}\n" + ret += f"#Y {cell[1][0]:13.8f} {cell[1][1]:13.8f} {cell[1][2]:13.8f}\n" + ret += f"#Z {cell[2][0]:13.8f} {cell[2][1]:13.8f} {cell[2][2]:13.8f}\n" + ret += "#W 1.0\n" + ret += "#E %.10f\n" % (ener / natoms) + ret += f"#S {strs[0][0]:.9e} {strs[1][1]:.9e} {strs[2][2]:.9e} {strs[0][1]:.9e} {strs[1][2]:.9e} {strs[0][2]:.9e}\n" + ret += "#F\n" + for ii in range(natoms): + sp = np.matmul(cell.T, posi[ii]) + ret += "%d" % (types[ii] - 1) # noqa: UP031 + ret += f" {sp[0]:12.6f} {sp[1]:12.6f} {sp[2]:12.6f}" + ret += f" {forc[ii][0]:12.6f} {forc[ii][1]:12.6f} {forc[ii][2]:12.6f}" + ret += "\n" + return ret + + +def analyze(fname, type_idx_zero=False, begin=0, step=1, convergence_check=True): + """Deal with broken xml file.""" + all_posi = [] + all_cell = [] + all_ener = [] + all_forc = [] + all_strs = [] + cc = 0 + if convergence_check: + tree = ET.parse(fname) + root = tree.getroot() + parameters = 
root.find(".//parameters") + nelm = parameters.find(".//i[@name='NELM']") + # will check convergence + nelm = int(nelm.text) + else: + # not checking convergence + nelm = None + try: + for event, elem in ET.iterparse(fname): + if elem.tag == "atominfo": + eles, types = analyze_atominfo(elem) + types = np.array(types, dtype=int) + if type_idx_zero: + types = types - 1 + if elem.tag == "calculation": + posi, cell, ener, forc, strs, is_converged = analyze_calculation( + elem, nelm + ) + # record when not checking convergence or is_converged + # and the step criteria is satisfied + if ( + (nelm is None or is_converged) + and cc >= begin + and (cc - begin) % step == 0 + ): + all_posi.append(posi) + all_cell.append(cell) + all_ener.append(ener) + all_forc.append(forc) + if strs is not None: + all_strs.append(strs) + cc += 1 + except ET.ParseError: + return ( + eles, + types, + np.array(all_cell), + np.array(all_posi), + np.array(all_ener), + np.array(all_forc), + np.array(all_strs), + ) + return ( + eles, + types, + np.array(all_cell), + np.array(all_posi), + np.array(all_ener), + np.array(all_forc), + np.array(all_strs), + ) diff --git a/dpdata/formats/xyz/__init__.py b/dpdata/formats/xyz/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dpdata/formats/xyz/quip_gap_xyz.py b/dpdata/formats/xyz/quip_gap_xyz.py new file mode 100644 index 000000000..71e976de6 --- /dev/null +++ b/dpdata/formats/xyz/quip_gap_xyz.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# %% +from __future__ import annotations + +import re +from collections import OrderedDict + +import numpy as np + +from dpdata.periodic_table import Element + + +class QuipGapxyzSystems: + """deal with QuipGapxyzFile.""" + + def __init__(self, file_name): + self.file_object = open(file_name) + self.block_generator = self.get_block_generator() + + def __iter__(self): + return self + + def __next__(self): + return self.handle_single_xyz_frame(next(self.block_generator)) + + def __del__(self): + 
self.file_object.close() + + def get_block_generator(self): + p3 = re.compile(r"^\s*(\d+)\s*") + while True: + line = self.file_object.readline() + if not line: + break + if p3.match(line): + atom_num = int(p3.match(line).group(1)) + lines = [] + lines.append(line) + for ii in range(atom_num + 1): + lines.append(self.file_object.readline()) + if not lines[-1]: + raise RuntimeError( + f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" + ) + yield lines + + @staticmethod + def handle_single_xyz_frame(lines): + atom_num = int(lines[0].strip("\n").strip()) + if len(lines) != atom_num + 2: + raise RuntimeError( + f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" + ) + data_format_line = lines[1].strip("\n").strip() + " " + field_value_pattern = re.compile( + r"(?P\S+)=(?P[\'\"]?)(?P.*?)(?P=quote)\s+" + ) + prop_pattern = re.compile( + r"(?P\w+?):(?P[a-zA-Z]):(?P\d+)" + ) + + data_format_list = [ + kv_dict.groupdict() + for kv_dict in field_value_pattern.finditer(data_format_line) + ] + field_dict = {} + for item in data_format_list: + field_dict[item["key"]] = item["value"] + + Properties = field_dict["Properties"] + prop_list = [ + kv_dict.groupdict() for kv_dict in prop_pattern.finditer(Properties) + ] + + data_lines = [] + for line in lines[2:]: + data_lines.append(list(filter(bool, line.strip().split()))) + data_array = np.array(data_lines) + used_colomn = 0 + + type_array = None + coords_array = None + Z_array = None + force_array = None + virials = None + for kv_dict in prop_list: + if kv_dict["key"] == "species": + if kv_dict["datatype"] != "S": + raise RuntimeError( + "datatype for species must be 'S' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + type_array = data_array[ + :, used_colomn : used_colomn + field_length + ].flatten() + used_colomn += field_length + continue + elif kv_dict["key"] == "pos": + if kv_dict["datatype"] != "R": + raise RuntimeError( + "datatype for pos 
must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + coords_array = data_array[:, used_colomn : used_colomn + field_length] + used_colomn += field_length + continue + elif kv_dict["key"] == "Z": + if kv_dict["datatype"] != "I": + raise RuntimeError( + "datatype for pos must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + Z_array = data_array[ + :, used_colomn : used_colomn + field_length + ].flatten() + used_colomn += field_length + continue + elif kv_dict["key"] == "force": + if kv_dict["datatype"] != "R": + raise RuntimeError( + "datatype for pos must be 'R' instead of {}".format( + kv_dict["datatype"] + ) + ) + field_length = int(kv_dict["value"]) + force_array = data_array[:, used_colomn : used_colomn + field_length] + used_colomn += field_length + continue + else: + raise RuntimeError("unknown field {}".format(kv_dict["key"])) + + type_num_dict = OrderedDict() + atom_type_list = [] + type_map = {} + temp_atom_max_index = 0 + if type_array is None: + raise RuntimeError("type_array can't be None type, check .xyz file") + for ii in type_array: + if ii not in type_map: + type_map[ii] = temp_atom_max_index + temp_atom_max_index += 1 + temp_atom_index = type_map[ii] + atom_type_list.append(temp_atom_index) + type_num_dict[ii] = 1 + else: + temp_atom_index = type_map[ii] + atom_type_list.append(temp_atom_index) + type_num_dict[ii] += 1 + type_num_list = [] + for atom_type, atom_num in type_num_dict.items(): + type_num_list.append((atom_type, atom_num)) + type_num_array = np.array(type_num_list) + if field_dict.get("virial", None): + virials = np.array( + [ + np.array( + list(filter(bool, field_dict["virial"].split(" "))) + ).reshape(3, 3) + ] + ).astype(np.float64) + else: + virials = None + + info_dict = {} + info_dict["atom_names"] = list(type_num_array[:, 0]) + info_dict["atom_numbs"] = list(type_num_array[:, 1].astype(int)) + info_dict["atom_types"] = 
np.array(atom_type_list).astype(int) + info_dict["cells"] = np.array( + [ + np.array(list(filter(bool, field_dict["Lattice"].split(" ")))).reshape( + 3, 3 + ) + ] + ).astype(np.float64) + info_dict["coords"] = np.array([coords_array]).astype(np.float64) + info_dict["energies"] = np.array([field_dict["energy"]]).astype(np.float64) + info_dict["forces"] = np.array([force_array]).astype(np.float64) + if virials is not None: + info_dict["virials"] = virials + info_dict["orig"] = np.zeros(3) + return info_dict + + +def format_single_frame(data, frame_idx): + """Format a single frame of system data into QUIP/GAP XYZ format lines. + + Parameters + ---------- + data : dict + system data + frame_idx : int + frame index + + Returns + ------- + list[str] + lines for the frame + """ + # Number of atoms + natoms = len(data["atom_types"]) + + # Build header line with metadata + header_parts = [] + + # Energy + energy = data["energies"][frame_idx] + header_parts.append(f"energy={energy:.12e}") + + # Virial (if present) + if "virials" in data: + virial = data["virials"][frame_idx] + virial_str = " ".join(f"{v:.12e}" for v in virial.flatten()) + header_parts.append(f'virial="{virial_str}"') + + # Lattice + cell = data["cells"][frame_idx] + lattice_str = " ".join(f"{c:.12e}" for c in cell.flatten()) + header_parts.append(f'Lattice="{lattice_str}"') + + # Properties + header_parts.append("Properties=species:S:1:pos:R:3:Z:I:1:force:R:3") + + header_line = " ".join(header_parts) + + # Format atom lines + atom_lines = [] + coords = data["coords"][frame_idx] + forces = data["forces"][frame_idx] + atom_names = np.array(data["atom_names"]) + atom_types = data["atom_types"] + + for i in range(natoms): + atom_type_idx = atom_types[i] + species = atom_names[atom_type_idx] + x, y, z = coords[i] + fx, fy, fz = forces[i] + atomic_number = Element(species).Z + + atom_line = f"{species} {x:.11e} {y:.11e} {z:.11e} {atomic_number} {fx:.11e} {fy:.11e} {fz:.11e}" + atom_lines.append(atom_line) + + # 
Combine all lines for this frame + frame_lines = [str(natoms), header_line] + atom_lines + return frame_lines diff --git a/dpdata/formats/xyz/xyz.py b/dpdata/formats/xyz/xyz.py new file mode 100644 index 000000000..0c36ac32b --- /dev/null +++ b/dpdata/formats/xyz/xyz.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import numpy as np + + +def coord_to_xyz(coord: np.ndarray, types: list) -> str: + """Convert coordinates and types to xyz format. + + Parameters + ---------- + coord : np.ndarray + coordinates, Nx3 array + types : list + list of types + + Returns + ------- + str + xyz format string + + Examples + -------- + >>> coord_to_xyz(np.ones((1,3)), ["C"]) + 1 + + C 1.000000 1.000000 1.000000 + """ + buff = [str(len(types)), ""] + for at, cc in zip(types, coord): + buff.append("{} {:.6f} {:.6f} {:.6f}".format(at, *cc)) + return "\n".join(buff) + + +def xyz_to_coord(xyz: str) -> tuple[np.ndarray, list]: + """Convert xyz format to coordinates and types. + + Parameters + ---------- + xyz : str + xyz format string + + Returns + ------- + coords : np.ndarray + coordinates, Nx3 array + types : list + list of types + """ + symbols = [] + coords = [] + for ii, line in enumerate(xyz.split("\n")): + if ii == 0: + natoms = int(line.strip()) + elif 2 <= ii <= 1 + natoms: + # symbol x y z + symbol, x, y, z = line.split() + coords.append((float(x), float(y), float(z))) + symbols.append(symbol) + return np.array(coords), symbols diff --git a/dpdata/gaussian/__init__.py b/dpdata/gaussian/__init__.py index e69de29bb..7b2ae19b5 100644 --- a/dpdata/gaussian/__init__.py +++ b/dpdata/gaussian/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.gaussian import * # noqa: F403 diff --git a/dpdata/gaussian/fchk.py b/dpdata/gaussian/fchk.py index 816a999ce..b41d94ec1 100644 --- a/dpdata/gaussian/fchk.py +++ b/dpdata/gaussian/fchk.py @@ -1,175 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - 
-from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..periodic_table import ELEMENTS -from ..unit import ( - EnergyConversion, - ForceConversion, - HessianConversion, - LengthConversion, -) - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() -hessian_convert = HessianConversion("hartree/bohr^2", "eV/angstrom^2").value() - - -def create_full_hessian(hessian_raw: list | np.ndarray, natoms: int) -> np.ndarray: - """ - Reconstructs the full, symmetric Hessian matrix from a 1D array - containing its lower triangular elements. - - Args: - hessian_raw (list | np.ndarray): A 1D list or NumPy array containing the - lower triangular elements (including the - diagonal) of the Hessian matrix. - natoms (int): The number of atoms in the system. - - Returns - ------- - np.ndarray: A full, symmetric (3*natoms, 3*natoms) Hessian matrix. - - Raises - ------ - ValueError: If the number of elements in `hessian_raw` does not match - the expected number for the lower triangle of a - (3*natoms, 3*natoms) matrix. 
- """ - # Convert input to a NumPy array in case it's a list - hessian_block = np.array(hessian_raw) - - # Calculate the dimension of the final matrix - dim = 3 * natoms - - # Validate that the input data has the correct length - # A lower triangle of an n x n matrix has n*(n+1)/2 elements - expected_length = dim * (dim + 1) // 2 - if hessian_block.size != expected_length: - raise ValueError( - f"Input length {hessian_block.size} != expected {expected_length}" - ) - - # Create a zero matrix, then fill the lower triangle - hessian_full = np.zeros((dim, dim), dtype=hessian_block.dtype) - lower_triangle_indices = np.tril_indices(dim) - hessian_full[lower_triangle_indices] = hessian_block - - # This is done by copying the lower triangle to the upper triangle - # M_full = M_lower + M_lower.T - diag(M_lower) - hessian_full = hessian_full + hessian_full.T - np.diag(np.diag(hessian_full)) - - return hessian_full - - -def to_system_data(file_name: FileType, has_forces=True, has_hessian=True): - """Read Gaussian fchk file. 
- - Parameters - ---------- - file_name : str - file name - has_forces : bool, default True - whether to read force - Note: Cartesian Gradient in fchk file is converted to forces by taking negative sign - has_hessian : bool, default True - whether to read hessian - - Returns - ------- - data : dict - system data, including hessian if has_hessian is True - """ - data = {} - natoms = 0 - atom_numbers = [] - coords_t = [] - energy_t = [] - forces_t = [] - hessian_t = [] - # Read fchk file - with open_file(file_name) as fp: - for line in fp: - if isinstance(line, bytes): - line = line.decode(errors="ignore") - if "Number of atoms" in line: - natoms = int(line.split()[-1]) - elif "Atomic numbers" in line and "I" in line: - n = int(line.split()[-1]) - atom_numbers = [] - while len(atom_numbers) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - atom_numbers += [int(x) for x in next_line.split()] - elif "Current cartesian coordinates" in line and "R" in line: - n = int(line.split()[-1]) - coords_raw = [] - while len(coords_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - coords_raw += [float(x) for x in next_line.split()] - coords = np.array(coords_raw).reshape(-1, 3) * length_convert - coords_t.append(coords) - elif "Total Energy" in line: - energy = float(line.split()[-1]) * energy_convert - energy_t.append(energy) - elif "Cartesian Gradient" in line: - n = int(line.split()[-1]) - forces_raw = [] - while len(forces_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - forces_raw += [float(x) for x in next_line.split()] - # Cartesian Gradient is the negative of forces: F = -∇E - forces = -np.array(forces_raw).reshape(-1, 3) * force_convert - forces_t.append(forces) - elif "Cartesian Force Constants" in line and "R" in line: - n = int(line.split()[-1]) - hessian_raw = [] - while 
len(hessian_raw) < n: - next_line = next(fp) - if isinstance(next_line, bytes): - next_line = next_line.decode(errors="ignore") - hessian_raw += [float(x) for x in next_line.split()] - hessian_full = ( - create_full_hessian(hessian_raw, natoms) * hessian_convert - ) - # store as (natoms, 3, natoms, 3) to align with registered shape - hessian_t.append(hessian_full.reshape(natoms, 3, natoms, 3)) - # Assert key data - assert coords_t, "cannot find coords" - assert energy_t, "cannot find energy" - if has_forces: - assert forces_t, "cannot find forces" - if has_hessian: - assert hessian_t, "cannot find hessian" - # Assemble data - atom_symbols = [ELEMENTS[z - 1] for z in atom_numbers] - atom_names, atom_types, atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - data["atom_types"] = atom_types - data["coords"] = np.array(coords_t).reshape(-1, natoms, 3) - data["orig"] = np.zeros(3) - data["cells"] = np.array([np.eye(3) * 100]) - data["nopbc"] = True - if energy_t: - data["energies"] = np.array(energy_t) - if has_forces and forces_t: - data["forces"] = np.array(forces_t) - if has_hessian and hessian_t: - data["hessian"] = np.array(hessian_t) - return data +from dpdata.formats.gaussian.fchk import * # noqa: F403 diff --git a/dpdata/gaussian/gjf.py b/dpdata/gaussian/gjf.py index 419ec354c..c0e3600f2 100644 --- a/dpdata/gaussian/gjf.py +++ b/dpdata/gaussian/gjf.py @@ -1,335 +1,3 @@ -# The initial code of this file is based on -# https://github.com/deepmodeling/dpgen/blob/0767dce7cad29367edb2e4a55fd0d8724dbda642/dpgen/generator/lib/gaussian.py#L1-L190 -# under LGPL 3.0 license -"""Generate Gaussian input file.""" - from __future__ import annotations -import itertools -import re -import uuid -import warnings - -import numpy as np - -from dpdata.periodic_table import Element - - -def _crd2frag(symbols: list[str], crds: np.ndarray) -> tuple[int, list[int]]: - 
"""Detect fragments from coordinates. - - Parameters - ---------- - symbols : list[str] - element symbols; virtual elements are not supported - crds : np.ndarray - atomic coordinates, shape: (N, 3) - - Returns - ------- - frag_numb : int - number of fragments - frag_index : list[int] - frament index that each atom belongs to - - Notes - ----- - In this method, Open Babel is used to detect bond connectivity. The threshold - is the sum of covalent radii with a slight tolerance (0.45 A). Note that - this threshold has errors. - - PBC support is removed from this method as Gaussian does not support PBC calculation. - - Raises - ------ - ImportError - if Open Babel is not installed - """ - from scipy.sparse import csr_matrix - from scipy.sparse.csgraph import connected_components - - try: - from openbabel import openbabel - except ImportError: - import openbabel - atomnumber = len(symbols) - # Use openbabel to connect atoms - mol = openbabel.OBMol() - mol.BeginModify() - for idx, (symbol, position) in enumerate(zip(symbols, crds.astype(np.float64))): - num = Element(symbol).Z - atom = mol.NewAtom(idx) - atom.SetAtomicNum(int(num)) - atom.SetVector(*position) - mol.ConnectTheDots() - mol.PerceiveBondOrders() - mol.EndModify() - bonds = [] - for ii in range(mol.NumBonds()): - bond = mol.GetBond(ii) - a = bond.GetBeginAtom().GetId() - b = bond.GetEndAtom().GetId() - bo = bond.GetBondOrder() - bonds.extend([[a, b, bo], [b, a, bo]]) - bonds = np.array(bonds, ndmin=2).reshape((-1, 3)) - graph = csr_matrix( - (bonds[:, 2], (bonds[:, 0], bonds[:, 1])), shape=(atomnumber, atomnumber) - ) - frag_numb, frag_index = connected_components(graph, 0) - return frag_numb, frag_index - - -def detect_multiplicity(symbols: np.ndarray) -> int: - """Find the minimal multiplicity of the given molecules. 
- - Parameters - ---------- - symbols : np.ndarray - element symbols; virtual elements are not supported - - Returns - ------- - int - spin multiplicity - """ - # currently only support charge=0 - # oxygen -> 3 - if np.count_nonzero(symbols == ["O"]) == 2 and symbols.size == 2: - return 3 - # calculates the total number of electrons, assumes they are paired as much as possible - n_total = sum([Element(s).Z for s in symbols]) - return n_total % 2 + 1 - - -def make_gaussian_input( - sys_data: dict, - keywords: str | list[str], - multiplicity: str | int = "auto", - charge: int = 0, - fragment_guesses: bool = False, - basis_set: str | None = None, - keywords_high_multiplicity: str | None = None, - nproc: int = 1, -) -> str: - """Make gaussian input file. - - Parameters - ---------- - sys_data : dict - system data - keywords : str or list[str] - Gaussian keywords, e.g. force b3lyp/6-31g**. If a list, - run multiple steps - multiplicity : str or int, default=auto - spin multiplicity state. It can be a number. If auto, - multiplicity will be detected automatically, with the - following rules: - fragment_guesses=True - multiplicity will +1 for each radical, and +2 - for each oxygen molecule - fragment_guesses=False - multiplicity will be 1 or 2, but +2 for each - oxygen molecule - charge : int, default=0 - molecule charge. Only used when charge is not provided - by the system - fragment_guesses : bool, default=False - initial guess generated from fragment guesses. If True, - multiplicity should be auto - basis_set : str, default=None - custom basis set - keywords_high_multiplicity : str, default=None - keywords for points with multiple raicals. multiplicity - should be auto. 
If not set, fallback to normal keywords - nproc : int, default=1 - Number of CPUs to use - - Returns - ------- - str - gjf output string - """ - coordinates = sys_data["coords"][0] - atom_names = sys_data["atom_names"] - atom_numbs = sys_data["atom_numbs"] - atom_types = sys_data["atom_types"] - # get atom symbols list - symbols = [atom_names[atom_type] for atom_type in atom_types] - - # assume default charge is zero and default spin multiplicity is 1 - if "charge" in sys_data.keys(): - charge = sys_data["charge"] - - use_fragment_guesses = False - if isinstance(multiplicity, int): - mult_auto = False - elif multiplicity == "auto": - mult_auto = True - else: - raise RuntimeError('The keyword "multiplicity" is illegal.') - - if fragment_guesses: - # Initial guess generated from fragment guesses - # New feature of Gaussian 16 - use_fragment_guesses = True - if not mult_auto: - warnings.warn("Automatically set multiplicity to auto!") - mult_auto = True - - if mult_auto: - frag_numb, frag_index = _crd2frag(symbols, coordinates) - if frag_numb == 1: - use_fragment_guesses = False - mult_frags = [] - for i in range(frag_numb): - idx = frag_index == i - mult_frags.append(detect_multiplicity(np.array(symbols)[idx])) - if use_fragment_guesses: - multiplicity = sum(mult_frags) - frag_numb + 1 - charge % 2 - chargekeywords_frag = "%d %d" % (charge, multiplicity) + "".join( # noqa: UP031 - [" %d %d" % (charge, mult_frag) for mult_frag in mult_frags] # noqa: UP031 - ) - else: - multi_frags = np.array(mult_frags) - multiplicity = ( - 1 - + np.count_nonzero(multi_frags == 2) % 2 - + np.count_nonzero(multi_frags == 3) * 2 - - charge % 2 - ) - - if ( - keywords_high_multiplicity is not None - and np.count_nonzero(multi_frags == 2) >= 2 - ): - # at least 2 radicals - keywords = keywords_high_multiplicity - - if isinstance(keywords, str): - keywords = [keywords] - else: - keywords = keywords.copy() - - buff = [] - # keywords, e.g., force b3lyp/6-31g** - if use_fragment_guesses: - 
keywords[0] = f"{keywords[0]} guess=fragment={frag_numb}" - - chkkeywords = [] - if len(keywords) > 1: - chkkeywords.append(f"%chk={str(uuid.uuid1())}.chk") - - nprockeywords = f"%nproc={nproc:d}" - # use formula as title - titlekeywords = "".join( - [f"{symbol}{numb}" for symbol, numb in zip(atom_names, atom_numbs)] - ) - chargekeywords = f"{charge} {multiplicity}" - - buff = [ - *chkkeywords, - nprockeywords, - f"#{keywords[0]}", - "", - titlekeywords, - "", - (chargekeywords_frag if use_fragment_guesses else chargekeywords), - ] - - for ii, (symbol, coordinate) in enumerate(zip(symbols, coordinates)): - if use_fragment_guesses: - buff.append( - "%s(Fragment=%d) %f %f %f" % (symbol, frag_index[ii] + 1, *coordinate) # noqa: UP031 - ) - else: - buff.append("{} {:f} {:f} {:f}".format(symbol, *coordinate)) # noqa: UP031 - if not sys_data.get("nopbc", False): - # PBC condition - cell = sys_data["cells"][0] - for ii in range(3): - # use TV as atomic symbol, see https://gaussian.com/pbc/ - buff.append("TV {:f} {:f} {:f}".format(*cell[ii])) - if basis_set is not None: - # custom basis set - buff.extend(["", basis_set, ""]) - for kw in itertools.islice(keywords, 1, None): - buff.extend( - [ - "\n--link1--", - *chkkeywords, - nprockeywords, - f"#{kw}", - "", - titlekeywords, - "", - chargekeywords, - "", - ] - ) - buff.append("\n") - return "\n".join(buff) - - -def read_gaussian_input(inp: str): - """Read Gaussian input. 
- - Parameters - ---------- - inp : str - Gaussian input str - - Returns - ------- - dict - system data - """ - flag = 0 - coords = [] - elements = [] - cells = [] - for line in inp.split("\n"): - if not line.strip(): - # empty line - flag += 1 - elif flag == 0: - # keywords - if line.startswith("#"): - # setting - keywords = line.split() - elif line.startswith("%"): - pass - elif flag == 1: - # title - pass - elif flag == 2: - # multi and coords - s = line.split() - if len(s) == 2: - pass - elif len(s) == 4: - if s[0] == "TV": - cells.append(list(map(float, s[1:4]))) - else: - # element - elements.append(re.sub("\\(.*?\\)|\\{.*?}|\\[.*?]", "", s[0])) - coords.append(list(map(float, s[1:4]))) - elif flag == 3: - # end - break - atom_names, atom_types, atom_numbs = np.unique( - elements, return_inverse=True, return_counts=True - ) - if len(cells): - nopbc = False - else: - nopbc = True - cells = np.array([np.eye(3)]) * 100 - return { - "atom_names": list(atom_names), - "atom_numbs": list(atom_numbs), - "atom_types": atom_types, - "cells": np.array(cells).reshape(1, 3, 3), - "nopbc": nopbc, - "coords": np.array(coords).reshape(1, -1, 3), - "orig": np.zeros(3), - } +from dpdata.formats.gaussian.gjf import * # noqa: F403 diff --git a/dpdata/gaussian/log.py b/dpdata/gaussian/log.py index 08a65b9dc..750343df4 100644 --- a/dpdata/gaussian/log.py +++ b/dpdata/gaussian/log.py @@ -1,136 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..periodic_table import ELEMENTS -from ..unit import EnergyConversion, ForceConversion, LengthConversion - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - -symbols = ["X"] + ELEMENTS - - -def to_system_data(file_name: FileType, md=False): - """Read 
Gaussian log file. - - Parameters - ---------- - file_name : str - file name - md : bool, default False - whether to read multiple frames - - Returns - ------- - data : dict - system data - - Raises - ------ - RuntimeError - if the input orientation is not found - """ - data = {} - # read from log lines - flag = 0 - energy_t = [] - coords_t = [] - atom_symbols = [] - forces_t = [] - cells_t = [] - nopbc = True - coords = None - - with open_file(file_name) as fp: - for line in fp: - if line.startswith(" SCF Done"): - # energies - energy = float(line.split()[4]) - elif line.startswith( - " Center Atomic Forces (Hartrees/Bohr)" - ): - flag = 1 - forces = [] - elif line.startswith( - " Input orientation:" - ) or line.startswith(" Z-Matrix orientation:"): - flag = 5 - coords = [] - atom_symbols = [] - cells = [] - - if 1 <= flag <= 3 or 5 <= flag <= 9: - flag += 1 - elif flag == 4: - # forces - if line.startswith(" -------"): - if coords is None: - raise RuntimeError( - "Input orientation is not found. Using Gaussian keyword " - "`Geom=PrintInputOrient` to always print the input orientation. " - "See https://gaussian.com/geom/ for more details." 
- ) - forces_t.append(forces) - energy_t.append(energy) - coords_t.append(coords) - if cells: - nopbc = False - cells_t.append(cells) - else: - cells_t.append( - [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] - ) - flag = 0 - coords = None - else: - s = line.split() - if line[14:16] == "-2": - # PBC - pass - else: - forces.append( - [float(line[23:38]), float(line[38:53]), float(line[53:68])] - ) - elif flag == 10: - # atom_symbols and coords - if line.startswith(" -------"): - flag = 0 - else: - s = line.split() - if int(s[1]) == -2: - # PBC cells, see https://gaussian.com/pbc/ - cells.append([float(x) for x in s[3:6]]) - else: - coords.append([float(x) for x in s[3:6]]) - atom_symbols.append(symbols[int(s[1])]) - - assert coords_t, "cannot find coords" - assert energy_t, "cannot find energies" - assert forces_t, "cannot find forces" - - atom_names, data["atom_types"], atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - data["atom_names"] = list(atom_names) - data["atom_numbs"] = list(atom_numbs) - if not md: - forces_t = forces_t[-1:] - energy_t = energy_t[-1:] - coords_t = coords_t[-1:] - cells_t = cells_t[-1:] - data["forces"] = np.array(forces_t) * force_convert - data["energies"] = np.array(energy_t) * energy_convert - data["coords"] = np.array(coords_t) - data["orig"] = np.array([0, 0, 0]) - data["cells"] = np.array(cells_t) - data["nopbc"] = nopbc - return data +from dpdata.formats.gaussian.log import * # noqa: F403 diff --git a/dpdata/gromacs/__init__.py b/dpdata/gromacs/__init__.py index e69de29bb..7251bb787 100644 --- a/dpdata/gromacs/__init__.py +++ b/dpdata/gromacs/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.gromacs import * # noqa: F403 diff --git a/dpdata/gromacs/gro.py b/dpdata/gromacs/gro.py index fe83e0c5c..8878ce8ff 100644 --- a/dpdata/gromacs/gro.py +++ b/dpdata/gromacs/gro.py @@ -1,112 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations 
-import re -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..unit import LengthConversion - -nm2ang = LengthConversion("nm", "angstrom").value() -ang2nm = LengthConversion("angstrom", "nm").value() -cell_idx_gmx2dp = [0, 4, 8, 1, 2, 3, 5, 6, 7] - - -def _format_atom_name(atom_name): - patt = re.compile("[a-zA-Z]*") - match = re.search(patt, atom_name) - fmt_name = match.group().capitalize() - return fmt_name - - -def _get_line(line, fmt_atom_name=True): - atom_name = line[10:15].split()[0] - if fmt_atom_name: - atom_name = _format_atom_name(atom_name) - atom_idx = int(line[15:20].split()[0]) - posis = [float(line[ii : ii + 8]) for ii in range(20, 44, 8)] - posis = np.array(posis) * nm2ang - return atom_name, atom_idx, posis - - -def _get_cell(line): - cell = np.zeros([3, 3]) - lengths = [float(ii) for ii in line.split()] - if len(lengths) >= 3: - for dd in range(3): - cell[dd][dd] = lengths[dd] - else: - raise RuntimeError("wrong box format: ", line) - if len(lengths) == 9: - cell[0][1] = lengths[3] - cell[0][2] = lengths[4] - cell[1][0] = lengths[5] - cell[1][2] = lengths[6] - cell[2][0] = lengths[7] - cell[2][1] = lengths[8] - cell = cell * nm2ang - return cell - - -def file_to_system_data(fname: FileType, format_atom_name=True, **kwargs): - system = {"coords": [], "cells": []} - with open_file(fname) as fp: - frame = 0 - while True: - flag = fp.readline() - if not flag: - break - else: - frame += 1 - names = [] - idxs = [] - posis = [] - natoms = int(fp.readline()) - for ii in range(natoms): - n, i, p = _get_line(fp.readline(), fmt_atom_name=format_atom_name) - names.append(n) - idxs.append(i) - posis.append(p) - cell = _get_cell(fp.readline()) - posis = np.array(posis) - if frame == 1: - system["orig"] = np.zeros(3) - system["atom_names"] = list(set(names)) - system["atom_numbs"] = [ - names.count(ii) for ii in system["atom_names"] - ] - 
system["atom_types"] = [ - system["atom_names"].index(ii) for ii in names - ] - system["atom_types"] = np.array(system["atom_types"], dtype=int) - system["coords"].append(posis) - system["cells"].append(cell) - system["coords"] = np.array(system["coords"]) - system["cells"] = np.array(system["cells"]) - return system - - -def from_system_data(system, f_idx=0, **kwargs): - resname = kwargs.get("resname", "MOL") - shift = kwargs.get("shift", 0) - ret = "" - ret += " molecule" + "\n" - n_atoms = sum(system["atom_numbs"]) - ret += " " + str(n_atoms) + "\n" - for i in range(n_atoms): - atom_type = system["atom_types"][i] - atom_name = system["atom_names"][atom_type] - coords = system["coords"][f_idx] * ang2nm - ret += "{:>5d}{:<5s}{:>5s}{:5d}{:8.3f}{:8.3f}{:8.3f}\n".format( - 1, resname, atom_name, i + shift + 1, *tuple(coords[i]) - ) - cell = (system["cells"][f_idx].flatten() * ang2nm)[cell_idx_gmx2dp] - ret += " " + " ".join([f"{x:.3f}" for x in cell]) - - return ret +from dpdata.formats.gromacs.gro import * # noqa: F403 diff --git a/dpdata/lammps/__init__.py b/dpdata/lammps/__init__.py index e69de29bb..3069f9b62 100644 --- a/dpdata/lammps/__init__.py +++ b/dpdata/lammps/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.lammps import * # noqa: F403 diff --git a/dpdata/lammps/dump.py b/dpdata/lammps/dump.py index 89e75e4de..0ac2b31d7 100644 --- a/dpdata/lammps/dump.py +++ b/dpdata/lammps/dump.py @@ -1,433 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import os -import sys -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -lib_path = os.path.dirname(os.path.realpath(__file__)) -sys.path.append(lib_path) -import warnings - -import lmp - - -class UnwrapWarning(UserWarning): - pass - - -warnings.simplefilter("once", UnwrapWarning) - - -def _get_block(lines, key): - for idx in range(len(lines)): - if ("ITEM: " + 
key) in lines[idx]: - break - idx_s = idx + 1 - for idx in range(idx_s, len(lines)): - if ("ITEM: ") in lines[idx]: - break - idx_e = idx - if idx_e == len(lines) - 1: - idx_e += 1 - return lines[idx_s:idx_e], lines[idx_s - 1] - - -def get_atype(lines, type_idx_zero=False): - blk, head = _get_block(lines, "ATOMS") - keys = head.split() - id_idx = keys.index("id") - 2 - tidx = keys.index("type") - 2 - atype = [] - for ii in blk: - atype.append([int(ii.split()[id_idx]), int(ii.split()[tidx])]) - atype.sort() - atype = np.array(atype, dtype=int) - if type_idx_zero: - return atype[:, 1] - 1 - else: - return atype[:, 1] - - -def get_natoms(lines): - blk, head = _get_block(lines, "NUMBER OF ATOMS") - return int(blk[0]) - - -def get_natomtypes(lines): - atype = get_atype(lines) - return max(atype) - - -def get_natoms_vec(lines): - atype = get_atype(lines) - natoms_vec = [] - natomtypes = get_natomtypes(lines) - for ii in range(natomtypes): - natoms_vec.append(sum(atype == ii + 1)) - assert sum(natoms_vec) == get_natoms(lines) - return natoms_vec - - -def get_coordtype_and_scalefactor(keys): - # 4 types in total,with different scaling factor - key_pc = ["x", "y", "z"] # plain cartesian, sf = 1 - key_uc = ["xu", "yu", "zu"] # unwraped cartesian, sf = 1 - key_s = ["xs", "ys", "zs"] # scaled by lattice parameter, sf = lattice parameter - key_su = ["xsu", "ysu", "zsu"] # scaled and unfolded,sf = lattice parameter - lmp_coor_type = [key_pc, key_uc, key_s, key_su] - sf = [0, 0, 1, 1] - uw = [0, 1, 0, 1] # unwraped or not - for k in range(4): - if all(i in keys for i in lmp_coor_type[k]): - return lmp_coor_type[k], sf[k], uw[k] - - -def safe_get_posi(lines, cell, orig=np.zeros(3), unwrap=False): - blk, head = _get_block(lines, "ATOMS") - keys = head.split() - coord_tp_and_sf = get_coordtype_and_scalefactor(keys) - assert coord_tp_and_sf is not None, "Dump file does not contain atomic coordinates!" 
- coordtype, sf, uw = coord_tp_and_sf - id_idx = keys.index("id") - 2 - xidx = keys.index(coordtype[0]) - 2 - yidx = keys.index(coordtype[1]) - 2 - zidx = keys.index(coordtype[2]) - 2 - posis = [] - for ii in blk: - words = ii.split() - posis.append( - [ - float(words[id_idx]), - float(words[xidx]), - float(words[yidx]), - float(words[zidx]), - ] - ) - posis.sort() - posis = np.array(posis)[:, 1:4] - if not sf: - posis = (posis - orig) @ np.linalg.inv( - cell - ) # Convert to scaled coordinates for unscaled coordinates - if uw and unwrap: - return ( - posis @ cell - ) # convert scaled coordinates back to Cartesien coordinates unwrap at the periodic boundaries - else: - if uw and not unwrap: - warnings.warn( - message="Your dump file contains unwrapped coordinates, but you did not specify unwrapping (unwrap = True). The default is wrapping at periodic boundaries (unwrap = False).\n", - category=UnwrapWarning, - ) - return ( - (posis % 1) @ cell - ) # Convert scaled coordinates back to Cartesien coordinates with wraping at periodic boundary conditions - - -def get_dumpbox(lines): - blk, h = _get_block(lines, "BOX BOUNDS") - bounds = np.zeros([3, 2]) - tilt = np.zeros([3]) - load_tilt = "xy xz yz" in h - for dd in range(3): - info = [float(jj) for jj in blk[dd].split()] - bounds[dd][0] = info[0] - bounds[dd][1] = info[1] - if load_tilt: - tilt[dd] = info[2] - return bounds, tilt - - -def dumpbox2box(bounds, tilt): - xy = tilt[0] - xz = tilt[1] - yz = tilt[2] - xlo = bounds[0][0] - min(0.0, xy, xz, xy + xz) - xhi = bounds[0][1] - max(0.0, xy, xz, xy + xz) - ylo = bounds[1][0] - min(0.0, yz) - yhi = bounds[1][1] - max(0.0, yz) - zlo = bounds[2][0] - zhi = bounds[2][1] - info = [[xlo, xhi], [ylo, yhi], [zlo, zhi]] - return lmp.lmpbox2box(info, tilt) - - -def box2dumpbox(orig, box): - lohi, tilt = lmp.box2lmpbox(orig, box) - xy = tilt[0] - xz = tilt[1] - yz = tilt[2] - bounds = np.zeros([3, 2]) - bounds[0][0] = lohi[0][0] + min(0.0, xy, xz, xy + xz) - bounds[0][1] = 
lohi[0][1] + max(0.0, xy, xz, xy + xz) - bounds[1][0] = lohi[1][0] + min(0.0, yz) - bounds[1][1] = lohi[1][1] + max(0.0, yz) - bounds[2][0] = lohi[2][0] - bounds[2][1] = lohi[2][1] - return bounds, tilt - - -def load_file(fname: FileType, begin=0, step=1): - lines = [] - buff = [] - cc = -1 - with open_file(fname) as fp: - while True: - line = fp.readline().rstrip("\n") - if not line: - if cc >= begin and (cc - begin) % step == 0: - lines += buff - buff = [] - cc += 1 - return lines - if "ITEM: TIMESTEP" in line: - if cc >= begin and (cc - begin) % step == 0: - lines += buff - buff = [] - cc += 1 - if cc >= begin and (cc - begin) % step == 0: - buff.append(line) - - -def get_spin_keys(inputfile): - """ - Read input file and get the keys for spin info in dump. - - Parameters - ---------- - inputfile : str - Path to the input file. - - Returns - ------- - list or None - List of spin info keys if found, None otherwise. - """ - if inputfile is None: - return None - - if not os.path.isfile(inputfile): - warnings.warn(f"Input file {inputfile} not found.") - return None - - with open(inputfile) as f: - for line in f.readlines(): - ls = line.split() - if ( - len(ls) > 7 - and ls[0] == "compute" - and all(key in ls for key in ["sp", "spx", "spy", "spz"]) - ): - compute_name = ls[1] - return [ - f"c_{compute_name}[{ls.index(key) - 3}]" - for key in ["sp", "spx", "spy", "spz"] - ] - - return None - - -def get_spin(lines, spin_keys): - """ - Get the spin info from the dump file. - - Parameters - ---------- - lines : list - The content of the dump file. - spin_keys : list - The keys for spin info in dump file. 
- the spin info is stored in sp, spx, spy, spz or spin_keys, which is the spin norm and the spin vector - 1 1 0.00141160 5.64868599 0.01005602 1.54706291 0.00000000 0.00000000 1.00000000 -1.40772100 -2.03739417 -1522.64797384 -0.00397809 -0.00190426 -0.00743976 - """ - blk, head = _get_block(lines, "ATOMS") - heads = head.split() - - if spin_keys is not None and all(i in heads for i in spin_keys): - key = spin_keys - else: - return None - - try: - idx_id = heads.index("id") - 2 - idx_sp, idx_spx, idx_spy, idx_spz = (heads.index(k) - 2 for k in key) - - norm = [] - vec = [] - atom_ids = [] - for line in blk: - words = line.split() - norm.append([float(words[idx_sp])]) - vec.append( - [float(words[idx_spx]), float(words[idx_spy]), float(words[idx_spz])] - ) - atom_ids.append(int(words[idx_id])) - - spin = np.array(norm) * np.array(vec) - atom_ids, spin = zip(*sorted(zip(atom_ids, spin))) - return np.array(spin) - except (ValueError, IndexError) as e: - warnings.warn(f"Error processing spin data: {str(e)}") - return None - - -def system_data( - lines, type_map=None, type_idx_zero=True, unwrap=False, input_file=None -): - array_lines = split_traj(lines) - lines = array_lines[0] - system = {} - system["atom_numbs"] = get_natoms_vec(lines) - system["atom_names"] = [] - if type_map is None: - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append("TYPE_%d" % ii) # noqa: UP031 - else: - assert len(type_map) >= len(system["atom_numbs"]) - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append(type_map[ii]) - bounds, tilt = get_dumpbox(lines) - orig, cell = dumpbox2box(bounds, tilt) - system["orig"] = np.array(orig) - np.array(orig) - system["cells"] = [np.array(cell)] - system["atom_types"] = get_atype(lines, type_idx_zero=type_idx_zero) - system["coords"] = [safe_get_posi(lines, cell, np.array(orig), unwrap)] - spin_keys = get_spin_keys(input_file) - spin = get_spin(lines, spin_keys) - has_spin = False - if spin is not None: - 
system["spins"] = [spin] - has_spin = True - for ii in range(1, len(array_lines)): - bounds, tilt = get_dumpbox(array_lines[ii]) - orig, cell = dumpbox2box(bounds, tilt) - system["cells"].append(cell) - atype = get_atype(array_lines[ii], type_idx_zero=type_idx_zero) - # map atom type; a[as[a][as[as[b]]]] = b[as[b][as^{-1}[b]]] = b[id] - idx = np.argsort(atype, kind="stable")[ - np.argsort(np.argsort(system["atom_types"], kind="stable"), kind="stable") - ] - system["coords"].append( - safe_get_posi(array_lines[ii], cell, np.array(orig), unwrap)[idx] - ) - if has_spin: - spin = get_spin(array_lines[ii], spin_keys) - if spin is not None: - system["spins"].append(spin[idx]) - else: - warnings.warn( - f"Warning: spin info is not found in frame {ii}, remove spin info." - ) - system.pop("spins") - has_spin = False - if has_spin: - system["spins"] = np.array(system["spins"]) - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - return system - - -def split_traj(dump_lines): - marks = [] - for idx, ii in enumerate(dump_lines): - if "ITEM: TIMESTEP" in ii: - marks.append(idx) - if len(marks) == 0: - return None - elif len(marks) == 1: - return [dump_lines] - else: - block_size = marks[1] - marks[0] - ret = [] - for ii in marks: - ret.append(dump_lines[ii : ii + block_size]) - # for ii in range(len(marks)-1): - # assert(marks[ii+1] - marks[ii] == block_size) - return ret - return None - - -def from_system_data(system, f_idx=0, timestep=0): - """Convert system data to LAMMPS dump format string. - - Parameters - ---------- - system : dict - System data dictionary containing atoms, coordinates, cell, etc. 
- f_idx : int, optional - Frame index to dump (default: 0) - timestep : int, optional - Timestep number for the dump (default: 0) - - Returns - ------- - str - LAMMPS dump format string - """ - ret = "" - - # Get basic system info - natoms = sum(system["atom_numbs"]) - coords = system["coords"][f_idx] - cell = system["cells"][f_idx] - atom_types = system["atom_types"] - orig = system.get("orig", np.zeros(3)) - - # Convert cell to dump format (bounds and tilt) - bounds, tilt = box2dumpbox(orig, cell) - - # Write timestep - ret += "ITEM: TIMESTEP\n" - ret += f"{timestep}\n" - - # Write number of atoms - ret += "ITEM: NUMBER OF ATOMS\n" - ret += f"{natoms}\n" - - # Write box bounds - ret += "ITEM: BOX BOUNDS xy xz yz pp pp pp\n" - ret += f"{bounds[0][0]:.10f} {bounds[0][1]:.10f} {tilt[0]:.10f}\n" - ret += f"{bounds[1][0]:.10f} {bounds[1][1]:.10f} {tilt[1]:.10f}\n" - ret += f"{bounds[2][0]:.10f} {bounds[2][1]:.10f} {tilt[2]:.10f}\n" - - # Write atoms header - ret += "ITEM: ATOMS id type x y z\n" - - # Write atom data - for ii in range(natoms): - atom_id = ii + 1 # LAMMPS uses 1-based indexing - atom_type = atom_types[ii] + 1 # LAMMPS uses 1-based type indexing - x, y, z = coords[ii] - ret += f"{atom_id} {atom_type} {x:.10f} {y:.10f} {z:.10f}\n" - - return ret - - -if __name__ == "__main__": - # fname = 'dump.hti' - # lines = open(fname).read().split('\n') - # # print(get_natoms(lines)) - # # print(get_natomtypes(lines)) - # # print(get_natoms_vec(lines)) - # posi = get_posi(lines) - # dbox1, tilt1 = box2dumpbox(orig, box) - # print(dbox - dbox1) - # print(tilt - tilt1) - # print(orig) - # print(box) - # np.savetxt('tmp.out', posi - orig, fmt='%.6f') - # print(system_data(lines)) - lines = load_file("conf_unfold.dump", begin=0, step=1) - al = split_traj(lines) - s = system_data(lines, ["O", "H"]) - # l = np.linalg.norm(s['cells'][1],axis=1) - # p = s['coords'][0] + l - # np.savetxt('p',p,fmt='%1.10f') +from dpdata.formats.lammps.dump import * # noqa: F403 diff --git 
a/dpdata/lammps/lmp.py b/dpdata/lammps/lmp.py index c9d60ec53..30225fcae 100644 --- a/dpdata/lammps/lmp.py +++ b/dpdata/lammps/lmp.py @@ -1,649 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import numpy as np - -from dpdata.periodic_table import ELEMENTS, Element - -ptr_float_fmt = "%15.10f" -ptr_int_fmt = "%6d" -ptr_key_fmt = "%15s" - -# Mapping of LAMMPS atom styles to their column layouts -# Format: (atom_id_col, atom_type_col, x_col, y_col, z_col, has_molecule_id, has_charge, charge_col) -ATOM_STYLE_COLUMNS = { - "atomic": (0, 1, 2, 3, 4, False, False, None), - "angle": (0, 2, 3, 4, 5, True, False, None), - "bond": (0, 2, 3, 4, 5, True, False, None), - "charge": (0, 1, 3, 4, 5, False, True, 2), - "full": (0, 2, 4, 5, 6, True, True, 3), - "molecular": (0, 2, 3, 4, 5, True, False, None), - "dipole": (0, 1, 3, 4, 5, False, True, 2), - "sphere": (0, 1, 4, 5, 6, False, False, None), -} - - -def detect_atom_style(lines: list[str]) -> str | None: - """Detect LAMMPS atom style from data file content. 
- - Parameters - ---------- - lines : list - Lines from LAMMPS data file - - Returns - ------- - str or None - Detected atom style, or None if not detected - """ - # Look for atom style in comments after "Atoms" section header - atom_lines = get_atoms(lines) - if not atom_lines: - return None - - # Find the "Atoms" line - for idx, line in enumerate(lines): - if "Atoms" in line: - # Check if there's a comment with atom style after "Atoms" - if "#" in line: - comment_part = line.split("#")[1].strip().lower() - for style in ATOM_STYLE_COLUMNS: - if style in comment_part: - return style - break - - # If no explicit style found, try to infer from first data line - if atom_lines: - first_line = atom_lines[0].split() - num_cols = len(first_line) - - # Try to match based on number of columns and content patterns - # This is a heuristic approach - if num_cols == 5: - # Could be atomic style: atom-ID atom-type x y z - return "atomic" - elif num_cols == 6: - # Could be charge or bond/molecular style - # Try to determine if column 2 (index 2) looks like a charge (float) or type (int) - try: - val = float(first_line[2]) - # If it's a small float, likely a charge - if abs(val) < 10 and val != int(val): - return "charge" - else: - # Likely molecule ID (integer), so bond/molecular style - return "bond" - except ValueError: - return "atomic" # fallback - elif num_cols == 7: - # Could be full style: atom-ID molecule-ID atom-type charge x y z - return "full" - elif num_cols >= 8: - # Could be dipole or sphere style - # For now, default to dipole if we have enough columns - return "dipole" - - return None # Unable to detect - - -def _get_block(lines, keys): - for idx in range(len(lines)): - if keys in lines[idx]: - break - if idx == len(lines) - 1: - return None - idx_s = idx + 2 - idx = idx_s - ret = [] - while True: - if idx == len(lines) or len(lines[idx].split()) == 0: - break - else: - ret.append(lines[idx]) - idx += 1 - return ret - - -def lmpbox2box(lohi, tilt): - xy = tilt[0] 
- xz = tilt[1] - yz = tilt[2] - orig = np.array([lohi[0][0], lohi[1][0], lohi[2][0]]) - lens = [] - for dd in range(3): - lens.append(lohi[dd][1] - lohi[dd][0]) - xx = [lens[0], 0, 0] - yy = [xy, lens[1], 0] - zz = [xz, yz, lens[2]] - return orig, np.array([xx, yy, zz]) - - -def box2lmpbox(orig, box): - lohi = np.zeros([3, 2]) - for dd in range(3): - lohi[dd][0] = orig[dd] - tilt = np.zeros(3) - tilt[0] = box[1][0] - tilt[1] = box[2][0] - tilt[2] = box[2][1] - lens = np.zeros(3) - lens[0] = box[0][0] - lens[1] = box[1][1] - lens[2] = box[2][2] - for dd in range(3): - lohi[dd][1] = lohi[dd][0] + lens[dd] - return lohi, tilt - - -def get_atoms(lines): - return _get_block(lines, "Atoms") - - -def get_natoms(lines): - for ii in lines: - if "atoms" in ii: - return int(ii.split()[0]) - return None - - -def get_natomtypes(lines): - for ii in lines: - if "atom types" in ii: - return int(ii.split()[0]) - return None - - -def _atom_info_mol(line): - vec = line.split() - # idx, mole_type, atom_type, charge, x, y, z - return ( - int(vec[0]), - int(vec[1]), - int(vec[2]), - float(vec[3]), - float(vec[4]), - float(vec[5]), - float(vec[6]), - ) - - -def _atom_info_atom(line): - vec = line.split() - # idx, atom_type, x, y, z - return int(vec[0]), int(vec[1]), float(vec[2]), float(vec[3]), float(vec[4]) - - -def _atom_info_style(line: str, atom_style: str = "atomic") -> dict[str, int | float]: - """Parse atom information based on the specified atom style. - - Parameters - ---------- - line : str - The atom line from LAMMPS data file - atom_style : str - The LAMMPS atom style (atomic, full, charge, etc.) - - Returns - ------- - dict - Dictionary containing parsed atom information with keys: - 'atom_id', 'atom_type', 'x', 'y', 'z', 'molecule_id' (if present), 'charge' (if present) - """ - if atom_style not in ATOM_STYLE_COLUMNS: - raise ValueError( - f"Unsupported atom style: {atom_style}. 
Supported styles: {list(ATOM_STYLE_COLUMNS.keys())}" - ) - - vec = line.split() - columns = ATOM_STYLE_COLUMNS[atom_style] - - result = { - "atom_id": int(vec[columns[0]]), - "atom_type": int(vec[columns[1]]), - "x": float(vec[columns[2]]), - "y": float(vec[columns[3]]), - "z": float(vec[columns[4]]), - } - - # Add molecule ID if present - if columns[5]: # has_molecule_id - result["molecule_id"] = int( - vec[1] - ) # molecule ID is always in column 1 when present - - # Add charge if present - if columns[6]: # has_charge - result["charge"] = float(vec[columns[7]]) # charge_col - - return result - - -def get_natoms_vec(lines: list[str], atom_style: str = "atomic") -> list[int]: - """Get number of atoms for each atom type. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - list - Number of atoms for each atom type - """ - atype = get_atype(lines, atom_style=atom_style) - natoms_vec = [] - natomtypes = get_natomtypes(lines) - for ii in range(natomtypes): - natoms_vec.append(sum(atype == ii + 1)) - assert sum(natoms_vec) == get_natoms(lines) - return natoms_vec - - -def get_atype( - lines: list[str], type_idx_zero: bool = False, atom_style: str = "atomic" -) -> np.ndarray: - """Get atom types from LAMMPS data file. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray - Array of atom types - """ - alines = get_atoms(lines) - atype = [] - for ii in alines: - atom_info = _atom_info_style(ii, atom_style) - at = atom_info["atom_type"] - if type_idx_zero: - atype.append(at - 1) - else: - atype.append(at) - return np.array(atype, dtype=int) - - -def get_posi(lines: list[str], atom_style: str = "atomic") -> np.ndarray: - """Get atomic positions from LAMMPS data file. 
- - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray - Array of atomic positions - """ - atom_lines = get_atoms(lines) - posis = [] - for ii in atom_lines: - atom_info = _atom_info_style(ii, atom_style) - posis.append([atom_info["x"], atom_info["y"], atom_info["z"]]) - return np.array(posis) - - -def get_charges(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: - """Get atomic charges from LAMMPS data file if the atom style supports charges. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - atom_style : str - The LAMMPS atom style - - Returns - ------- - np.ndarray or None - Array of atomic charges if atom style has charges, None otherwise - """ - if atom_style not in ATOM_STYLE_COLUMNS: - raise ValueError(f"Unsupported atom style: {atom_style}") - - # Check if this atom style has charges - if not ATOM_STYLE_COLUMNS[atom_style][6]: # has_charge - return None - - atom_lines = get_atoms(lines) - charges = [] - for ii in atom_lines: - atom_info = _atom_info_style(ii, atom_style) - charges.append(atom_info["charge"]) - return np.array(charges) - - -def get_spins(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None: - atom_lines = get_atoms(lines) - if len(atom_lines[0].split()) < 8: - return None - spins_ori = [] - spins_norm = [] - for ii in atom_lines: - iis = ii.split() - spins_ori.append([float(jj) for jj in iis[5:8]]) - spins_norm.append([float(iis[-1])]) - return np.array(spins_ori) * np.array(spins_norm) - - -def get_lmpbox(lines): - box_info = [] - tilt = np.zeros(3) - for ii in lines: - if "xlo" in ii and "xhi" in ii: - box_info.append([float(ii.split()[0]), float(ii.split()[1])]) - break - for ii in lines: - if "ylo" in ii and "yhi" in ii: - box_info.append([float(ii.split()[0]), float(ii.split()[1])]) - break - for ii in lines: - if "zlo" in ii and "zhi" in ii: - box_info.append([float(ii.split()[0]), 
float(ii.split()[1])]) - break - for ii in lines: - if "xy" in ii and "xz" in ii and "yz" in ii: - tilt = np.array([float(jj) for jj in ii.split()[0:3]]) - return box_info, tilt - - -def system_data( - lines: list[str], - type_map: list[str] | None = None, - type_idx_zero: bool = True, - atom_style: str = "atomic", -) -> dict: - """Parse LAMMPS data file to system data format. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_map : list, optional - Mapping from atom types to element names - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style (atomic, full, charge, etc.) - - Returns - ------- - dict - System data dictionary - """ - system = {} - system["atom_numbs"] = get_natoms_vec(lines, atom_style=atom_style) - system["atom_names"] = [] - if type_map is None: - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append("Type_%d" % ii) # noqa: UP031 - else: - assert len(type_map) >= len(system["atom_numbs"]) - for ii in range(len(system["atom_numbs"])): - system["atom_names"].append(type_map[ii]) - lohi, tilt = get_lmpbox(lines) - orig, cell = lmpbox2box(lohi, tilt) - system["orig"] = np.array(orig) - system["cells"] = [np.array(cell)] - natoms = sum(system["atom_numbs"]) - system["atom_types"] = get_atype( - lines, type_idx_zero=type_idx_zero, atom_style=atom_style - ) - system["coords"] = [get_posi(lines, atom_style=atom_style)] - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - - # Add charges if the atom style supports them - charges = get_charges(lines, atom_style=atom_style) - if charges is not None: - system["charges"] = np.array([charges]) - - spins = get_spins(lines, atom_style=atom_style) - if spins is not None: - system["spins"] = np.array([spins]) - - return system - - -def to_system_data( - lines: list[str], - type_map: list[str] | None = None, - type_idx_zero: bool = True, - atom_style: str = 
"atomic", -) -> dict: - """Parse LAMMPS data file to system data format. - - Parameters - ---------- - lines : list - Lines from LAMMPS data file - type_map : list, optional - Mapping from atom types to element names - type_idx_zero : bool - Whether to use zero-based indexing for atom types - atom_style : str - The LAMMPS atom style. If "auto", attempts to detect automatically - from file. Default is "atomic". - - Returns - ------- - dict - System data dictionary - """ - # Attempt automatic detection if requested - if atom_style == "auto": - detected_style = detect_atom_style(lines) - if detected_style: - atom_style = detected_style - else: - atom_style = "atomic" # fallback to default - - return system_data( - lines, type_map=type_map, type_idx_zero=type_idx_zero, atom_style=atom_style - ) - - -def rotate_to_lower_triangle( - cell: np.ndarray, coord: np.ndarray -) -> tuple[np.ndarray, np.ndarray]: - """Rotate the cell to lower triangular and ensure the diagonal elements are non-negative. - - Args: - cell (np.ndarray): The original cell matrix. - coord (np.ndarray): The coordinates of the atoms. - - Returns - ------- - tuple[np.ndarray, np.ndarray]: The rotated cell and adjusted coordinates. - """ - q, _ = np.linalg.qr(cell.T) - cell = np.matmul(cell, q) - coord = np.matmul(coord, q) - - # Ensure the diagonal elements of the cell are non-negative - rot = np.eye(3) - if cell[0][0] < 0: - rot[0][0] = -1 - if cell[1][1] < 0: - rot[1][1] = -1 - if cell[2][2] < 0: - rot[2][2] = -1 - cell = np.matmul(cell, rot) - coord = np.matmul(coord, rot) - return cell, coord - - -def _get_lammps_masses(system) -> np.ndarray | None: - """Get masses for the LAMMPS ``Masses`` section. - - Prefer explicitly stored masses when available. Otherwise, infer masses from - ``atom_names`` when all names are valid chemical element symbols. 
- - Parameters - ---------- - system : dict - System data dictionary - - Returns - ------- - np.ndarray or None - Per-type masses aligned with ``atom_names``. Returns ``None`` when the - masses cannot be determined safely. - - Raises - ------ - ValueError - If explicit ``system["masses"]`` is present but does not match the - length of ``atom_names``. - """ - atom_names = system["atom_names"] - masses = system.get("masses") - if masses is not None: - masses = np.asarray(masses, dtype=float) - if masses.ndim != 1 or len(masses) != len(atom_names): - raise ValueError( - 'Explicit system["masses"] must be a 1D array with the same ' - 'length as system["atom_names"] to write the LAMMPS Masses ' - "section." - ) - return masses - - if not all(name in ELEMENTS for name in atom_names): - return None - - return np.array([Element(name).mass for name in atom_names], dtype=float) - - -def from_system_data(system, f_idx=0): - ret = "" - ret += "\n" - natoms = sum(system["atom_numbs"]) - ntypes = len(system["atom_numbs"]) - cell, coord = rotate_to_lower_triangle( - system["cells"][f_idx], system["coords"][f_idx] - ) - ret += "%d atoms\n" % natoms # noqa: UP031 - ret += "%d atom types\n" % ntypes # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " xlo xhi\n") % ( - 0, - cell[0][0], - ) # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " ylo yhi\n") % ( - 0, - cell[1][1], - ) # noqa: UP031 - ret += (ptr_float_fmt + " " + ptr_float_fmt + " zlo zhi\n") % ( - 0, - cell[2][2], - ) # noqa: UP031 - ret += ( - ptr_float_fmt + " " + ptr_float_fmt + " " + ptr_float_fmt + " xy xz yz\n" - ) % ( - cell[1][0], - cell[2][0], - cell[2][1], - ) # noqa: UP031 - ret += "\n" - - masses = _get_lammps_masses(system) - if masses is not None: - ret += "Masses\n" - ret += "\n" - mass_fmt = ptr_int_fmt + " " + ptr_float_fmt + " # %s\n" # noqa: UP031 - for ii, (mass, atom_name) in enumerate(zip(masses, system["atom_names"])): - ret += mass_fmt % (ii + 1, mass, atom_name) - ret += 
"\n" - - ret += "Atoms # atomic\n" - ret += "\n" - coord_fmt = ( - ptr_int_fmt - + " " - + ptr_int_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + "\n" - ) # noqa: UP031 - - if "spins" in system: - coord_fmt = ( - coord_fmt.strip("\n") - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + " " - + ptr_float_fmt - + "\n" - ) # noqa: UP031 - spins_norm = np.linalg.norm(system["spins"][f_idx], axis=1) - for ii in range(natoms): - if "spins" in system: - if spins_norm[ii] != 0: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - system["spins"][f_idx][ii][0] / spins_norm[ii], - system["spins"][f_idx][ii][1] / spins_norm[ii], - system["spins"][f_idx][ii][2] / spins_norm[ii], - spins_norm[ii], - ) # noqa: UP031 - else: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - system["spins"][f_idx][ii][0], - system["spins"][f_idx][ii][1], - system["spins"][f_idx][ii][2] + 1, - spins_norm[ii], - ) # noqa: UP031 - else: - ret += coord_fmt % ( - ii + 1, - system["atom_types"][ii] + 1, - coord[ii][0] - system["orig"][0], - coord[ii][1] - system["orig"][1], - coord[ii][2] - system["orig"][2], - ) # noqa: UP031 - return ret - - -if __name__ == "__main__": - fname = "water-SPCE.data" - lines = open(fname).read().split("\n") - bonds, tilt = get_lmpbox(lines) - # print(bonds, tilt) - orig, box = lmpbox2box(bonds, tilt) - # print(orig, box) - bonds1, tilt1 = box2lmpbox(orig, box) - # print(bonds1, tilt1) - print(bonds1 - bonds) - print(tilt1 - tilt) - print(box) - print(get_atype(lines)) - print(get_posi(lines)) +from dpdata.formats.lammps.lmp import * # noqa: F403 diff --git a/dpdata/lmdb/__init__.py b/dpdata/lmdb/__init__.py index 53a3e8f0e..dd764bd55 100644 --- 
a/dpdata/lmdb/__init__.py +++ b/dpdata/lmdb/__init__.py @@ -1,5 +1,3 @@ from __future__ import annotations -from .format import LMDBFormat - -__all__ = ["LMDBFormat"] +from dpdata.formats.lmdb import * # noqa: F403 diff --git a/dpdata/lmdb/format.py b/dpdata/lmdb/format.py index 9b518be6b..3db613b44 100644 --- a/dpdata/lmdb/format.py +++ b/dpdata/lmdb/format.py @@ -1,286 +1,3 @@ from __future__ import annotations -import os - -import lmdb -import msgpack -import msgpack_numpy as m -import numpy as np - -from dpdata.format import Format - -m.patch() - - -class LMDBError(Exception): - """Base class for LMDB errors.""" - - -class LMDBMetadataError(LMDBError): - """Metadata not found in LMDB.""" - - -class LMDBFrameError(LMDBError): - """Frame data not found in LMDB.""" - - -class LMDBFormat(Format): - """ - Class for handling the LMDB format, which stores atomic configurations in a - Lightning Memory-Mapped Database (LMDB). - - This format is optimized for machine learning workflows where fast, random - access to a large number of frames is required. All frames from multiple - systems (with potentially different numbers of atoms) are stored in a - single LMDB database file. - - Both single systems and multiple systems are supported via the standard - ``dpdata`` APIs. 
- - Examples - -------- - **Saving a single LabeledSystem** - - >>> import dpdata - >>> system = dpdata.LabeledSystem("path/to/input.vasp", fmt="vasp/outcar") - >>> system.to("lmdb", "my_single_system.lmdb") - - **Loading a single LabeledSystem** - - >>> loaded_system = dpdata.LabeledSystem("my_single_system.lmdb", fmt="lmdb") - - **Saving multiple systems to a single LMDB database** - - >>> import dpdata - >>> system_1 = dpdata.LabeledSystem("path/to/system1/OUTCAR", fmt="vasp/outcar") - >>> system_2 = dpdata.LabeledSystem("path/to/system2/OUTCAR", fmt="vasp/outcar") - >>> multi_systems_obj = dpdata.MultiSystems(system_1, system_2) - >>> multi_systems_obj.to("lmdb", "my_multi_system_db.lmdb") - - **Loading multiple systems from a single LMDB database** - - >>> import dpdata - >>> loaded_multi_systems = dpdata.MultiSystems.from_file("my_multi_system_db.lmdb", fmt="lmdb") - """ - - def to_multi_systems( - self, formulas, directory, map_size=1000000000, frame_idx_fmt="012d", **kwargs - ): - """Implement MultiSystems.to for LMDB format. - - Parameters - ---------- - formulas : list[str] - list of formulas - directory : str - directory of system - map_size : int, optional - Maximum size of the LMDB database in bytes. Default is 1GB. - frame_idx_fmt : str, optional - The format string used to encode the frame index as a key. Default is "012d". 
- **kwargs : dict - other parameters - - Yields - ------ - tuple - (self, formula) to be used by to_system - """ - self._frame_idx_fmt = frame_idx_fmt - self._global_frame_idx = 0 - self._system_info = [] - os.makedirs(directory, exist_ok=True) - with lmdb.open(directory, map_size=map_size) as env: - with env.begin(write=True) as txn: - self._txn = txn - for ff in formulas: - yield (self, ff) - # Finalize metadata - metadata = { - "nframes": self._global_frame_idx, - "system_info": self._system_info, - "frame_idx_fmt": self._frame_idx_fmt, - } - txn.put(b"__metadata__", msgpack.packb(metadata, use_bin_type=True)) - self._txn = None - - def _dump_to_txn(self, data, txn, formula, dtypes): - from dpdata.data_type import Axis - - nframes = data["coords"].shape[0] - - # Identify symbolic shapes and frame-dependent keys - data_shapes = {} - frame_dependent_keys = [] - for dt in dtypes: - if dt.name in data: - if dt.shape is not None: - data_shapes[dt.name] = [ - s.value if isinstance(s, Axis) else s for s in dt.shape - ] - if Axis.NFRAMES in dt.shape: - frame_dependent_keys.append(dt.name) - else: - data_shapes[dt.name] = None - - # Record system info - # natoms needs to be extracted from data - if "atom_numbs" in data: - natoms_list = data["atom_numbs"] - else: - # Fallback for systems without atom_numbs (should not happen in valid dpdata systems) - natoms_list = [] - - self._system_info.append( - { - "formula": formula, - "natoms": natoms_list, - "nframes": nframes, - "start_idx": self._global_frame_idx, - "data_shapes": data_shapes, - "frame_dependent_keys": frame_dependent_keys, - } - ) - - for i in range(nframes): - frame_data = {} - for key, val in data.items(): - if key in frame_dependent_keys: - frame_data[key] = val[i] - else: - frame_data[key] = val - - key = f"{self._global_frame_idx:{self._frame_idx_fmt}}".encode("ascii") - value = msgpack.packb(frame_data, use_bin_type=True) - txn.put(key, value) - self._global_frame_idx += 1 - - def to_labeled_system(self, 
data, file_name, **kwargs): - """Save a single LabeledSystem to an LMDB database.""" - from dpdata.system import LabeledSystem - - if isinstance(file_name, tuple) and file_name[0] is self: - txn, formula = self._txn, file_name[1] - self._dump_to_txn(data, txn, formula, LabeledSystem.DTYPES) - else: - # Single system call: use to_multi_systems logic - # Infer formula from data if possible, or use default - formula = kwargs.get("formula", "unknown") - gen = self.to_multi_systems([formula], file_name, **kwargs) - handle = next(gen) - self.to_labeled_system(data, handle, **kwargs) - try: - next(gen) - except StopIteration: - pass - - def to_system(self, data, file_name, **kwargs): - """Save a single System to an LMDB database.""" - from dpdata.system import System - - if isinstance(file_name, tuple) and file_name[0] is self: - txn, formula = self._txn, file_name[1] - self._dump_to_txn(data, txn, formula, System.DTYPES) - else: - # Single system call - formula = kwargs.get("formula", "unknown") - gen = self.to_multi_systems([formula], file_name, **kwargs) - handle = next(gen) - self.to_system(data, handle, **kwargs) - try: - next(gen) - except StopIteration: - pass - - def from_multi_systems(self, file_name, map_size=1000000000, **kwargs): - """Load multiple systems from a single LMDB database. - - Parameters - ---------- - file_name : str - The path to the LMDB database directory. - map_size : int, optional - Maximum size of the LMDB database in bytes. 
- **kwargs : dict - other parameters - - Yields - ------ - dict - data dictionary for each system - """ - from dpdata.data_type import Axis, DataType - from dpdata.system import LabeledSystem, System - - with lmdb.open(file_name, readonly=True) as env: - with env.begin() as txn: - metadata_packed = txn.get(b"__metadata__") - if metadata_packed is None: - raise LMDBMetadataError("LMDB database does not contain metadata.") - metadata = msgpack.unpackb(metadata_packed, raw=False) - frame_idx_fmt = metadata.get("frame_idx_fmt", "012d") - - for sys_info in metadata["system_info"]: - system_frames = [] - start_idx = sys_info["start_idx"] - nframes = sys_info["nframes"] - data_shapes = sys_info.get("data_shapes", {}) - frame_dependent_keys = sys_info.get("frame_dependent_keys", []) - - for i in range(start_idx, start_idx + nframes): - key = f"{i:{frame_idx_fmt}}".encode("ascii") - value = txn.get(key) - if value is None: - raise LMDBFrameError(f"Frame data not found for key: {key}") - frame_data = msgpack.unpackb(value, raw=False) - system_frames.append(frame_data) - - # Aggregate data for one system - first_frame = system_frames[0] - is_labeled = "energies" in first_frame - cls = LabeledSystem if is_labeled else System - - # Auto-register unknown data types - existing_dt_names = [dt.name for dt in cls.DTYPES] - new_dts = [] - axis_map = {a.value: a for a in Axis} - for key, val in first_frame.items(): - if key not in existing_dt_names and key in data_shapes: - shape_raw = data_shapes[key] - if shape_raw is not None: - shape = tuple([axis_map.get(s, s) for s in shape_raw]) - else: - shape = None - - v_arr = np.array(val) - new_dts.append( - DataType(key, type(v_arr), shape=shape, required=False) - ) - - if new_dts: - cls.register_data_type(*new_dts) - - agg_data = {} - for key, val in first_frame.items(): - if key in frame_dependent_keys: - agg_data[key] = np.array([d[key] for d in system_frames]) - else: - agg_data[key] = val - - yield agg_data - - def 
from_labeled_system(self, file_name, **kwargs): - """Load data for a single LabeledSystem from an LMDB database.""" - if isinstance(file_name, dict): - return file_name - # from_multi_systems returns a generator of dicts - gen = self.from_multi_systems(file_name, **kwargs) - return next(gen) - - def from_system(self, file_name, **kwargs): - """Load data for a single System from an LMDB database.""" - if isinstance(file_name, dict): - return file_name - # from_multi_systems returns a generator of dicts - gen = self.from_multi_systems(file_name, **kwargs) - return next(gen) +from dpdata.formats.lmdb.format import * # noqa: F403 diff --git a/dpdata/openmx/__init__.py b/dpdata/openmx/__init__.py index e69de29bb..16d07d584 100644 --- a/dpdata/openmx/__init__.py +++ b/dpdata/openmx/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.openmx import * # noqa: F403 diff --git a/dpdata/openmx/omx.py b/dpdata/openmx/omx.py index 89f853687..4b0635a96 100644 --- a/dpdata/openmx/omx.py +++ b/dpdata/openmx/omx.py @@ -1,200 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) - -ry2ev = EnergyConversion("rydberg", "eV").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - -import warnings -from collections import OrderedDict - - -def load_atom(lines): - atom_names = [] - atom_names_mode = False - for line in lines: - if "" in line: - atom_names_mode = False - elif atom_names_mode: - parts = line.split() - atom_names.append(parts[1]) - atom_names_original = 
atom_names - atom_names = list(OrderedDict.fromkeys(set(atom_names))) - atom_names = sorted( - atom_names, key=atom_names_original.index - ) # Unique ordering of atomic species - ntypes = len(atom_names) - atom_numbs = [0] * ntypes - atom_types = [] - atom_types_mode = False - for line in lines: - if "" in line: - atom_types_mode = False - elif atom_types_mode: - parts = line.split() - for i, atom_name in enumerate(atom_names): - if parts[1] == atom_name: - atom_numbs[i] += 1 - atom_types.append(i) - atom_types = np.array(atom_types) - return atom_names, atom_types, atom_numbs - - -def load_cells(lines): - cells = [] - for line in lines: - if "Cell_Vectors=" in line: - part = line.split("Cell_Vectors=")[1] - parts = part.split() - values = list(map(float, parts[:9])) - cell = [values[0:3], values[3:6], values[6:9]] - cells.append(cell) - # Checking SCF converged or not - for token in line.split(): - if token.startswith("scf_conv="): - scf_conv = int(token.split("=")[1]) - if scf_conv == 0: - warnings.warn("SCF not converged!", stacklevel=2) - cells = np.array(cells) - return cells - - -# load atom_names, atom_numbs, atom_types, cells -def load_param_file(fname: FileType, mdname: FileType): - with open_file(fname) as dat_file: - lines = dat_file.readlines() - atom_names, atom_types, atom_numbs = load_atom(lines) - - with open_file(mdname) as md_file: - lines = md_file.readlines() - cells = load_cells(lines) - return atom_names, atom_numbs, atom_types, cells - - -def load_coords(lines, atom_names, natoms): - cnt = 0 - coord, coords = [], [] - for line in lines: - if "time=" in line: - continue - for atom_name in atom_names: - atom_name += " " - if atom_name in line: - cnt += 1 - parts = line.split() - for_line = [float(parts[1]), float(parts[2]), float(parts[3])] - coord.append(for_line) - if cnt == natoms: - coords.append(coord) - cnt = 0 - coord = [] - coords = np.array(coords) - return coords - - -def load_data(mdname: FileType, atom_names, natoms): - with 
open_file(mdname) as md_file: - lines = md_file.readlines() - coords = load_coords(lines, atom_names, natoms) - steps = [str(i) for i in range(1, coords.shape[0] + 1)] - return coords, steps - - -def to_system_data(fname: FileType, mdname: FileType): - data = {} - ( - data["atom_names"], - data["atom_numbs"], - data["atom_types"], - data["cells"], - ) = load_param_file(fname, mdname) - data["coords"], steps = load_data( - mdname, - data["atom_names"], - np.sum(data["atom_numbs"]), - ) - data["orig"] = np.zeros(3) - return data, steps - - -def load_energy(lines): - energy = [] - for line in lines: - if "time=" in line: - parts = line.split() - ene_line = float(parts[4]) # Hartree - energy.append(ene_line) - continue - energy = energy_convert * np.array(energy) # Hartree -> eV - return energy - - -def load_force(lines, atom_names, atom_numbs): - cnt = 0 - field, fields = [], [] - for line in lines: - if "time=" in line: - continue - for atom_name in atom_names: - atom_name += " " - if atom_name in line: - cnt += 1 - parts = line.split() - for_line = [float(parts[4]), float(parts[5]), float(parts[6])] - field.append(for_line) - if cnt == np.sum(atom_numbs): - fields.append(field) - cnt = 0 - field = [] - force = force_convert * np.array(fields) - return force - - -# load energy, force -def to_system_label(fname, mdname): - atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) - with open_file(mdname) as md_file: - lines = md_file.readlines() - energy = load_energy(lines) - force = load_force(lines, atom_names, atom_numbs) - return energy, force - - -if __name__ == "__main__": - file_name = "Au111Surface" - fname = f"{file_name}.dat" - mdname = f"{file_name}.md" - atom_names, atom_numbs, atom_types, cells = load_param_file(fname, mdname) - coords, steps = load_data(mdname, atom_names, np.sum(atom_numbs)) - data, steps = to_system_data(fname, mdname) - energy, force = to_system_label(fname, mdname) - print(atom_names) - print(atom_numbs) - 
print(atom_types) -# print(cells.shape) -# print(coords.shape) -# print(len(energy)) -# print(force.shape) +from dpdata.formats.openmx.omx import * # noqa: F403 diff --git a/dpdata/orca/__init__.py b/dpdata/orca/__init__.py index e69de29bb..ff93fd88b 100644 --- a/dpdata/orca/__init__.py +++ b/dpdata/orca/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.orca import * # noqa: F403 diff --git a/dpdata/orca/output.py b/dpdata/orca/output.py index a0915162b..be44132cc 100644 --- a/dpdata/orca/output.py +++ b/dpdata/orca/output.py @@ -1,73 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_orca_sp_output( - fn: FileType, -) -> tuple[np.ndarray, np.ndarray, float, np.ndarray]: - """Read from ORCA output. - - Note that both the energy and the gradient should be printed. - - Parameters - ---------- - fn : str - file name - - Returns - ------- - np.ndarray - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - """ - coord = None - symbols = None - forces = None - energy = None - with open_file(fn) as f: - flag = 0 - for line in f: - if flag in (1, 3, 4): - flag += 1 - elif flag == 2: - s = line.split() - if not len(s): - flag = 0 - else: - symbols.append(s[0].capitalize()) - coord.append([float(s[1]), float(s[2]), float(s[3])]) - elif flag == 5: - s = line.split() - if not len(s): - flag = 0 - else: - forces.append([float(s[3]), float(s[4]), float(s[5])]) - elif line.startswith("CARTESIAN COORDINATES (ANGSTROEM)"): - # coord - flag = 1 - coord = [] - symbols = [] - elif line.startswith("CARTESIAN GRADIENT"): - flag = 3 - forces = [] - elif line.startswith("FINAL SINGLE POINT ENERGY"): - energy = float(line.split()[-1]) - symbols = np.array(symbols) - forces = -np.array(forces) - coord = np.array(coord) - assert coord.shape 
== forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.orca.output import * # noqa: F403 diff --git a/dpdata/plugins/3dmol.py b/dpdata/plugins/3dmol.py index 56ec25161..d3ce0e9a7 100644 --- a/dpdata/plugins/3dmol.py +++ b/dpdata/plugins/3dmol.py @@ -3,7 +3,7 @@ import numpy as np from dpdata.format import Format -from dpdata.xyz.xyz import coord_to_xyz +from dpdata.formats.xyz.xyz import coord_to_xyz @Format.register("3dmol") diff --git a/dpdata/plugins/abacus.py b/dpdata/plugins/abacus.py index 0423bda58..c8736fe39 100644 --- a/dpdata/plugins/abacus.py +++ b/dpdata/plugins/abacus.py @@ -5,12 +5,12 @@ import numpy as np -import dpdata.abacus.md -import dpdata.abacus.relax -import dpdata.abacus.scf -from dpdata.abacus.stru import get_frame_from_stru, make_unlabeled_stru +import dpdata.formats.abacus.md +import dpdata.formats.abacus.relax +import dpdata.formats.abacus.scf from dpdata.data_type import Axis, DataType from dpdata.format import Format +from dpdata.formats.abacus.stru import get_frame_from_stru, make_unlabeled_stru from dpdata.utils import open_file if TYPE_CHECKING: @@ -90,7 +90,7 @@ def register_move_data(data): class AbacusSCFFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, **kwargs): - data = dpdata.abacus.scf.get_frame(file_name) + data = dpdata.formats.abacus.scf.get_frame(file_name) register_mag_data(data) register_move_data(data) return data @@ -102,7 +102,7 @@ def from_labeled_system(self, file_name, **kwargs): class AbacusMDFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, **kwargs): - data = dpdata.abacus.md.get_frame(file_name) + data = dpdata.formats.abacus.md.get_frame(file_name) register_mag_data(data) register_move_data(data) return data @@ -114,7 +114,7 @@ def from_labeled_system(self, file_name, **kwargs): class AbacusRelaxFormat(Format): # @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, 
**kwargs): - data = dpdata.abacus.relax.get_frame(file_name) + data = dpdata.formats.abacus.relax.get_frame(file_name) register_mag_data(data) register_move_data(data) return data diff --git a/dpdata/plugins/amber.py b/dpdata/plugins/amber.py index c51af3465..60cfc393c 100644 --- a/dpdata/plugins/amber.py +++ b/dpdata/plugins/amber.py @@ -4,8 +4,8 @@ import subprocess as sp import tempfile -import dpdata.amber.md -import dpdata.amber.sqm +import dpdata.formats.amber.md +import dpdata.formats.amber.sqm from dpdata.driver import Driver, Minimizer from dpdata.format import Format from dpdata.utils import open_file @@ -26,7 +26,7 @@ def from_system( parm7_file = file_name + ".parm7" if nc_file is None: nc_file = file_name + ".nc" - return dpdata.amber.md.read_amber_traj( + return dpdata.formats.amber.md.read_amber_traj( parm7_file=parm7_file, nc_file=nc_file, use_element_symbols=use_element_symbols, @@ -55,7 +55,7 @@ def from_labeled_system( mden_file = file_name + ".mden" if mdout_file is None: mdout_file = file_name + ".mdout" - return dpdata.amber.md.read_amber_traj( + return dpdata.formats.amber.md.read_amber_traj( parm7_file, nc_file, mdfrc_file, mden_file, mdout_file, use_element_symbols ) @@ -64,11 +64,11 @@ def from_labeled_system( class SQMOutFormat(Format): def from_system(self, fname, **kwargs): """Read from ambertools sqm.out.""" - return dpdata.amber.sqm.parse_sqm_out(fname) + return dpdata.formats.amber.sqm.parse_sqm_out(fname) def from_labeled_system(self, fname, **kwargs): """Read from ambertools sqm.out.""" - data = dpdata.amber.sqm.parse_sqm_out(fname) + data = dpdata.formats.amber.sqm.parse_sqm_out(fname) assert "forces" in list(data.keys()), f"No forces in {fname}" return data @@ -104,7 +104,7 @@ def to_system(self, data, fname=None, frame_idx=0, **kwargs): mult : int, default=1 multiplicity. Only 1 is allowed. 
""" - return dpdata.amber.sqm.make_sqm_in(data, fname, frame_idx, **kwargs) + return dpdata.formats.amber.sqm.make_sqm_in(data, fname, frame_idx, **kwargs) @Driver.register("sqm") diff --git a/dpdata/plugins/cp2k.py b/dpdata/plugins/cp2k.py index f5c1b5394..61f2eaf9d 100644 --- a/dpdata/plugins/cp2k.py +++ b/dpdata/plugins/cp2k.py @@ -2,9 +2,9 @@ import glob -import dpdata.cp2k.output -from dpdata.cp2k.output import Cp2kSystems +import dpdata.formats.cp2k.output from dpdata.format import Format +from dpdata.formats.cp2k.output import Cp2kSystems string_warning = """ Hi, you got an error from dpdata, @@ -42,7 +42,7 @@ def from_labeled_system(self, file_name, restart=False, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.cp2k.output.get_frames(file_name) + ) = dpdata.formats.cp2k.output.get_frames(file_name) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 860f52d02..99bd9b237 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -6,10 +6,10 @@ import numpy as np import dpdata -import dpdata.deepmd.comp -import dpdata.deepmd.hdf5 -import dpdata.deepmd.mixed -import dpdata.deepmd.raw +import dpdata.formats.deepmd.comp +import dpdata.formats.deepmd.hdf5 +import dpdata.formats.deepmd.mixed +import dpdata.formats.deepmd.raw from dpdata.data_type import Axis, DataType from dpdata.driver import Driver from dpdata.format import Format @@ -45,17 +45,17 @@ def register_spin(): class DeePMDRawFormat(Format): def from_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.raw.to_system_data( + return dpdata.formats.deepmd.raw.to_system_data( file_name, type_map=type_map, labels=False ) def to_system(self, data, file_name, **kwargs): """Dump the system in deepmd raw format to directory `file_name`.""" - dpdata.deepmd.raw.dump(file_name, data) + dpdata.formats.deepmd.raw.dump(file_name, data) def 
from_labeled_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.raw.to_system_data( + return dpdata.formats.deepmd.raw.to_system_data( file_name, type_map=type_map, labels=True ) @@ -67,7 +67,7 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): class DeePMDCompFormat(Format): def from_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.comp.to_system_data( + return dpdata.formats.deepmd.comp.to_system_data( file_name, type_map=type_map, labels=False ) @@ -92,11 +92,13 @@ def to_system(self, data, file_name, set_size=5000, prec=np.float64, **kwargs): **kwargs : dict other parameters """ - dpdata.deepmd.comp.dump(file_name, data, set_size=set_size, comp_prec=prec) + dpdata.formats.deepmd.comp.dump( + file_name, data, set_size=set_size, comp_prec=prec + ) def from_labeled_system(self, file_name, type_map=None, **kwargs): register_spin() - return dpdata.deepmd.comp.to_system_data( + return dpdata.formats.deepmd.comp.to_system_data( file_name, type_map=type_map, labels=True ) @@ -130,7 +132,7 @@ class DeePMDMixedFormat(Format): """ def from_system_mix(self, file_name, type_map=None, **kwargs): - return dpdata.deepmd.mixed.to_system_data( + return dpdata.formats.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=False ) @@ -155,10 +157,12 @@ def to_system( **kwargs : dict other parameters """ - dpdata.deepmd.mixed.dump(file_name, data, set_size=set_size, comp_prec=prec) + dpdata.formats.deepmd.mixed.dump( + file_name, data, set_size=set_size, comp_prec=prec + ) def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): - return dpdata.deepmd.mixed.to_system_data( + return dpdata.formats.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=True ) @@ -193,7 +197,7 @@ def mix_system(self, *system, type_map, atom_numb_pad=None, **kwargs): >>> import dpdata >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) """ 
- return dpdata.deepmd.mixed.mix_system( + return dpdata.formats.deepmd.mixed.mix_system( *system, type_map=type_map, atom_numb_pad=atom_numb_pad, **kwargs ) @@ -257,14 +261,14 @@ def _from_system( register_spin() if isinstance(file_name, (h5py.Group, h5py.File)): - return dpdata.deepmd.hdf5.to_system_data( + return dpdata.formats.deepmd.hdf5.to_system_data( file_name, "", type_map=type_map, labels=labels ) elif isinstance(file_name, str): s = file_name.split("#") name = s[1] if len(s) > 1 else "" with h5py.File(s[0], "r") as f: - return dpdata.deepmd.hdf5.to_system_data( + return dpdata.formats.deepmd.hdf5.to_system_data( f, name, type_map=type_map, labels=labels ) else: @@ -357,14 +361,14 @@ def to_system( import h5py if isinstance(file_name, (h5py.Group, h5py.File)): - dpdata.deepmd.hdf5.dump( + dpdata.formats.deepmd.hdf5.dump( file_name, "", data, set_size=set_size, comp_prec=comp_prec ) elif isinstance(file_name, str): s = file_name.split("#") name = s[1] if len(s) > 1 else "" with h5py.File(s[0], "w") as f: - dpdata.deepmd.hdf5.dump( + dpdata.formats.deepmd.hdf5.dump( f, name, data, set_size=set_size, comp_prec=comp_prec ) else: diff --git a/dpdata/plugins/dftbplus.py b/dpdata/plugins/dftbplus.py index 247fedc9e..191576e05 100644 --- a/dpdata/plugins/dftbplus.py +++ b/dpdata/plugins/dftbplus.py @@ -2,8 +2,8 @@ import numpy as np -from dpdata.dftbplus.output import read_dftb_plus from dpdata.format import Format +from dpdata.formats.dftbplus.output import read_dftb_plus from dpdata.unit import EnergyConversion, ForceConversion energy_convert = EnergyConversion("hartree", "eV").value() diff --git a/dpdata/plugins/fhi_aims.py b/dpdata/plugins/fhi_aims.py index 3c198aff6..310496e2a 100644 --- a/dpdata/plugins/fhi_aims.py +++ b/dpdata/plugins/fhi_aims.py @@ -1,6 +1,6 @@ from __future__ import annotations -import dpdata.fhi_aims.output +import dpdata.formats.fhi_aims.output from dpdata.format import Format @@ -20,7 +20,7 @@ def from_labeled_system( 
data["energies"], data["forces"], tmp_virial, - ) = dpdata.fhi_aims.output.get_frames( + ) = dpdata.formats.fhi_aims.output.get_frames( file_name, md=md, begin=begin, @@ -45,7 +45,9 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.fhi_aims.output.get_frames(file_name, md=False, begin=0, step=1) + ) = dpdata.formats.fhi_aims.output.get_frames( + file_name, md=False, begin=0, step=1 + ) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/gaussian.py b/dpdata/plugins/gaussian.py index d2c0f7237..bfc8b273d 100644 --- a/dpdata/plugins/gaussian.py +++ b/dpdata/plugins/gaussian.py @@ -7,9 +7,9 @@ import numpy as np -import dpdata.gaussian.fchk -import dpdata.gaussian.gjf -import dpdata.gaussian.log +import dpdata.formats.gaussian.fchk +import dpdata.formats.gaussian.gjf +import dpdata.formats.gaussian.log from dpdata.data_type import Axis, DataType from dpdata.driver import Driver from dpdata.format import Format @@ -35,7 +35,7 @@ def register_hessian_data(data): class GaussianLogFormat(Format): def from_labeled_system(self, file_name: FileType, md=False, **kwargs): try: - return dpdata.gaussian.log.to_system_data(file_name, md=md) + return dpdata.formats.gaussian.log.to_system_data(file_name, md=md) except AssertionError: return {"energies": [], "forces": [], "nopbc": True} @@ -46,7 +46,7 @@ def from_labeled_system( self, file_name: FileType, has_forces=True, has_hessian=True, **kwargs ): try: - data = dpdata.gaussian.fchk.to_system_data( + data = dpdata.formats.gaussian.fchk.to_system_data( file_name, has_forces=has_forces, has_hessian=has_hessian ) register_hessian_data(data) @@ -77,7 +77,7 @@ def from_system(self, file_name: FileType, **kwargs): """ with open_file(file_name) as fp: text = fp.read() - return dpdata.gaussian.gjf.read_gaussian_input(text) + return dpdata.formats.gaussian.gjf.read_gaussian_input(text) def to_system(self, data: dict, file_name: 
FileType, **kwargs): """Generate Gaussian input file. @@ -89,9 +89,9 @@ def to_system(self, data: dict, file_name: FileType, **kwargs): file_name : str file name **kwargs : dict - Other parameters to make input files. See :meth:`dpdata.gaussian.gjf.make_gaussian_input` + Other parameters to make input files. See :meth:`dpdata.formats.gaussian.gjf.make_gaussian_input` """ - text = dpdata.gaussian.gjf.make_gaussian_input(data, **kwargs) + text = dpdata.formats.gaussian.gjf.make_gaussian_input(data, **kwargs) with open_file(file_name, "w") as fp: fp.write(text) @@ -108,7 +108,7 @@ class GaussianDriver(Driver): gaussian_exec : str, default=g16 path to gaussian program **kwargs : dict - other arguments to make input files. See :meth:`dpdata.gaussian.gjf.make_gaussian_input` + other arguments to make input files. See :meth:`dpdata.formats.gaussian.gjf.make_gaussian_input` Examples -------- diff --git a/dpdata/plugins/gromacs.py b/dpdata/plugins/gromacs.py index a7066bbcc..837b6f6d8 100644 --- a/dpdata/plugins/gromacs.py +++ b/dpdata/plugins/gromacs.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -import dpdata.gromacs.gro +import dpdata.formats.gromacs.gro from dpdata.format import Format from dpdata.utils import open_file @@ -25,7 +25,7 @@ def from_system(self, file_name, format_atom_name=True, **kwargs): **kwargs : dict other parameters """ - return dpdata.gromacs.gro.file_to_system_data( + return dpdata.formats.gromacs.gro.file_to_system_data( file_name, format_atom_name=format_atom_name, **kwargs ) @@ -49,11 +49,13 @@ def to_system( if frame_idx == -1: strs = [] for idx in range(data["coords"].shape[0]): - gro_str = dpdata.gromacs.gro.from_system_data(data, f_idx=idx, **kwargs) + gro_str = dpdata.formats.gromacs.gro.from_system_data( + data, f_idx=idx, **kwargs + ) strs.append(gro_str) gro_str = "\n".join(strs) else: - gro_str = dpdata.gromacs.gro.from_system_data( + gro_str = dpdata.formats.gromacs.gro.from_system_data( data, f_idx=frame_idx, **kwargs ) diff 
--git a/dpdata/plugins/lammps.py b/dpdata/plugins/lammps.py index b00d4ff0c..9a4622659 100644 --- a/dpdata/plugins/lammps.py +++ b/dpdata/plugins/lammps.py @@ -4,8 +4,8 @@ import numpy as np -import dpdata.lammps.dump -import dpdata.lammps.lmp +import dpdata.formats.lammps.dump +import dpdata.formats.lammps.lmp from dpdata.data_type import Axis, DataType from dpdata.format import Format from dpdata.utils import open_file @@ -103,7 +103,9 @@ def from_system( """ with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - data = dpdata.lammps.lmp.to_system_data(lines, type_map, atom_style=atom_style) + data = dpdata.formats.lammps.lmp.to_system_data( + lines, type_map, atom_style=atom_style + ) register_spin(data) register_charge(data) return data @@ -123,7 +125,7 @@ def to_system(self, data, file_name: FileType, frame_idx=0, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.lammps.lmp.from_system_data(data, frame_idx) + w_str = dpdata.formats.lammps.lmp.from_system_data(data, frame_idx) with open_file(file_name, "w") as fp: fp.write(w_str) @@ -164,8 +166,8 @@ def from_system( dict The system data """ - lines = dpdata.lammps.dump.load_file(file_name, begin=begin, step=step) - data = dpdata.lammps.dump.system_data( + lines = dpdata.formats.lammps.dump.load_file(file_name, begin=begin, step=step) + data = dpdata.formats.lammps.dump.system_data( lines, type_map, unwrap=unwrap, input_file=input_file ) register_spin(data) @@ -188,6 +190,6 @@ def to_system(self, data, file_name: FileType, frame_idx=0, timestep=0, **kwargs other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.lammps.dump.from_system_data(data, frame_idx, timestep) + w_str = dpdata.formats.lammps.dump.from_system_data(data, frame_idx, timestep) with open_file(file_name, "w") as fp: fp.write(w_str) diff --git a/dpdata/plugins/lmdb.py b/dpdata/plugins/lmdb.py index 8391c1fae..5c83dbe21 100644 --- a/dpdata/plugins/lmdb.py +++ 
b/dpdata/plugins/lmdb.py @@ -1,6 +1,6 @@ from __future__ import annotations from dpdata.format import Format -from dpdata.lmdb.format import LMDBFormat +from dpdata.formats.lmdb.format import LMDBFormat Format.register("lmdb")(LMDBFormat) diff --git a/dpdata/plugins/openmx.py b/dpdata/plugins/openmx.py index 4e16566dc..dc0af9c2e 100644 --- a/dpdata/plugins/openmx.py +++ b/dpdata/plugins/openmx.py @@ -1,7 +1,7 @@ from __future__ import annotations +import dpdata.formats.openmx.omx import dpdata.md.pbc -import dpdata.openmx.omx from dpdata.format import Format @@ -35,7 +35,7 @@ def from_system(self, file_name: str, **kwargs) -> dict: fname = f"{file_name}.dat" mdname = f"{file_name}.md" - data, _ = dpdata.openmx.omx.to_system_data(fname, mdname) + data, _ = dpdata.formats.openmx.omx.to_system_data(fname, mdname) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], @@ -61,12 +61,12 @@ def from_labeled_system(self, file_name: str, **kwargs) -> dict: fname = f"{file_name}.dat" mdname = f"{file_name}.md" - data, cs = dpdata.openmx.omx.to_system_data(fname, mdname) + data, cs = dpdata.formats.openmx.omx.to_system_data(fname, mdname) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], ) - data["energies"], data["forces"] = dpdata.openmx.omx.to_system_label( + data["energies"], data["forces"] = dpdata.formats.openmx.omx.to_system_label( fname, mdname ) return data diff --git a/dpdata/plugins/orca.py b/dpdata/plugins/orca.py index 7a0b806c9..2c436be1e 100644 --- a/dpdata/plugins/orca.py +++ b/dpdata/plugins/orca.py @@ -5,7 +5,7 @@ import numpy as np from dpdata.format import Format -from dpdata.orca.output import read_orca_sp_output +from dpdata.formats.orca.output import read_orca_sp_output from dpdata.unit import EnergyConversion, ForceConversion if TYPE_CHECKING: diff --git a/dpdata/plugins/psi4.py b/dpdata/plugins/psi4.py index 2bbfc2321..9aca1b80e 100644 --- a/dpdata/plugins/psi4.py +++ b/dpdata/plugins/psi4.py @@ -5,8 +5,8 @@ 
import numpy as np from dpdata.format import Format -from dpdata.psi4.input import write_psi4_input -from dpdata.psi4.output import read_psi4_output +from dpdata.formats.psi4.input import write_psi4_input +from dpdata.formats.psi4.output import read_psi4_output from dpdata.unit import EnergyConversion, ForceConversion from dpdata.utils import open_file diff --git a/dpdata/plugins/pwmat.py b/dpdata/plugins/pwmat.py index 38a5bb297..5ee8483c2 100644 --- a/dpdata/plugins/pwmat.py +++ b/dpdata/plugins/pwmat.py @@ -4,8 +4,8 @@ import numpy as np -import dpdata.pwmat.atomconfig -import dpdata.pwmat.movement +import dpdata.formats.pwmat.atomconfig +import dpdata.formats.pwmat.movement from dpdata.format import Format from dpdata.utils import open_file @@ -33,7 +33,7 @@ def from_labeled_system( data["energies"], tmp_force, tmp_virial, - ) = dpdata.pwmat.movement.get_frames( + ) = dpdata.formats.pwmat.movement.get_frames( file_name, begin=begin, step=step, convergence_check=convergence_check ) if tmp_force is not None: @@ -58,7 +58,7 @@ class PwmatAtomconfigFormat(Format): def from_system(self, file_name: FileType, **kwargs): with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - return dpdata.pwmat.atomconfig.to_system_data(lines) + return dpdata.formats.pwmat.atomconfig.to_system_data(lines) def to_system(self, data, file_name: FileType, frame_idx=0, *args, **kwargs): """Dump the system in pwmat atom.config format. 
@@ -77,6 +77,6 @@ def to_system(self, data, file_name: FileType, frame_idx=0, *args, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - w_str = dpdata.pwmat.atomconfig.from_system_data(data, frame_idx) + w_str = dpdata.formats.pwmat.atomconfig.from_system_data(data, frame_idx) with open_file(file_name, "w") as fp: fp.write(w_str) diff --git a/dpdata/plugins/pymatgen.py b/dpdata/plugins/pymatgen.py index b8099a3ab..e9c91fc8a 100644 --- a/dpdata/plugins/pymatgen.py +++ b/dpdata/plugins/pymatgen.py @@ -2,8 +2,8 @@ import numpy as np -import dpdata.pymatgen.molecule -import dpdata.pymatgen.structure +import dpdata.formats.pymatgen.molecule +import dpdata.formats.pymatgen.structure from dpdata.format import Format @@ -24,7 +24,7 @@ def from_system(self, structure, **kwargs) -> dict: dict data dict """ - return dpdata.pymatgen.structure.from_system_data(structure) + return dpdata.formats.pymatgen.structure.from_system_data(structure) def to_system(self, data, **kwargs): """Convert System to Pymatgen Structure obj.""" @@ -56,7 +56,7 @@ def from_system(self, file_name, **kwargs): except ModuleNotFoundError as e: raise ImportError("No module pymatgen.Molecule") from e - return dpdata.pymatgen.molecule.to_system_data(file_name) + return dpdata.formats.pymatgen.molecule.to_system_data(file_name) def to_system(self, data, **kwargs): """Convert System to Pymatgen Molecule obj.""" diff --git a/dpdata/plugins/qe.py b/dpdata/plugins/qe.py index 682bb202e..b9bd84c0c 100644 --- a/dpdata/plugins/qe.py +++ b/dpdata/plugins/qe.py @@ -1,8 +1,8 @@ from __future__ import annotations +import dpdata.formats.qe.scf +import dpdata.formats.qe.traj import dpdata.md.pbc -import dpdata.qe.scf -import dpdata.qe.traj from dpdata.format import Format @@ -10,7 +10,7 @@ class QECPTrajFormat(Format): @Format.post("rot_lower_triangular") def from_system(self, file_name, begin=0, step=1, **kwargs): - data, _ = dpdata.qe.traj.to_system_data( + data, _ = 
dpdata.formats.qe.traj.to_system_data( file_name + ".in", file_name, begin=begin, step=step ) data["coords"] = dpdata.md.pbc.apply_pbc( @@ -21,14 +21,14 @@ def from_system(self, file_name, begin=0, step=1, **kwargs): @Format.post("rot_lower_triangular") def from_labeled_system(self, file_name, begin=0, step=1, **kwargs): - data, cs = dpdata.qe.traj.to_system_data( + data, cs = dpdata.formats.qe.traj.to_system_data( file_name + ".in", file_name, begin=begin, step=step ) data["coords"] = dpdata.md.pbc.apply_pbc( data["coords"], data["cells"], ) - data["energies"], data["forces"], es = dpdata.qe.traj.to_system_label( + data["energies"], data["forces"], es = dpdata.formats.qe.traj.to_system_label( file_name + ".in", file_name, begin=begin, step=step ) assert cs == es, "the step key between files are not consistent" @@ -49,7 +49,7 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], tmp_virial, - ) = dpdata.qe.scf.get_frame(file_name) + ) = dpdata.formats.qe.scf.get_frame(file_name) if tmp_virial is not None: data["virials"] = tmp_virial return data diff --git a/dpdata/plugins/rdkit.py b/dpdata/plugins/rdkit.py index f01b277d6..2b336a165 100644 --- a/dpdata/plugins/rdkit.py +++ b/dpdata/plugins/rdkit.py @@ -1,6 +1,6 @@ from __future__ import annotations -import dpdata.rdkit.utils +import dpdata.formats.rdkit.utils from dpdata.format import Format @@ -31,7 +31,7 @@ def from_bond_order_system(self, file_name, **kwargs): for m in rdkit.Chem.SDMolSupplier(file_name, sanitize=False, removeHs=False) ] if len(mols) > 1: - mol = dpdata.rdkit.utils.combine_molecules(mols) + mol = dpdata.formats.rdkit.utils.combine_molecules(mols) else: mol = mols[0] return mol diff --git a/dpdata/plugins/siesta.py b/dpdata/plugins/siesta.py index 906eeb51f..3404d37c9 100644 --- a/dpdata/plugins/siesta.py +++ b/dpdata/plugins/siesta.py @@ -1,7 +1,7 @@ from __future__ import annotations -import dpdata.siesta.aiMD_output -import dpdata.siesta.output +import 
dpdata.formats.siesta.aiMD_output +import dpdata.formats.siesta.output from dpdata.format import Format @@ -18,7 +18,7 @@ def from_system(self, file_name, **kwargs): _e, _f, _v, - ) = dpdata.siesta.output.obtain_frame(file_name) + ) = dpdata.formats.siesta.output.obtain_frame(file_name) return data def from_labeled_system(self, file_name, **kwargs): @@ -32,7 +32,7 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], data["virials"], - ) = dpdata.siesta.output.obtain_frame(file_name) + ) = dpdata.formats.siesta.output.obtain_frame(file_name) return data @@ -50,7 +50,7 @@ def from_system(self, file_name, **kwargs): _e, _f, _v, - ) = dpdata.siesta.aiMD_output.get_aiMD_frame(file_name) + ) = dpdata.formats.siesta.aiMD_output.get_aiMD_frame(file_name) return data def from_labeled_system(self, file_name, **kwargs): @@ -64,5 +64,5 @@ def from_labeled_system(self, file_name, **kwargs): data["energies"], data["forces"], data["virials"], - ) = dpdata.siesta.aiMD_output.get_aiMD_frame(file_name) + ) = dpdata.formats.siesta.aiMD_output.get_aiMD_frame(file_name) return data diff --git a/dpdata/plugins/vasp.py b/dpdata/plugins/vasp.py index 49d25ecba..84d3142b6 100644 --- a/dpdata/plugins/vasp.py +++ b/dpdata/plugins/vasp.py @@ -4,9 +4,9 @@ import numpy as np -import dpdata.vasp.outcar -import dpdata.vasp.poscar -import dpdata.vasp.xml +import dpdata.formats.vasp.outcar +import dpdata.formats.vasp.poscar +import dpdata.formats.vasp.xml from dpdata.data_type import Axis, DataType from dpdata.format import Format from dpdata.utils import open_file, uniq_atom_names @@ -36,7 +36,7 @@ class VASPPoscarFormat(Format): def from_system(self, file_name: FileType, **kwargs): with open_file(file_name) as fp: lines = [line.rstrip("\n") for line in fp] - data = dpdata.vasp.poscar.to_system_data(lines) + data = dpdata.formats.vasp.poscar.to_system_data(lines) data = uniq_atom_names(data) register_move_data(data) return data @@ -75,7 +75,7 @@ def 
to_system(self, data, frame_idx=0, **kwargs): other parameters """ assert frame_idx < len(data["coords"]) - return dpdata.vasp.poscar.from_system_data(data, frame_idx) + return dpdata.formats.vasp.poscar.from_system_data(data, frame_idx) # rotate the system to lammps convention @@ -97,7 +97,7 @@ def from_labeled_system( data["energies"], tmp_force, tmp_virial, - ) = dpdata.vasp.outcar.get_frames( + ) = dpdata.formats.vasp.outcar.get_frames( file_name, begin=begin, step=step, @@ -136,7 +136,7 @@ def from_labeled_system( data["energies"], data["forces"], tmp_virial, - ) = dpdata.vasp.xml.analyze( + ) = dpdata.formats.vasp.xml.analyze( file_name, type_idx_zero=True, begin=begin, diff --git a/dpdata/plugins/xyz.py b/dpdata/plugins/xyz.py index d005f114f..63aaeabe3 100644 --- a/dpdata/plugins/xyz.py +++ b/dpdata/plugins/xyz.py @@ -10,8 +10,8 @@ if TYPE_CHECKING: from dpdata.utils import FileType -from dpdata.xyz.quip_gap_xyz import QuipGapxyzSystems, format_single_frame -from dpdata.xyz.xyz import coord_to_xyz, xyz_to_coord +from dpdata.formats.xyz.quip_gap_xyz import QuipGapxyzSystems, format_single_frame +from dpdata.formats.xyz.xyz import coord_to_xyz, xyz_to_coord @Format.register("xyz") diff --git a/dpdata/psi4/__init__.py b/dpdata/psi4/__init__.py index e69de29bb..06ef58244 100644 --- a/dpdata/psi4/__init__.py +++ b/dpdata/psi4/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.psi4 import * # noqa: F403 diff --git a/dpdata/psi4/input.py b/dpdata/psi4/input.py index 3959cb753..f151f0a48 100644 --- a/dpdata/psi4/input.py +++ b/dpdata/psi4/input.py @@ -1,62 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import numpy as np - -# Angston is used in Psi4 by default -template = """molecule {{ -{atoms:s} -{charge:d} {multiplicity:d} -}} -set basis {basis:s} -set gradient_write on -G, wfn = gradient("WB97M-D3BJ", return_wfn=True) -wfn.energy() -wfn.gradient().print_out() -""" - - -def 
write_psi4_input( - types: np.ndarray, - coords: np.ndarray, - method: str, - basis: str, - charge: int = 0, - multiplicity: int = 1, -) -> str: - """Write Psi4 input file. - - Parameters - ---------- - types : np.ndarray - atomic symbols - coords : np.ndarray - atomic coordinates - method : str - computational method - basis : str - basis set; see https://psicode.org/psi4manual/master/basissets_tables.html - charge : int, default=0 - charge of system - multiplicity : int, default=1 - multiplicity of system - - Returns - ------- - str - content of Psi4 input file - """ - return template.format( - atoms="\n".join( - [ - "{:s} {:16.9f} {:16.9f} {:16.9f}".format(*ii) - for ii in zip(types, *coords.T) - ] - ), - charge=charge, - multiplicity=multiplicity, - method=method, - basis=basis, - ) +from dpdata.formats.psi4.input import * # noqa: F403 diff --git a/dpdata/psi4/output.py b/dpdata/psi4/output.py index c3594ffb4..66f1e33cf 100644 --- a/dpdata/psi4/output.py +++ b/dpdata/psi4/output.py @@ -1,80 +1,3 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.unit import LengthConversion -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - - -def read_psi4_output(fn: FileType) -> tuple[str, np.ndarray, float, np.ndarray]: - """Read from Psi4 output. - - Note that both the energy and the gradient should be printed. 
- - Parameters - ---------- - fn : str - file name - - Returns - ------- - str - atomic symbols - np.ndarray - atomic coordinates - float - total potential energy - np.ndarray - atomic forces - """ - coord = None - symbols = None - forces = None - energy = None - length_unit = None - with open_file(fn) as f: - flag = 0 - for line in f: - if flag in (1, 3, 4, 5, 6): - flag += 1 - elif flag == 2: - s = line.split() - if not len(s): - flag = 0 - else: - symbols.append(s[0].capitalize()) - coord.append([float(s[1]), float(s[2]), float(s[3])]) - elif flag == 7: - s = line.split() - if not len(s): - flag = 0 - else: - forces.append([float(s[1]), float(s[2]), float(s[3])]) - elif line.startswith( - " Center X Y Z Mass" - ): - # coord - flag = 1 - coord = [] - symbols = [] - elif line.startswith(" Geometry (in "): - # remove ), - length_unit = line.split()[2][:-2].lower() - elif line.startswith(" ## Total Gradient"): - flag = 3 - forces = [] - elif line.startswith(" Total Energy ="): - energy = float(line.split()[-1]) - assert length_unit is not None - length_convert = LengthConversion(length_unit, "angstrom").value() - symbols = np.array(symbols) - forces = -np.array(forces) - coord = np.array(coord) * length_convert - assert coord.shape == forces.shape - - return symbols, coord, energy, forces +from dpdata.formats.psi4.output import * # noqa: F403 diff --git a/dpdata/pwmat/__init__.py b/dpdata/pwmat/__init__.py index e69de29bb..3a8bde615 100644 --- a/dpdata/pwmat/__init__.py +++ b/dpdata/pwmat/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.pwmat import * # noqa: F403 diff --git a/dpdata/pwmat/atomconfig.py b/dpdata/pwmat/atomconfig.py index 5f01c8409..c6b5928d5 100644 --- a/dpdata/pwmat/atomconfig.py +++ b/dpdata/pwmat/atomconfig.py @@ -1,95 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - -from ..periodic_table import ELEMENTS - - -def _to_system_data_lower(lines): - system = {} - natoms = 
int(lines[0].split()[0]) - cell = [] - for idx, ii in enumerate(lines): - if "lattice" in ii or "Lattice" in ii or "LATTICE" in ii: - for kk in range(idx + 1, idx + 1 + 3): - vector = [float(jj) for jj in lines[kk].split()[0:3]] - cell.append(vector) - system["cells"] = np.array([cell]) - coord = [] - atomic_number = [] - atom_numbs = [] - for idx, ii in enumerate(lines): - if "Position" in ii or "POSITION" in ii or "position" in ii: - for kk in range(idx + 1, idx + 1 + natoms): - min = kk - for jj in range(kk + 1, idx + 1 + natoms): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + natoms): - tmpv = [float(jj) for jj in lines[gg].split()[1:4]] - tmpv = np.matmul(np.array(tmpv), system["cells"][0]) - coord.append(tmpv) - tmpn = int(lines[gg].split()[0]) - atomic_number.append(tmpn) - for ii in np.unique(sorted(atomic_number)): - atom_numbs.append(atomic_number.count(ii)) - system["atom_numbs"] = [int(ii) for ii in atom_numbs] - system["coords"] = np.array([coord]) - system["orig"] = np.zeros(3) - atom_types = [] - for idx, ii in enumerate(system["atom_numbs"]): - for jj in range(ii): - atom_types.append(idx) - system["atom_types"] = np.array(atom_types, dtype=int) - system["atom_names"] = [ELEMENTS[ii - 1] for ii in np.unique(sorted(atomic_number))] - return system - - -def to_system_data(lines): - return _to_system_data_lower(lines) - - -def from_system_data(system, f_idx=0, skip_zeros=True): - ret = "" - natoms = sum(system["atom_numbs"]) - ret += "%d" % natoms # noqa: UP031 - ret += "\n" - ret += "LATTICE" - ret += "\n" - for ii in system["cells"][f_idx]: - for jj in ii: - ret += f"{jj:.16e} " - ret += "\n" - ret += "POSITION" - ret += "\n" - atom_numbs = system["atom_numbs"] - atom_names = system["atom_names"] - atype = system["atom_types"] - posis = system["coords"][f_idx] - # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] - # sort_idx = 
np.argsort(atype, kind = 'mergesort') - sort_idx = np.lexsort((np.arange(len(atype)), atype)) - atype = atype[sort_idx] - posis = posis[sort_idx] - symbal = [] - for ii, jj in zip(atom_numbs, atom_names): - for kk in range(ii): - symbal.append(jj) - atomic_numbers = [] - for ii in symbal: - atomic_numbers.append(ELEMENTS.index(ii) + 1) - posi_list = [] - for jj, ii in zip(atomic_numbers, posis): - ii = np.matmul(ii, np.linalg.inv(system["cells"][0])) - posi_list.append("%d %15.10f %15.10f %15.10f 1 1 1" % (jj, ii[0], ii[1], ii[2])) # noqa: UP031 - for kk in range(len(posi_list)): - min = kk - for jj in range(kk, len(posi_list)): - if int(posi_list[jj].split()[0]) < int(posi_list[min].split()[0]): - min = jj - posi_list[min], posi_list[kk] = posi_list[kk], posi_list[min] - posi_list.append("") - ret += "\n".join(posi_list) - return ret +from dpdata.formats.pwmat.atomconfig import * # noqa: F403 diff --git a/dpdata/pwmat/movement.py b/dpdata/pwmat/movement.py index ccfd819db..d20575b48 100644 --- a/dpdata/pwmat/movement.py +++ b/dpdata/pwmat/movement.py @@ -1,208 +1,3 @@ from __future__ import annotations -import warnings - -import numpy as np - -from ..periodic_table import ELEMENTS - - -def system_info(lines, type_idx_zero=False): - atom_names = [] - atom_numbs = [] - nelm = 0 - natoms = int(lines[0].split()[0]) - iteration = float(lines[0].split("Etot")[0].split("=")[1].split(",")[0]) - # print(iteration) - if iteration > 0: - nelm = 40 - else: - nelm = 100 - atomic_number = [] - for idx, ii in enumerate(lines): - if ("Position" in ii) and ("nonperiodic_Position" not in ii): - for kk in range(idx + 1, idx + 1 + natoms): - min = kk - for jj in range(kk + 1, idx + 1 + natoms): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + natoms): - tmpn = int(lines[gg].split()[0]) - atomic_number.append(tmpn) - for ii in np.unique(sorted(atomic_number)): - 
atom_numbs.append(atomic_number.count(ii)) - atom_types = [] - for idx, ii in enumerate(atom_numbs): - for jj in range(ii): - if type_idx_zero: - atom_types.append(idx) - else: - atom_types.append(idx + 1) - for ii in np.unique(sorted(atomic_number)): - atom_names.append(ELEMENTS[ii - 1]) - return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm - - -def get_movement_block(fp): - blk = [] - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if "------------" in ii: - return blk - return blk - - -# we assume that the force is printed ... -def get_frames(fname, begin=0, step=1, convergence_check=True): - fp = open(fname) - blk = get_movement_block(fp) - - atom_names, atom_numbs, atom_types, nelm = system_info(blk, type_idx_zero=True) - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_atomic_energy = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if cc >= begin and (cc - begin) % step == 0: - coord, cell, energy, force, virial, is_converge = analyze_block( - blk, ntot, nelm - ) - if len(coord) == 0: - break - if is_converge or not convergence_check: - all_coords.append(coord) - all_cells.append(cell) - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_movement_block(fp) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - fp.close() - return ( - atom_names, - atom_numbs, - atom_types, - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, ntot, nelm): - coord = [] - cell = [] - energy = None - # atomic_energy = [] - force = [] - virial = None - is_converge = True - sc_index = 0 - for idx, ii in enumerate(lines): - if "Iteration" in ii: - sc_index = int(ii.split("SCF =")[1]) - if sc_index >= nelm: - is_converge = False - energy = float( - ii.split("Etot,Ep,Ek (eV)")[1].split()[2] - ) # use Ep, not Etot=Ep+Ek - elif "----------" in ii: - assert (force is not None) and len(coord) > 0 and len(cell) > 0 - # all_coords.append(coord) - # all_cells.append(cell) - # all_energies.append(energy) - # all_forces.append(force) - # if virial is not None : - # all_virials.append(virial) - return coord, cell, energy, force, virial, is_converge - # elif 'NPT' in ii: - # tmp_v = [] - elif "Lattice vector" in ii: - if "stress" in lines[idx + 1]: - tmp_v = [] - for dd in range(3): - tmp_l = lines[idx + 1 + dd] - cell.append([float(ss) for ss in tmp_l.split()[0:3]]) - tmp_v.append([float(stress) for stress in tmp_l.split()[5:8]]) - virial = np.zeros([3, 3]) - virial[0][0] = tmp_v[0][0] - virial[0][1] = tmp_v[0][1] - virial[0][2] = tmp_v[0][2] - virial[1][0] = tmp_v[1][0] - virial[1][1] = tmp_v[1][1] - virial[1][2] = tmp_v[1][2] - virial[2][0] = tmp_v[2][0] - virial[2][1] = tmp_v[2][1] - virial[2][2] = tmp_v[2][2] - volume = np.linalg.det(np.array(cell)) - virial = virial * 160.2 * 10.0 / volume - else: - for dd in range(3): - tmp_l = lines[idx + 1 + dd] - cell.append([float(ss) for ss in tmp_l.split()[0:3]]) - - # else : - # for dd in range(3) : - # tmp_l = lines[idx+1+dd] - # cell.append([float(ss) - # for ss in 
tmp_l.split()[0:3]]) - # virial = np.zeros([3,3]) - elif ("Position" in ii) and ("nonperiodic_Position" not in ii): - for kk in range(idx + 1, idx + 1 + ntot): - min = kk - for jj in range(kk + 1, idx + 1 + ntot): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + ntot): - info = [float(jj) for jj in lines[gg].split()[1:4]] - info = np.matmul(np.array(info), np.array(cell)) - coord.append(info) - elif "Force" in ii: - for kk in range(idx + 1, idx + 1 + ntot): - min = kk - for jj in range(kk + 1, idx + 1 + ntot): - if int(lines[jj].split()[0]) < int(lines[min].split()[0]): - min = jj - lines[min], lines[kk] = lines[kk], lines[min] - for gg in range(idx + 1, idx + 1 + ntot): - info = [ - -float(ss) for ss in lines[gg].split() - ] # forces in MOVEMENT file are dE/dR, lacking a minus sign - force.append(info[1:4]) - # elif 'Atomic-Energy' in ii: - # for jj in range(idx+1, idx+1+ntot) : - # tmp_l = lines[jj] - # info = [float(ss) for ss in tmp_l.split()] - # atomic_energy.append(info[1]) - return coord, cell, energy, force, virial, is_converge +from dpdata.formats.pwmat.movement import * # noqa: F403 diff --git a/dpdata/pymatgen/__init__.py b/dpdata/pymatgen/__init__.py index e69de29bb..eefbaad82 100644 --- a/dpdata/pymatgen/__init__.py +++ b/dpdata/pymatgen/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.pymatgen import * # noqa: F403 diff --git a/dpdata/pymatgen/molecule.py b/dpdata/pymatgen/molecule.py index 8d397984a..05467c12d 100644 --- a/dpdata/pymatgen/molecule.py +++ b/dpdata/pymatgen/molecule.py @@ -1,30 +1,3 @@ from __future__ import annotations -from collections import Counter - -import numpy as np - - -def to_system_data(file_name, protect_layer=9): - from pymatgen.core import Molecule - - mol = Molecule.from_file(file_name) - elem_mol = list(str(site.species.elements[0]) for site in mol.sites) - elem_counter = 
Counter(elem_mol) - atom_names = list(elem_counter.keys()) - atom_numbs = list(elem_counter.values()) - atom_types = [list(atom_names).index(e) for e in elem_mol] - natoms = np.sum(atom_numbs) - - tmpcoord = np.copy(mol.cart_coords) - - system = {} - system["atom_names"] = atom_names - system["atom_numbs"] = atom_numbs - system["atom_types"] = np.array(atom_types, dtype=int) - # center = [c - h_cell_size for c in mol.center_of_mass] - system["orig"] = np.array([0, 0, 0]) - - system["coords"] = np.array([tmpcoord]) - system["cells"] = np.array([10.0 * np.eye(3)]) - return system +from dpdata.formats.pymatgen.molecule import * # noqa: F403 diff --git a/dpdata/pymatgen/structure.py b/dpdata/pymatgen/structure.py index 1f74dbdd0..08e1cf0e9 100644 --- a/dpdata/pymatgen/structure.py +++ b/dpdata/pymatgen/structure.py @@ -1,30 +1,3 @@ from __future__ import annotations -import numpy as np - - -def from_system_data(structure) -> dict: - """Convert one pymatgen structure to dpdata's datadict.""" - symbols = [ii.specie.symbol for ii in structure] - atom_names = list(structure.symbol_set) - atom_numbs = [symbols.count(symbol) for symbol in atom_names] - atom_types = np.array([atom_names.index(symbol) for symbol in symbols]).astype(int) - coords = structure.cart_coords - cells = structure.lattice.matrix - if all(structure.pbc): - pbc = True - elif not any(structure.pbc): - pbc = False - else: - raise ValueError(f"Partial pbc condition {structure.pbc} is not supported") - - info_dict = { - "atom_names": atom_names, - "atom_numbs": atom_numbs, - "atom_types": atom_types, - "coords": np.array([coords]), - "cells": np.array([cells]), - "orig": np.zeros(3), - "nopbc": not pbc, - } - return info_dict +from dpdata.formats.pymatgen.structure import * # noqa: F403 diff --git a/dpdata/qe/__init__.py b/dpdata/qe/__init__.py index e69de29bb..dc582be29 100644 --- a/dpdata/qe/__init__.py +++ b/dpdata/qe/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from 
dpdata.formats.qe import * # noqa: F403 diff --git a/dpdata/qe/scf.py b/dpdata/qe/scf.py old mode 100755 new mode 100644 index 341261d22..4f461792e --- a/dpdata/qe/scf.py +++ b/dpdata/qe/scf.py @@ -1,188 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import os - -import numpy as np - -from dpdata.utils import open_file - -from .traj import ( - kbar2evperang3, - ry2ev, -) -from .traj import ( - length_convert as bohr2ang, -) - -_QE_BLOCK_KEYWORDS = [ - "ATOMIC_SPECIES", - "ATOMIC_POSITIONS", - "K_POINTS", - "ADDITIONAL_K_POINTS", - "CELL_PARAMETERS", - "CONSTRAINTS", - "OCCUPATIONS", - "ATOMIC_VELOCITIES", - "ATOMIC_FORCES", - "SOLVENTS", - "HUBBARD", -] - - -def get_block(lines, keyword, skip=0): - ret = [] - for idx, ii in enumerate(lines): - if keyword in ii: - blk_idx = idx + 1 + skip - while len(lines[blk_idx].split()) == 0: - blk_idx += 1 - while ( - len(lines[blk_idx].split()) != 0 - and (lines[blk_idx].split()[0] not in _QE_BLOCK_KEYWORDS) - ) and blk_idx != len(lines): - ret.append(lines[blk_idx]) - blk_idx += 1 - break - return ret - - -def get_cell(lines): - ret = [] - for idx, ii in enumerate(lines): - if "ibrav" in ii: - break - blk = lines[idx : idx + 2] - ibrav = int(blk[0].replace(",", "").split("=")[-1]) - if ibrav == 0: - for iline in lines: - if "CELL_PARAMETERS" in iline and "angstrom" not in iline.lower(): - raise RuntimeError( - "CELL_PARAMETERS must be written in Angstrom. Other units are not supported yet." 
- ) - blk = get_block(lines, "CELL_PARAMETERS") - for ii in blk: - ret.append([float(jj) for jj in ii.split()[0:3]]) - ret = np.array(ret) - elif ibrav == 1: - a = None - for iline in lines: - line = iline.replace("=", " ").replace(",", "").split() - if len(line) >= 2 and "a" == line[0]: - # print("line = ", line) - a = float(line[1]) - if len(line) >= 2 and "celldm(1)" == line[0]: - a = float(line[1]) * bohr2ang - # print("a = ", a) - if not a: - raise RuntimeError("parameter 'a' or 'celldm(1)' cannot be found.") - ret = np.array([[a, 0.0, 0.0], [0.0, a, 0.0], [0.0, 0.0, a]]) - else: - raise RuntimeError("ibrav > 1 not supported yet.") - return ret - - -def get_coords(lines, cell): - coord = [] - atom_symbol_list = [] - for iline in lines: - if "ATOMIC_POSITIONS" in iline and ( - "angstrom" not in iline.lower() and "crystal" not in iline.lower() - ): - raise RuntimeError( - "ATOMIC_POSITIONS must be written in Angstrom or crystal. Other units are not supported yet." - ) - if "ATOMIC_POSITIONS" in iline and "angstrom" in iline.lower(): - blk = get_block(lines, "ATOMIC_POSITIONS") - for ii in blk: - coord.append([float(jj) for jj in ii.split()[1:4]]) - atom_symbol_list.append(ii.split()[0]) - coord = np.array(coord) - elif "ATOMIC_POSITIONS" in iline and "crystal" in iline.lower(): - blk = get_block(lines, "ATOMIC_POSITIONS") - for ii in blk: - coord.append([float(jj) for jj in ii.split()[1:4]]) - atom_symbol_list.append(ii.split()[0]) - coord = np.array(coord) - coord = np.matmul(coord, cell) - atom_symbol_list = np.array(atom_symbol_list) - tmp_names, symbol_idx = np.unique(atom_symbol_list, return_index=True) - atom_types = [] - atom_numbs = [] - # preserve the atom_name order - atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")] - for jj in atom_symbol_list: - for idx, ii in enumerate(atom_names): - if jj == ii: - atom_types.append(idx) - for idx in range(len(atom_names)): - atom_numbs.append(atom_types.count(idx)) - atom_types = 
np.array(atom_types) - - return list(atom_names), atom_numbs, atom_types, coord - - -def get_energy(lines): - energy = None - for ii in lines: - if "! total energy" in ii: - energy = ry2ev * float(ii.split("=")[1].split()[0]) - return energy - - -def get_force(lines, natoms): - blk = get_block(lines, "Forces acting on atoms", skip=1) - ret = [] - blk = blk[0 : sum(natoms)] - for ii in blk: - ret.append([float(jj) for jj in ii.split("=")[1].split()]) - ret = np.array(ret) - ret *= ry2ev / bohr2ang - return ret - - -def get_stress(lines): - blk = get_block(lines, "total stress") - if len(blk) == 0: - return None - ret = [] - for ii in blk: - ret.append([float(jj) for jj in ii.split()[3:6]]) - ret = np.array(ret) - ret *= kbar2evperang3 - return ret - - -def get_frame(fname): - if isinstance(fname, str): - path_out = fname - outname = os.path.basename(path_out) - # the name of the input file is assumed to be different from the output by 'in' and 'out' - inname = outname.replace("out", "in") - path_in = os.path.join(os.path.dirname(path_out), inname) - elif isinstance(fname, list) and len(fname) == 2: - path_in = fname[0] - path_out = fname[1] - else: - raise RuntimeError("invalid input") - with open_file(path_out) as fp: - outlines = fp.read().split("\n") - with open_file(path_in) as fp: - inlines = fp.read().split("\n") - cell = get_cell(inlines) - atom_names, natoms, types, coords = get_coords(inlines, cell) - energy = get_energy(outlines) - force = get_force(outlines, natoms) - stress = get_stress(outlines) - if stress is not None: - stress = (stress * np.linalg.det(cell))[np.newaxis, :, :] - return ( - atom_names, - natoms, - types, - cell[np.newaxis, :, :], - coords[np.newaxis, :, :], - np.array(energy)[np.newaxis], - force[np.newaxis, :, :], - stress, - ) +from dpdata.formats.qe.scf import * # noqa: F403 diff --git a/dpdata/qe/traj.py b/dpdata/qe/traj.py index aa12ebb8d..1b4f67754 100644 --- a/dpdata/qe/traj.py +++ b/dpdata/qe/traj.py @@ -1,284 +1,3 @@ 
-#!/usr/bin/python3 from __future__ import annotations -import warnings -from typing import TYPE_CHECKING - -import numpy as np - -from dpdata.utils import open_file - -if TYPE_CHECKING: - from dpdata.utils import FileType - -import os - -from ..unit import ( - EnergyConversion, - ForceConversion, - LengthConversion, - PressureConversion, -) - -ry2ev = EnergyConversion("rydberg", "eV").value() -kbar2evperang3 = PressureConversion("kbar", "eV/angstrom^3").value() -gpa2evperbohr = PressureConversion("GPa", "eV/bohr^3").value() - -length_convert = LengthConversion("bohr", "angstrom").value() -energy_convert = EnergyConversion("hartree", "eV").value() -force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() - - -def load_key(lines, key): - for ii in lines: - if key in ii: - words = ii.split(",") - for jj in words: - if key in jj: - return jj.split("=")[1] - return None - - -def load_block(lines, key, nlines): - for idx, ii in enumerate(lines): - if key in ii: - break - return lines[idx + 1 : idx + 1 + nlines] - - -def convert_celldm(ibrav, celldm): - if ibrav == 1: - return celldm[0] * np.eye(3) - elif ibrav == 2: - return celldm[0] * 0.5 * np.array([[-1, 0, 1], [0, 1, 1], [-1, 1, 0]]) - elif ibrav == 3: - return celldm[0] * 0.5 * np.array([[1, 1, 1], [-1, 1, 1], [-1, -1, 1]]) - elif ibrav == -3: - return celldm[0] * 0.5 * np.array([[-1, 1, 1], [1, -1, 1], [1, 1, -1]]) - else: - warnings.warn( - "unsupported ibrav " - + str(ibrav) - + " if no .cel file, the cell convertion may be wrong. 
" - ) - return np.eye(3) - # raise RuntimeError('unsupported ibrav ' + str(ibrav)) - - -def load_cell_parameters(lines): - blk = load_block(lines, "CELL_PARAMETERS", 3) - ret = [] - for ii in blk: - ret.append([float(jj) for jj in ii.split()[0:3]]) - return np.array(ret) - - -def load_atom_names(lines, ntypes): - blk = load_block(lines, "ATOMIC_SPECIES", ntypes) - return [ii.split()[0] for ii in blk] - - -def load_celldm(lines): - celldm = np.zeros(6) - for ii in range(6): - key = "celldm(%d)" % (ii + 1) # noqa: UP031 - val = load_key(lines, key) - if val is not None: - celldm[ii] = float(val) - return celldm - - -def load_atom_types(lines, natoms, atom_names): - blk = load_block(lines, "ATOMIC_POSITIONS", natoms) - ret = [] - for ii in blk: - ret.append(atom_names.index(ii.split()[0])) - return np.array(ret, dtype=int) - - -def load_param_file(fname: FileType): - with open_file(fname) as fp: - lines = fp.read().split("\n") - natoms = int(load_key(lines, "nat")) - ntypes = int(load_key(lines, "ntyp")) - atom_names = load_atom_names(lines, ntypes) - atom_types = load_atom_types(lines, natoms, atom_names) - atom_numbs = [] - for ii in range(ntypes): - atom_numbs.append(np.sum(atom_types == ii)) - ibrav = int(load_key(lines, "ibrav")) - celldm = load_celldm(lines) - if ibrav == 0: - cell = load_cell_parameters(lines) - else: - cell = convert_celldm(ibrav, celldm) - cell = cell * length_convert - # print(atom_names) - # print(atom_numbs) - # print(atom_types) - # print(cell) - return atom_names, atom_numbs, atom_types, cell - - -def _load_pos_block(fp, natoms): - head = fp.readline() - if not head: - # print('get None') - return None, None - else: - ss = head.split()[0] - blk = [] - for ii in range(natoms): - newline = fp.readline() - if not newline: - return None, None - blk.append([float(jj) for jj in newline.split()]) - return blk, ss - - -def load_data(fname: FileType, natoms, begin=0, step=1, convert=1.0): - coords = [] - steps = [] - cc = 0 - with 
open_file(fname) as fp: - while True: - blk, ss = _load_pos_block(fp, natoms) - if blk is None: - break - else: - if cc >= begin and (cc - begin) % step == 0: - coords.append(blk) - steps.append(ss) - cc += 1 - coords = convert * np.array(coords) - return coords, steps - - -# def load_pos(fname, natoms) : -# coords = [] -# with open_file(fname) as fp: -# while True: -# blk = _load_pos_block(fp, natoms) -# # print(blk) -# if blk == None : -# break -# else : -# coords.append(blk) -# coords= length_convert * np.array(coords) -# return coords - - -def load_energy(fname, begin=0, step=1): - data = np.loadtxt(fname, ndmin=2) - steps = [] - for ii in data[begin::step, 0]: - steps.append("%d" % ii) # noqa: UP031 - with open_file(fname) as fp: - while True: - line = fp.readline() - if not line: - return None - if line.split()[0][0] != "#": - nw = len(line.split()) - break - data = np.reshape(data, [-1, nw]) - return energy_convert * data[begin::step, 5], steps - - -# def load_force(fname, natoms) : -# coords = [] -# with open_file(fname) as fp: -# while True: -# blk = _load_pos_block(fp, natoms) -# # print(blk) -# if blk == None : -# break -# else : -# coords.append(blk) -# coords= force_convert * np.array(coords) -# return coords - - -def to_system_data(input_name, prefix, begin=0, step=1): - data = {} - data["atom_names"], data["atom_numbs"], data["atom_types"], cell = load_param_file( - input_name - ) - data["coords"], csteps = load_data( - prefix + ".pos", - np.sum(data["atom_numbs"]), - begin=begin, - step=step, - convert=length_convert, - ) - data["orig"] = np.zeros(3) - try: - data["cells"], tmp_steps = load_data( - prefix + ".cel", 3, begin=begin, step=step, convert=length_convert - ) - data["cells"] = np.transpose(data["cells"], (0, 2, 1)) - if csteps != tmp_steps: - csteps.append(None) - tmp_steps.append(None) - for int_id in range(len(csteps)): - if csteps[int_id] != tmp_steps[int_id]: - break - step_id = begin + int_id * step - raise RuntimeError( - f"the step 
key between files are not consistent. " - f"The difference locates at step: {step_id}, " - f".pos is {csteps[int_id]}, .cel is {tmp_steps[int_id]}" - ) - except FileNotFoundError: - data["cells"] = np.tile(cell, (data["coords"].shape[0], 1, 1)) - - # handle virial - stress_fname = prefix + ".str" - if os.path.exists(stress_fname): - # 1. Read stress tensor (in GPa) for each structure - stress, vsteps = load_data(stress_fname, 3, begin=begin, step=step, convert=1.0) - if csteps != vsteps: - csteps.append(None) - vsteps.append(None) - for int_id in range(len(csteps)): - if csteps[int_id] != vsteps[int_id]: - break - step_id = begin + int_id * step - raise RuntimeError( - f"the step key between files are not consistent. " - f"The difference locates at step: {step_id}, " - f".pos is {csteps[int_id]}, .str is {vsteps[int_id]}" - ) - # 2. Calculate volume from cell. revert unit to bohr before taking det - volumes = np.linalg.det(data["cells"] / length_convert).reshape(-1) - # 3. Calculate virials for each structure, shape [nf x 3 x 3] - data["virials"] = gpa2evperbohr * volumes[:, None, None] * stress - - return data, csteps - - -def to_system_label(input_name, prefix, begin=0, step=1): - atom_names, atom_numbs, atom_types, cell = load_param_file(input_name) - energy, esteps = load_energy(prefix + ".evp", begin=begin, step=step) - force, fsteps = load_data( - prefix + ".for", - np.sum(atom_numbs), - begin=begin, - step=step, - convert=force_convert, - ) - assert esteps == fsteps, "the step key between files are not consistent " - return energy, force, esteps - - -if __name__ == "__main__": - prefix = "nacl" - atom_names, atom_numbs, atom_types, cell = load_param_file(prefix + ".in") - coords = load_data(prefix + ".pos", np.sum(atom_numbs)) - cells = load_data(prefix + ".cel", 3) - print(atom_names) - print(atom_numbs) - print(atom_types) - print(cells) - print(coords.shape) - print(cells.shape) +from dpdata.formats.qe.traj import * # noqa: F403 diff --git 
a/dpdata/rdkit/__init__.py b/dpdata/rdkit/__init__.py index e69de29bb..b86820923 100644 --- a/dpdata/rdkit/__init__.py +++ b/dpdata/rdkit/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.rdkit import * # noqa: F403 diff --git a/dpdata/rdkit/sanitize.py b/dpdata/rdkit/sanitize.py index 9afc52c9a..781a8742a 100644 --- a/dpdata/rdkit/sanitize.py +++ b/dpdata/rdkit/sanitize.py @@ -1,728 +1,3 @@ from __future__ import annotations -import os -import time -from copy import deepcopy - - -def get_explicit_valence(atom, verbose=False): - exp_val_calculated_from_bonds = int( - sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()]) - ) - try: - try: - from rdkit import Chem - - exp_val = atom.GetValence(Chem.ValenceType.EXPLICIT) - valence_method = "GetValence(Chem.ValenceType.EXPLICIT)" - except (ImportError, AttributeError, TypeError): - exp_val = atom.GetExplicitValence() - valence_method = "GetExplicitValence()" - if exp_val != exp_val_calculated_from_bonds: - if verbose: - print( - f"Explicit valence given by {valence_method} and sum of bond order are inconsistent on {atom.GetSymbol()}{atom.GetIdx() + 1}, using sum of bond order." 
- ) - return exp_val_calculated_from_bonds - except Exception: - return exp_val_calculated_from_bonds - - -def regularize_formal_charges(mol, sanitize=True, verbose=False): - """Regularize formal charges of atoms.""" - from rdkit import Chem - - assert isinstance(mol, Chem.rdchem.Mol) - for atom in mol.GetAtoms(): - assign_formal_charge_for_atom(atom, verbose) - if sanitize: - try: - Chem.SanitizeMol(mol) - return mol - except Exception: - return None - else: - return mol - - -def assign_formal_charge_for_atom(atom, verbose=False): - """Assigen formal charge according to 8-electron rule for element B,C,N,O,S,P,As.""" - from rdkit import Chem - - assert isinstance(atom, Chem.rdchem.Atom) - valence = get_explicit_valence(atom, verbose) - if atom.GetSymbol() == "B": - atom.SetFormalCharge(3 - valence) - elif atom.GetSymbol() == "C": - atom.SetFormalCharge(valence - 4) - if valence == 3: - print( - f"Detect a valence of 3 on #C{atom.GetIdx() + 1}, the formal charge of this atom will be assigned to -1" - ) - elif valence > 4: - raise ValueError(f"#C{atom.GetIdx() + 1} has a valence larger than 4") - elif atom.GetSymbol() == "N": - if valence > 4: - raise ValueError(f"#N{atom.GetIdx() + 1} has a valence larger than 4") - else: - atom.SetFormalCharge(valence - 3) - elif atom.GetSymbol() == "O": - atom.SetFormalCharge(valence - 2) - elif atom.GetSymbol() == "S": - if valence == 1: - atom.SetFormalCharge(-1) - elif valence == 3: - atom.SetFormalCharge(1) - elif valence > 6: - raise ValueError(f"#S{atom.GetIdx() + 1} has a valence larger than 6") - else: - atom.SetFormalCharge(0) - elif atom.GetSymbol() == "P" or atom.GetSymbol() == "As": - if valence == 5: - atom.SetFormalCharge(0) - elif valence > 5: - raise ValueError( - f"#{atom.GetSymbol()}{atom.GetIdx() + 1} has a valence larger than 5" - ) - else: - atom.SetFormalCharge(valence - 3) - - -# print bond and atom information (for debugger) -def print_bonds(mol): - for bond in mol.GetBonds(): - begin_atom = 
bond.GetBeginAtom() - end_atom = bond.GetEndAtom() - print( - f"{begin_atom.GetSymbol()}{begin_atom.GetIdx() + 1} {end_atom.GetSymbol()}{end_atom.GetIdx() + 1} {bond.GetBondType()}" - ) - - -def print_atoms(mol): - for atom in mol.GetAtoms(): - print( - f"{atom.GetSymbol()}{atom.GetIdx() + 1} {atom.GetFormalCharge()} {get_explicit_valence(atom)}" - ) - - -def is_terminal_oxygen(O_atom): - return len(O_atom.GetNeighbors()) == 1 - - -def get_terminal_oxygens(atom): - terminal_oxygens = [] - for nei in atom.GetNeighbors(): - if nei.GetSymbol() == "O" or nei.GetSymbol() == "S": - if is_terminal_oxygen(nei): - terminal_oxygens.append(nei) - return terminal_oxygens - - -def is_terminal_NR2(N_atom): - return len(N_atom.GetNeighbors()) == 3 - - -def get_terminal_NR2s(atom): - terminal_NR2s = [] - for nei in atom.GetNeighbors(): - if nei.GetSymbol() == "N": - if is_terminal_NR2(nei): - terminal_NR2s.append(nei) - terminal_NR2s.sort( - key=lambda N_atom: len( - [atom for atom in N_atom.GetNeighbors() if atom.GetSymbol() == "H"] - ) - ) - return terminal_NR2s - - -def sanitize_phosphate_Patom(P_atom, verbose=True): - from rdkit import Chem - - if P_atom.GetSymbol() == "P": - terminal_oxygens = get_terminal_oxygens(P_atom) - mol = P_atom.GetOwningMol() - if len(terminal_oxygens) > 1: - if verbose: - print("Phospate group detected, sanitizing it...") - # set one P=O and two P-O - bond1 = mol.GetBondBetweenAtoms( - P_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.DOUBLE) - for ii in range(1, len(terminal_oxygens)): - bond = mol.GetBondBetweenAtoms( - P_atom.GetIdx(), terminal_oxygens[ii].GetIdx() - ) - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[ii].SetFormalCharge(-1) - - -def sanitize_phosphate(mol): - for atom in mol.GetAtoms(): - sanitize_phosphate_Patom(atom) - return mol - - -def sanitize_sulfate_Satom(S_atom, verbose=True): - from rdkit import Chem - - if S_atom.GetSymbol() == "S": - terminal_oxygens = 
get_terminal_oxygens(S_atom) - mol = S_atom.GetOwningMol() - if len(terminal_oxygens) == 3: - if verbose: - print("Sulfate group detected, sanitizing it...") - # set one S-O and two S=O - bond1 = mol.GetBondBetweenAtoms( - S_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - for ii in range(1, len(terminal_oxygens)): - bond = mol.GetBondBetweenAtoms( - S_atom.GetIdx(), terminal_oxygens[ii].GetIdx() - ) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - - -def sanitize_sulfate(mol): - for atom in mol.GetAtoms(): - sanitize_sulfate_Satom(atom) - return mol - - -def sanitize_carboxyl_Catom(C_atom, verbose=True): - from rdkit import Chem - - if C_atom.GetSymbol() == "C": - terminal_oxygens = get_terminal_oxygens(C_atom) - mol = C_atom.GetOwningMol() - if len(terminal_oxygens) == 2: - if verbose: - print("Carbonxyl group detected, sanitizing it...") - # set one C-O and one C=O - bond1 = mol.GetBondBetweenAtoms( - C_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms( - C_atom.GetIdx(), terminal_oxygens[1].GetIdx() - ) - bond2.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_oxygens[1].SetFormalCharge(0) - - -def sanitize_carboxyl(mol): - for atom in mol.GetAtoms(): - sanitize_carboxyl_Catom(atom) - return mol - - -def sanitize_guanidine_Catom(C_atom, verbose=True): - from rdkit import Chem - - if C_atom.GetSymbol() == "C": - terminal_NR2s = get_terminal_NR2s(C_atom) - mol = C_atom.GetOwningMol() - if len(terminal_NR2s) == 3: - if verbose: - print("Guanidyl group detected, sanitizing it...") - # set two C-N and one C=N+ - bond1 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[0].GetIdx()) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_NR2s[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), 
terminal_NR2s[1].GetIdx()) - bond2.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_NR2s[1].SetFormalCharge(0) - - bond3 = mol.GetBondBetweenAtoms(C_atom.GetIdx(), terminal_NR2s[2].GetIdx()) - bond3.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_NR2s[2].SetFormalCharge(1) - - -def sanitize_guanidine(mol): - for atom in mol.GetAtoms(): - sanitize_guanidine_Catom(atom) - return mol - - -def sanitize_nitro_Natom(N_atom, verbose=True): - from rdkit import Chem - - if N_atom.GetSymbol() == "N": - terminal_oxygens = get_terminal_oxygens(N_atom) - mol = N_atom.GetOwningMol() - if len(terminal_oxygens) == 2: - if verbose: - print("Nitro group detected, sanitizing it...") - # set one N-O and one N=O - bond1 = mol.GetBondBetweenAtoms( - N_atom.GetIdx(), terminal_oxygens[0].GetIdx() - ) - bond1.SetBondType(Chem.rdchem.BondType.SINGLE) - terminal_oxygens[0].SetFormalCharge(-1) - - bond2 = mol.GetBondBetweenAtoms( - N_atom.GetIdx(), terminal_oxygens[1].GetIdx() - ) - bond2.SetBondType(Chem.rdchem.BondType.DOUBLE) - terminal_oxygens[1].SetFormalCharge(0) - - -def sanitize_nitro(mol): - for atom in mol.GetAtoms(): - sanitize_nitro_Natom(atom) - return mol - - -def is_terminal_nitrogen(N_atom): - if N_atom.GetSymbol() == "N" and len(N_atom.GetNeighbors()) == 1: - return True - else: - return False - - -def sanitize_nitrine_Natom(atom, verbose=True): - from rdkit import Chem - - if atom.GetSymbol() == "N" and len(atom.GetNeighbors()) == 2: - mol = atom.GetOwningMol() - nei1, nei2 = atom.GetNeighbors()[0], atom.GetNeighbors()[1] - if nei1.GetSymbol() == "N" and nei2.GetSymbol() == "N": - if is_terminal_nitrogen(nei1): - N_terminal = nei1 - N_non_terminal = nei2 - elif is_terminal_nitrogen(nei2): - N_terminal = nei2 - N_non_terminal = nei1 - else: - N_terminal = None - N_non_terminal = None - if (N_terminal is not None) and (N_non_terminal is not None): - # set X-N=[N+]=[N-] - if verbose: - print("Detecting nitrine group, fixing it...") - bond = 
mol.GetBondBetweenAtoms(atom.GetIdx(), N_terminal.GetIdx()) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - N_terminal.SetFormalCharge(-1) - - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), N_non_terminal.GetIdx()) - bond.SetBondType(Chem.rdchem.BondType.DOUBLE) - atom.SetFormalCharge(1) - - -def contain_hetero_aromatic(mol): - flag = False - for atom in mol.GetAtoms(): - if atom.GetSymbol() != "C" and atom.GetIsAromatic(): - flag = True - break - return flag - - -# for carbon with explicit valence > 4 -def regularize_carbon_bond_order(atom, verbose=True): - from rdkit import Chem - - if atom.GetSymbol() == "C" and get_explicit_valence(atom) > 4: - if verbose: - print("Detecting carbon with explicit valence > 4, fixing it...") - mol = atom.GetOwningMol() - double_bond_idx = -1 - for nei in atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), nei.GetIdx()) - if bond.GetBondTypeAsDouble() == 2: - double_bond_idx = bond.GetIdx() - break - if double_bond_idx != -1: - for bond in atom.GetBonds(): - if bond.GetIdx() != double_bond_idx: - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - - -# for nitrogen with explicit valence > 4 -def regularize_nitrogen_bond_order(atom, verbose=True): - from rdkit import Chem - - mol = atom.GetOwningMol() - if atom.GetSymbol() == "N" and get_explicit_valence(atom) > 4: - O_atoms = get_terminal_oxygens(atom) - for O_atom in O_atoms: - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), O_atom.GetIdx()) - if bond.GetBondTypeAsDouble() == 2: - bond.SetBondType(Chem.rdchem.BondType.SINGLE) - O_atom.SetFormalCharge(-1) - - -def sanitize_mol(mol, verbose=False): - for atom in mol.GetAtoms(): - sanitize_carboxyl_Catom(atom, verbose) - sanitize_guanidine_Catom(atom, verbose) - sanitize_phosphate_Patom(atom, verbose) - sanitize_sulfate_Satom(atom, verbose) - sanitize_nitro_Natom(atom, verbose) - sanitize_nitrine_Natom(atom, verbose) - regularize_carbon_bond_order(atom, verbose) - regularize_nitrogen_bond_order(atom, verbose) - return 
mol - - -# copy from FEprep -def mol_edit_log(mol, i, j): - if not mol.HasProp("edit"): - mol.SetProp("edit", "%d_%d" % (i, j)) # noqa: UP031 - else: - edited = mol.GetProp("edit") - mol.SetProp("edit", edited + ",%d_%d" % (i, j)) # noqa: UP031 - - -def kekulize_aromatic_heterocycles(mol_in, assign_formal_charge=True, sanitize=True): - from rdkit import Chem - from rdkit.Chem.rdchem import BondType - - mol = Chem.RWMol(mol_in) - rings = Chem.rdmolops.GetSymmSSSR(mol) - rings = [list(i) for i in list(rings)] - rings.sort(key=lambda r: len(r)) - - def search_and_assign_ring( - mol, ring, hetero, start, forward=True, start_switch=True - ): - j = start - switch = start_switch - lring = len(ring) - delta = 1 if forward else -1 - n_edit = 0 - n_double = 0 - while not ((j in hetero) & (not switch)): - btype = BondType.SINGLE if switch else BondType.DOUBLE - bond = mol.GetBondBetweenAtoms(ring[j], ring[(j + delta) % lring]) - if bond.GetBondType() == BondType.AROMATIC: - bond.SetBondType(btype) - mol_edit_log(mol, ring[j], ring[(j + delta) % lring]) - # print(ring[j], ring[(j + delta) % lring], bond.GetBondType()) - if btype == BondType.DOUBLE: - n_double += 1 - n_edit += 1 - else: - break - j = (j + delta) % lring - switch = not switch - return n_edit, n_double - - def print_bondtypes(mol, rings): - for ring in rings: - lring = len(ring) - btype = [] - for i in range(lring): - btype.append( - mol.GetBondBetweenAtoms( - ring[i], ring[(i + 1) % lring] - ).GetBondType() - ) - atoms = [mol.GetAtomWithIdx(i).GetSymbol() for i in ring] - print(ring) - print(atoms) - print(btype) - - def hetero_priority(idx, mol): - atom = mol.GetAtomWithIdx(idx) - sym = atom.GetSymbol() - valence = len(atom.GetBonds()) - - if (sym in ["O", "S"]) & (valence == 2): - return 0 - elif sym in ["N", "P", "As", "B"]: - if valence == 3: - return 1 - elif valence == 2: - return 2 - - # save carbon/hetero aromatic rings - CAr = [] - HAr = [] - for ring in rings: - lring = len(ring) - bAllAr = True - 
bAllC = True - for i in range(lring): - atom = mol.GetAtomWithIdx(ring[i]) - if atom.GetSymbol() != "C": - bAllC = False - - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - if bond.GetBondType() != BondType.AROMATIC: - bAllAr = False - if bAllAr and bAllC: - CAr.append(ring) - elif bAllAr and not bAllC: - HAr.append(ring) - - if len(HAr) == 0: - # no hetrerocycles - return mol_in - else: - # edit heterocycles - for ring in HAr: - lring = len(ring) - cring = len(CAr) - hetero = [] - hasDouble = [] - fuseCAr = [] - fuseDouble = [] - for i in range(lring): - fuseCAr.append(-1) - for j in range(cring): - if ring[i] in CAr[j]: - fuseCAr[i] = j - break - if i > 1: - if (fuseCAr[i] == fuseCAr[i - 1]) & (fuseCAr[i] >= 0): - fuseDouble.append(i) - atom = mol.GetAtomWithIdx(ring[i]) - if atom.GetSymbol() != "C": - hetero.append(i) - atom_bonds = atom.GetBonds() - btype = [bond.GetBondType() for bond in atom_bonds] - # print(btype) - if BondType.DOUBLE in btype: - hasDouble.append(i) - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - - if (fuseCAr[0] == fuseCAr[lring - 1]) & (fuseCAr[0] >= 0): - fuseDouble.append(0) - - if (len(hetero) > 0) | (len(hasDouble) > 0): - n_targetDouble = lring // 2 - n_targetEdit = lring - hetero_prior = {i: hetero_priority(ring[i], mol) for i in hetero} - hetero.sort(key=lambda i: hetero_prior[i]) - for i in hasDouble: - d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True) - d2, e2 = search_and_assign_ring(mol, ring, hetero, i, forward=False) - n_targetDouble -= d1 + d2 + 1 - n_targetEdit -= e1 + e2 - for i in fuseDouble: - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i - 1) % lring]) - if bond.GetBondType() == BondType.AROMATIC: - bond.SetBondType(BondType.DOUBLE) - mol_edit_log(mol, ring[i], ring[(i - 1) % lring]) - d1, e1 = search_and_assign_ring(mol, ring, hetero, i, forward=True) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, (i - 1) % lring, forward=False - ) - n_targetDouble -= d1 + 
d2 + 1 - n_targetEdit -= e1 + e2 + 1 - for i in hetero: - atom = mol.GetAtomWithIdx(ring[i]) - if (hetero_prior[i] == 2) | (n_targetDouble * 2 >= n_targetEdit): - forward_btype = mol.GetBondBetweenAtoms( - ring[i], ring[(i + 1) % lring] - ).GetBondType() - backward_btype = mol.GetBondBetweenAtoms( - ring[i], ring[(i - 1) % lring] - ).GetBondType() - if forward_btype != BondType.AROMATIC: - switch = forward_btype == BondType.DOUBLE - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=switch - ) - d2 = e2 = 0 - elif backward_btype != BondType.AROMATIC: - switch = backward_btype == BondType.DOUBLE - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=switch - ) - d2 = e2 = 0 - else: - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=True - ) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=False - ) - n_targetDouble -= d1 + d2 - n_targetEdit -= e1 + e2 - else: - d1, e1 = search_and_assign_ring( - mol, ring, hetero, i, forward=True, start_switch=True - ) - d2, e2 = search_and_assign_ring( - mol, ring, hetero, i, forward=False, start_switch=True - ) - n_targetDouble -= d1 + d2 - n_targetEdit -= e1 + e2 - - for ring in CAr: - lring = len(ring) - for i in range(lring): - bond = mol.GetBondBetweenAtoms(ring[i], ring[(i + 1) % lring]) - bond.SetBondType(BondType.AROMATIC) - print("Manual kekulization for aromatic heterocycles:") - print_bondtypes(mol, rings) - - atoms = mol.GetAtoms() - for i in range(len(atoms)): - mol.ReplaceAtom(i, Chem.Atom(atoms[i].GetSymbol())) - mol_edited = mol.GetMol() - # charge assignment - if assign_formal_charge: - mol_edited = regularize_formal_charges(mol_edited, sanitize=False) - if not sanitize: - return mol_edited - else: - try: - Chem.SanitizeMol(mol_edited) - return mol_edited - except Exception as e: - raise RuntimeError( - f"Manual kekulization for aromatic heterocycles failed, below are errors:\n\t 
{e}" - ) - - -def convert_by_obabel( - mol, cache_dir=os.path.join(os.getcwd(), ".cache"), obabel_path="obabel" -): - from openbabel import openbabel - from rdkit import Chem - - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - if mol.HasProp("_Name"): - name = mol.GetProp("_Name") - else: - name = f"mol{int(time.time())}" - mol_file_in = os.path.join(cache_dir, f"{name}.mol") - mol_file_out = os.path.join(cache_dir, f"{name}_obabel.mol") - Chem.MolToMolFile(mol, mol_file_in, kekulize=False) - obConversion = openbabel.OBConversion() - obConversion.SetInAndOutFormats("mol", "mol") - mol = openbabel.OBMol() - obConversion.ReadFile(mol, mol_file_in) - obConversion.WriteFile(mol, mol_file_out) - mol_obabel = Chem.MolFromMolFile(mol_file_out, removeHs=False, sanitize=False) - return mol_obabel - - -def super_sanitize_mol(mol, name=None, verbose=True): - from rdkit import Chem - - if name is None: - if mol.HasProp("_Name"): - name = mol.GetProp("_Name") - else: - name = "mol" - try: - if verbose: - print("=====Stage 1: use Hermite procedure=====") - # use our procedure - mol = sanitize_mol(mol, verbose) - mol = regularize_formal_charges(mol, sanitize=False) - mol_copy = deepcopy(mol) - Chem.SanitizeMol(mol_copy) - if verbose: - print(name, "Success.") - return mol_copy - except Exception as e: - try: - if verbose: - print( - "Hermite procedure failed, maybe due to unsupported representation of hetero aromatic rings, re-try with obabel" - ) - print("=====Stage 2: re-try with obabel=====") - mol = convert_by_obabel(mol) - mol = sanitize_mol(mol, verbose) - mol = kekulize_aromatic_heterocycles( - mol, assign_formal_charge=False, sanitize=False - ) # aromatic heterocycles - mol = regularize_formal_charges(mol, sanitize=False) - mol_copy = deepcopy(mol) - Chem.SanitizeMol(mol_copy) - if verbose: - print(name, "Success.") - return mol_copy - except Exception as e: - if verbose: - print(e) - print(name, "Failed!") - return None - - -class Sanitizer: - def 
__init__(self, level="medium", raise_errors=True, verbose=False): - """Set up sanitizer. - --------. - - Parameters - ---------- - level : 'low', 'medium' or 'high'. - `low` - use rdkit.Chem.SanitizeMol() to sanitize - `medium` - before using rdkit, assign formal charges of each atom first, which requires - the rightness of bond order information - `high` - try to regularize bond order of nitro, phosphate, sulfate, nitrine, guanidine, - pyridine-oxide function groups and aromatic heterocycles. If failed, the program - will call obabel to pre-process the mol object and re-try the procedure. - raise_errors : bool, default=True - If True, raise SanitizeError when failed. - verbose : bool, default=False - If True, print error information when failed. - """ - self._check_level(level) - self.level = level - self.raise_errors = raise_errors - self.verbose = verbose - - def _check_level(self, level): - if level not in ["low", "medium", "high"]: - raise ValueError( - f"Invalid level '{level}', please set to 'low', 'medium' or 'high'" - ) - - def _handle_exception(self, error_info): - if self.raise_errors: - raise SanitizeError(error_info) - elif self.verbose: - print(error_info) - - def sanitize(self, mol): - """Sanitize mol according to `self.level`. If failed, return None.""" - from rdkit import Chem - - if self.level == "low": - try: - Chem.SanitizeMol(mol) - return mol - except Exception as e: - error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'medium' or 'high'. The error occurs:\n\t{e}" - self._handle_exception(error_info) - return None - elif self.level == "medium": - try: - mol = regularize_formal_charges(mol, sanitize=False) - Chem.SanitizeMol(mol) - return mol - except Exception as e: - error_info = f"Sanitization Failed, please use more strict sanitizer by setting 'level' to 'high'. 
The error occurs:\n\t{e}" - self._handle_exception(error_info) - return None - elif self.level == "high": - mol = super_sanitize_mol(mol, verbose=self.verbose) - error_info = "Sanitization Failed. Please check your molecule file." - if mol is None: - self._handle_exception(error_info) - return mol - - -class SanitizeError(Exception): - def __init__(self, content="Sanitization Failed."): - self.content = content - - def __str__(self): - return self.content - - def __repr__(self): - return self.__str__() +from dpdata.formats.rdkit.sanitize import * # noqa: F403 diff --git a/dpdata/rdkit/utils.py b/dpdata/rdkit/utils.py index efeef6070..124910271 100644 --- a/dpdata/rdkit/utils.py +++ b/dpdata/rdkit/utils.py @@ -1,131 +1,3 @@ from __future__ import annotations -import numpy as np - - -def mol_to_system_data(mol): - from rdkit import Chem - - if not isinstance(mol, Chem.rdchem.Mol): - raise TypeError(f"rdkit.Chem.Mol required, not {type(mol)}") - - num_confs = mol.GetNumConformers() - if num_confs: - atom_symbols = [at.GetSymbol() for at in mol.GetAtoms()] - atom_names, atom_types, atom_numbs = np.unique( - atom_symbols, return_inverse=True, return_counts=True - ) - coords = np.array([conf.GetPositions() for conf in mol.GetConformers()]) - bonds = np.array( - [ - [ - bond.GetBeginAtomIdx(), - bond.GetEndAtomIdx(), - bond.GetBondTypeAsDouble(), - ] - for bond in mol.GetBonds() - ] - ) - formal_charges = np.array( - [at.GetFormalCharge() for at in mol.GetAtoms()], dtype=np.int32 - ) - data = {} - data["atom_numbs"] = list(atom_numbs) - data["atom_names"] = list(atom_names) - data["atom_types"] = atom_types - data["cells"] = np.array( - [ - [[100.0, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]] - for _ in range(num_confs) - ] - ) - data["coords"] = coords - data["bonds"] = bonds - data["formal_charges"] = formal_charges - data["orig"] = np.array([0.0, 0.0, 0.0]) - # other properties - if mol.HasProp("_Name"): - data["_name"] = mol.GetProp("_Name") - return data - else: 
- raise ValueError("The moleclue does not contain 3-D conformers") - - -def system_data_to_mol(data): - from rdkit import Chem - - mol_ed = Chem.RWMol() - atom_symbols = [data["atom_names"][i] for i in data["atom_types"]] - # add atoms - for atom_type in data["atom_types"]: - symbol = data["atom_names"][atom_type] - atom = Chem.Atom(symbol) - mol_ed.AddAtom(atom) - # add bonds - for bond_info in data["bonds"]: - if bond_info[2] == 1: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.SINGLE) - elif bond_info[2] == 2: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.DOUBLE) - elif bond_info[2] == 3: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.TRIPLE) - elif bond_info[2] == 1.5: - mol_ed.AddBond(int(bond_info[0]), int(bond_info[1]), Chem.BondType.AROMATIC) - # set conformers - for frame_idx in range(data["coords"].shape[0]): - conf = Chem.rdchem.Conformer(len(data["atom_types"])) - for atom_idx in range(len(data["atom_types"])): - conf.SetAtomPosition(atom_idx, data["coords"][frame_idx][atom_idx]) - mol_ed.AddConformer(conf, assignId=True) - mol = mol_ed.GetMol() - # set formal charges - for idx, atom in enumerate(mol.GetAtoms()): - atom.SetFormalCharge(int(data["formal_charges"][idx])) - # set mol name - if "_name" in list(data.keys()): - mol.SetProp("_Name", data["_name"]) - # sanitize - Chem.SanitizeMol(mol_ed) - return mol - - -def check_same_atom(atom_1, atom_2): - if atom_1.GetIdx() != atom_2.GetIdx(): - return False - elif atom_1.GetSymbol() != atom_2.GetSymbol(): - return False - else: - return True - - -def check_same_molecule(mol_1, mol_2): - flag = True - for bond_1, bond_2 in zip(mol_1.GetBonds(), mol_2.GetBonds()): - begin_atom_1, end_atom_1 = bond_1.GetBeginAtom(), bond_1.GetEndAtom() - begin_atom_2, end_atom_2 = bond_2.GetBeginAtom(), bond_2.GetEndAtom() - if not check_same_atom(begin_atom_1, begin_atom_2): - flag = False - break - elif not check_same_atom(end_atom_1, end_atom_2): - flag = 
False - break - return flag - - -def check_molecule_list(mols): - flag = True - for mol in mols[1:]: - if not check_same_molecule(mol, mols[0]): - flag = False - break - return flag - - -def combine_molecules(mols): - if check_molecule_list(mols): - for mol in mols[1:]: - for conf in mol.GetConformers(): - mols[0].AddConformer(conf, assignId=True) - return mols[0] - else: - raise ValueError("molecules are not of the same topology.") +from dpdata.formats.rdkit.utils import * # noqa: F403 diff --git a/dpdata/siesta/__init__.py b/dpdata/siesta/__init__.py index e69de29bb..0210250ed 100644 --- a/dpdata/siesta/__init__.py +++ b/dpdata/siesta/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.siesta import * # noqa: F403 diff --git a/dpdata/siesta/aiMD_output.py b/dpdata/siesta/aiMD_output.py index daa4f6a25..04a154aea 100644 --- a/dpdata/siesta/aiMD_output.py +++ b/dpdata/siesta/aiMD_output.py @@ -1,187 +1,3 @@ -# !/usr/bin/python3 from __future__ import annotations -import numpy as np - -ev2ev = 1 -ang2ang = 1 - - -#############################read output##################################### -def get_single_line_tail(fin, keyword, num=1): - file = open(fin) - part_res = [] - for value in file: - if keyword in value: - temp = len(value.split()) - num - part_res.append(float(value.split()[temp])) - - file.close() - return part_res - - -## atomnum: number of atoms, row numbers -## begin_column: begin column num -## read_column_num: read column num -## column_num: the column number in nxet reading line -def extract_keyword( - fout, - keyword, - down_line_num, - begin_column, - read_column_num, - is_repeated_read, - column_num, -): - file = open(fout) - ret = [] - part_ret = [] - flag = 0 - idx = 0 - extr_frame = 0 - length = obtain_nframe(fout) - # for (num,value) in enumerate(file): - for value in file: - if keyword in value: - flag = 1 - continue - if flag == 1: - if idx < down_line_num: - idx += 1 - else: - flag = 0 - 
part_ret.append(np.array(ret)) - ret = [] - extr_frame += 1 - if extr_frame == length: - file.close() - return part_ret - ## is_repeated_read = 0: only read 1 time for SCF - ## is_repeated_read = 1: read all for aiMD --> get all frames - if is_repeated_read: - idx = 0 - continue - - for i in range(begin_column, read_column_num): - if len(value.split()) == column_num: - if not value.split()[i].isalpha(): - ret.append(float(value.strip().split()[i])) - else: - ret.append(value.strip().split()[i]) - continue - file.close() - return part_ret - - -def obtain_nframe(fname): - fp = open(fname) - flag = False - idx = 0 - temp = 0 - for ii in fp: - if "siesta: Stress tensor (static) (eV/Ang**3):" in ii: - flag = True - continue - if flag: - if "siesta: Pressure (static):" not in ii: - if len(ii.split()) == 3: - temp += 1 - if temp == 3: - idx += 1 - # print(idx) - flag = False - temp = 0 - fp.close() - return idx - - -def get_atom_types(fout, atomnums): - covert_type = extract_keyword( - fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4, 0, 6 - )[0] - atomtype = [] - # print(covert_type) - for i in range(0, len(covert_type)): - atomtype.append(int(covert_type[i]) - 1) - return atomtype - - -def get_atom_name(fout): - file = open(fout) - ret = [] - for value in file: - if "Species number:" in value: - for j in range(len(value.split())): - if value.split()[j] == "Label:": - ret.append(value.split()[j + 1]) - break - file.close() - return ret - - -def get_atom_numbs(atomtypes): - atom_numbs = [] - for i in set(atomtypes): - atom_numbs.append(atomtypes.count(i)) - return atom_numbs - - -def get_virial(fout, cell): - viri = extract_keyword( - fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 0, 3, 1, 3 - ) - vols = [] - length = obtain_nframe(fout) - for ii in range(length): - vols.append(np.linalg.det(cell[ii].reshape([3, 3]))) - for jj in range(len(viri[ii])): - ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa - # ii *= vols[idx] * 1e3 / 1.602176621e6 
* (1.602176621e3) - viri[ii][jj] *= vols[ii] - return viri - - -def covert_dimension(arr, num): - arr = np.array(arr) - frames = len(arr) - ret = np.zeros((frames, num, 3)) - for i in range(frames): - ret[i] = arr[i].reshape(num, 3) - return ret - - -def get_aiMD_frame(fname): - NumberOfSpecies = int( - get_single_line_tail(fname, "redata: Number of Atomic Species")[0] - ) - atom_names = get_atom_name(fname) - tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) - - atom_types = get_atom_types(fname, tot_natoms) - atom_numbs = get_atom_numbs(atom_types) - assert max(atom_types) + 1 == NumberOfSpecies - - cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3, 1, 3) - coord = extract_keyword( - fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3, 1, 6 - ) - energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") - force = extract_keyword( - fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4, 1, 4 - ) - virial = get_virial(fname, cell) - - cells = covert_dimension(np.array(cell), 3) - coords = covert_dimension(np.array(coord), tot_natoms) - forces = covert_dimension(np.array(force), tot_natoms) - virials = covert_dimension(np.array(virial), 3) - return ( - atom_names, - atom_numbs, - np.array(atom_types), - cells, - coords, - np.array(energy), - forces, - virials, - ) +from dpdata.formats.siesta.aiMD_output import * # noqa: F403 diff --git a/dpdata/siesta/output.py b/dpdata/siesta/output.py index 0c944d5b5..eb7e383b8 100644 --- a/dpdata/siesta/output.py +++ b/dpdata/siesta/output.py @@ -1,142 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - -ev2ev = 1 -ang2ang = 1 - - -#############################read output##################################### -def get_single_line_tail(fin, keyword, num=1): - file = open(fin) - res = [] - for value in file: - if keyword in value: - temp = len(value.split()) - num - res.append(float(value.split()[temp])) - file.close() - return res - 
return res - - -## atomnum: number of atoms, row numbers -## begin_column: begin column num -## column_num: read column num -def extract_keyword(fout, keyword, down_line_num, begin_column, column_num): - file = open(fout) - ret = [] - flag = 0 - idx = 0 - # for (num,value) in enumerate(file): - for value in file: - if keyword in value: - flag = 1 - continue - if flag == 1: - if idx < down_line_num: - idx += 1 - else: - flag = 0 - continue - if len(value.split()) >= column_num: - for i in range(begin_column, column_num): - if not value.split()[i].isalpha(): - ret.append(float(value.strip().split()[i])) - else: - ret.append(value.strip().split()[i]) - ## compatible siesta-4.0.2 and siesta-4.1-b4 - else: - flag = 0 - idx = 0 - file.close() - return ret - - -def get_atom_types(fout, atomnums): - covert_type = extract_keyword( - fout, "outcoor: Atomic coordinates (Ang):", atomnums, 3, 4 - ) - atomtype = [] - for i in range(0, len(covert_type)): - atomtype.append(int(covert_type[i]) - 1) - return atomtype - - -def get_atom_name(fout): - file = open(fout) - ret = [] - for value in file: - if "Species number:" in value: - for j in range(len(value.split())): - if value.split()[j] == "Label:": - ret.append(value.split()[j + 1]) - break - file.close() - return ret - - -def get_atom_numbs(atomtypes): - atom_numbs = [] - for i in set(atomtypes): - atom_numbs.append(atomtypes.count(i)) - return atom_numbs - - -def get_virial(fout, cells): - vols = [] - for ii in cells: - ### calucate vol - vols.append(np.linalg.det(ii.reshape([3, 3]))) - ret = extract_keyword(fout, "siesta: Stress tensor (static) (eV/Ang**3):", 3, 1, 4) - ret = np.array([ret]) - for idx, ii in enumerate(ret): - ## siesta: 1eV/A^3= 1.60217*10^11 Pa , ---> qe: kBar=10^8Pa - # ii *= vols[idx] * 1e3 / 1.602176621e6 * (1.602176621e3) - ii *= vols[idx] - return ret - - -def obtain_frame(fname): - NumberOfSpecies = int( - get_single_line_tail(fname, "redata: Number of Atomic Species")[0] - ) - atom_names = 
get_atom_name(fname) - tot_natoms = int(get_single_line_tail(fname, "Number of atoms", 3)[0]) - atom_types = get_atom_types(fname, tot_natoms) - atom_numbs = get_atom_numbs(atom_types) - assert max(atom_types) + 1 == NumberOfSpecies - cell = extract_keyword(fname, "outcell: Unit cell vectors (Ang):", 3, 0, 3) - coord = extract_keyword( - fname, "outcoor: Atomic coordinates (Ang):", tot_natoms, 0, 3 - ) - energy = get_single_line_tail(fname, "siesta: E_KS(eV) =") - force = extract_keyword(fname, "siesta: Atomic forces (eV/Ang):", tot_natoms, 1, 4) - virial = get_virial(fname, np.array([cell])) - - cell = np.array(cell).reshape(3, 3) - coord = np.array(coord).reshape(tot_natoms, 3) - force = np.array(force).reshape(tot_natoms, 3) - virial = np.array(virial).reshape(3, 3) - - # data = {} - # data['orig'] = np.array([0, 0, 0]) - # data['atom_names'] = atom_names - # data['atom_numbs'] = atom_numbs - # data['atom_types'] = np.array(atom_types) - # data['cells'] = np.array([cell]) - # data['coords'] = np.array([coord]) - # data['energies'] = np.array([energy]) - # data['forces'] = np.array([force]) - # data['virials'] = virial - # return data - return ( - atom_names, - atom_numbs, - np.array(atom_types), - np.array([cell]), - np.array([coord]), - np.array(energy), - np.array([force]), - np.array([virial]), - ) +from dpdata.formats.siesta.output import * # noqa: F403 diff --git a/dpdata/system.py b/dpdata/system.py index 6023891ff..4150abc89 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -22,10 +22,10 @@ # ensure all plugins are loaded! 
import dpdata.plugins import dpdata.plugins.deepmd -from dpdata.amber.mask import load_param_file, pick_by_amber_mask from dpdata.data_type import Axis, DataError, DataType, get_data_types from dpdata.driver import Driver, Minimizer from dpdata.format import Format +from dpdata.formats.amber.mask import load_param_file, pick_by_amber_mask from dpdata.plugin import Plugin from dpdata.utils import ( add_atom_names, diff --git a/dpdata/vasp/__init__.py b/dpdata/vasp/__init__.py index e69de29bb..82488a465 100644 --- a/dpdata/vasp/__init__.py +++ b/dpdata/vasp/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.vasp import * # noqa: F403 diff --git a/dpdata/vasp/outcar.py b/dpdata/vasp/outcar.py index a16fd6f9f..e5f0b7c71 100644 --- a/dpdata/vasp/outcar.py +++ b/dpdata/vasp/outcar.py @@ -1,275 +1,3 @@ from __future__ import annotations -import re -import warnings - -import numpy as np - - -def atom_name_from_potcar_string(instr: str) -> str: - """Get atom name from a potcar element name. - - e.g. Sn_d -> Sn - - Parameters - ---------- - instr : str - input potcar elemenet name - - Returns - ------- - name: str - name of atoms - """ - if "_" in instr: - # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 - return instr.split("_")[0] - else: - return instr - - -def system_info( - lines: list[str], - type_idx_zero: bool = False, -) -> tuple[list[str], list[int], np.ndarray, int | None, int | None]: - """Get system information from lines of an OUTCAR file. - - Parameters - ---------- - lines : list[str] - the lines of the OUTCAR file - type_idx_zero : bool - if true atom types starts from 0 otherwise from 1. - - Returns - ------- - atom_names: list[str] - name of atoms - atom_numbs: list[int] - number of atoms that have a certain name. 
same length as atom_names - atom_types: np.ndarray - type of each atom, the array has same lenght as number of atoms - nelm: optional[int] - the value of NELM parameter - nwrite: optional[int] - the value of NWRITE parameter - """ - atom_names = [] - atom_names_potcar = [] - atom_numbs = None - nelm = None - nwrite = None - for ii in lines: - if "TITEL" in ii: - # get atom names from POTCAR info, tested only for PAW_PBE ... - # for case like : TITEL = PAW_PBE Sn_d 06Sep2000 - _ii = ii.split()[3] - atom_names.append(atom_name_from_potcar_string(_ii)) - elif "POTCAR:" in ii: - # get atom names from POTCAR info, tested only for PAW_PBE ... - # for case like : POTCAR: PAW_PBE Ti 08Apr2002 - _ii = ii.split()[2] - atom_names_potcar.append(atom_name_from_potcar_string(_ii)) - # a stricker check for "NELM"; compatible with distingct formats in different versions(6 and older, newers_expect-to-work) of vasp - elif nelm is None: - m = re.search(r"NELM\s*=\s*(\d+)", ii) - if m: - nelm = int(m.group(1)) - elif nwrite is None: - m = re.search(r"NWRITE\s*=\s*(\d+)", ii) - if m: - nwrite = int(m.group(1)) - if "ions per type" in ii: - atom_numbs_ = [int(s) for s in ii.split()[4:]] - if atom_numbs is None: - atom_numbs = atom_numbs_ - else: - assert atom_numbs == atom_numbs_, "in consistent numb atoms in OUTCAR" - if len(atom_names) == 0: - # try to use atom_names_potcar - if len(atom_names_potcar) == 0: - raise ValueError("cannot get atom names from potcar") - nnames = len(atom_names_potcar) - # the names are repeated. 
check if it is the case - assert atom_names_potcar[: nnames // 2] == atom_names_potcar[nnames // 2 :] - atom_names = atom_names_potcar[: nnames // 2] - assert nelm is not None, "cannot find maximum steps for each SC iteration" - assert atom_numbs is not None, "cannot find ion type info in OUTCAR" - if len(atom_numbs) != len(atom_names): - raise RuntimeError( - f"The number of the atom numbers per each type ({len(atom_numbs)}) " - f"does not match that of the atom types ({len(atom_names)}) detected " - f"from the OUTCAR. This issue may be cause by a bug in vasp <= 6.3. " - f"Please try to convert data from vasprun.xml instead." - ) - atom_names = atom_names[: len(atom_numbs)] - atom_types = [] - for idx, ii in enumerate(atom_numbs): - for jj in range(ii): - if type_idx_zero: - atom_types.append(idx) - else: - atom_types.append(idx + 1) - return atom_names, atom_numbs, np.array(atom_types, dtype=int), nelm, nwrite - - -def get_outcar_block(fp, ml=False): - blk = [] - energy_token = ["free energy TOTEN", "free energy ML TOTEN"] - ml_index = int(ml) - for ii in fp: - if not ii: - return blk - blk.append(ii.rstrip("\n")) - if energy_token[ml_index] in ii: - return blk - return blk - - -def check_outputs(coord, cell, force): - if len(force) == 0: - raise ValueError("cannot find forces in OUTCAR block") - if len(coord) == 0: - raise ValueError("cannot find coordinates in OUTCAR block") - if len(cell) == 0: - raise ValueError("cannot find cell in OUTCAR block") - return True - - -# we assume that the force is printed ... 
-def get_frames(fname, begin=0, step=1, ml=False, convergence_check=True): - with open(fname) as fp: - return _get_frames_lower( - fp, - fname, - begin=begin, - step=step, - ml=ml, - convergence_check=convergence_check, - ) - - -def _get_frames_lower(fp, fname, begin=0, step=1, ml=False, convergence_check=True): - blk = get_outcar_block(fp) - - atom_names, atom_numbs, atom_types, nelm, nwrite = system_info( - blk, type_idx_zero=True - ) - ntot = sum(atom_numbs) - - all_coords = [] - all_cells = [] - all_energies = [] - all_forces = [] - all_virials = [] - - cc = 0 - rec_failed = [] - while len(blk) > 0: - if cc >= begin and (cc - begin) % step == 0: - coord, cell, energy, force, virial, is_converge = analyze_block( - blk, ntot, nelm, ml - ) - if energy is None: - break - if nwrite == 0: - has_label = len(force) > 0 and len(coord) > 0 and len(cell) > 0 - if not has_label: - warnings.warn("cannot find labels in the frame, ingore") - else: - has_label = check_outputs(coord, cell, force) - if (is_converge or not convergence_check) and has_label: - all_coords.append(coord) - all_cells.append(cell) - all_energies.append(energy) - all_forces.append(force) - if virial is not None: - all_virials.append(virial) - if not is_converge: - rec_failed.append(cc + 1) - - blk = get_outcar_block(fp, ml) - cc += 1 - - if len(rec_failed) > 0: - prt = ( - "so they are not collected." - if convergence_check - else "but they are still collected due to the requirement for ignoring convergence checks." 
- ) - warnings.warn( - f"The following structures were unconverged: {rec_failed}; " + prt - ) - - if len(all_virials) == 0: - all_virials = None - else: - all_virials = np.array(all_virials) - return ( - atom_names, - atom_numbs, - atom_types, - np.array(all_cells), - np.array(all_coords), - np.array(all_energies), - np.array(all_forces), - all_virials, - ) - - -def analyze_block(lines, ntot, nelm, ml=False): - coord = [] - cell = [] - energy = None - force = [] - virial = None - is_converge = True - sc_index = 0 - # select different searching tokens based on the ml label - energy_token = ["free energy TOTEN", "free energy ML TOTEN"] - energy_index = [4, 5] - virial_token = ["FORCE on cell =-STRESS in cart. coord. units", "ML FORCE"] - virial_index = [14, 4] - cell_token = ["VOLUME and BASIS", "ML FORCE"] - cell_index = [5, 12] - ml_index = int(ml) - for idx, ii in enumerate(lines): - # if set ml == True, is_converged will always be True - if ("Iteration" in ii) and (not ml): - sc_index = int(ii.split()[3][:-1]) - if sc_index >= nelm: - is_converge = False - elif energy_token[ml_index] in ii: - energy = float(ii.split()[energy_index[ml_index]]) - return coord, cell, energy, force, virial, is_converge - elif cell_token[ml_index] in ii: - for dd in range(3): - tmp_l = lines[idx + cell_index[ml_index] + dd] - cell.append([float(ss) for ss in tmp_l.replace("-", " -").split()[0:3]]) - elif virial_token[ml_index] in ii: - in_kB_index = virial_index[ml_index] - while idx + in_kB_index < len(lines) and ( - not lines[idx + in_kB_index].split()[0:2] == ["in", "kB"] - ): - in_kB_index += 1 - assert idx + in_kB_index < len(lines), ( - 'ERROR: "in kB" is not found in OUTCAR. Unable to extract virial.' 
- ) - tmp_v = [float(ss) for ss in lines[idx + in_kB_index].split()[2:8]] - virial = np.zeros([3, 3]) - virial[0][0] = tmp_v[0] - virial[1][1] = tmp_v[1] - virial[2][2] = tmp_v[2] - virial[0][1] = tmp_v[3] - virial[1][0] = tmp_v[3] - virial[1][2] = tmp_v[4] - virial[2][1] = tmp_v[4] - virial[0][2] = tmp_v[5] - virial[2][0] = tmp_v[5] - elif "TOTAL-FORCE" in ii and (("ML" in ii) == ml): - for jj in range(idx + 2, idx + 2 + ntot): - tmp_l = lines[jj] - info = [float(ss) for ss in tmp_l.split()] - coord.append(info[:3]) - force.append(info[3:6]) - return coord, cell, energy, force, virial, is_converge +from dpdata.formats.vasp.outcar import * # noqa: F403 diff --git a/dpdata/vasp/poscar.py b/dpdata/vasp/poscar.py index 78b8dbbeb..c207dd2ae 100644 --- a/dpdata/vasp/poscar.py +++ b/dpdata/vasp/poscar.py @@ -1,134 +1,3 @@ -#!/usr/bin/python3 from __future__ import annotations -import numpy as np - - -def _to_system_data_lower(lines, cartesian=True, selective_dynamics=False): - def move_flag_mapper(flag): - if flag == "T": - return True - elif flag == "F": - return False - else: - raise RuntimeError(f"Invalid move flag: {flag}") - - """Treat as cartesian poscar.""" - system = {} - system["atom_names"] = [str(ii) for ii in lines[5].split()] - system["atom_numbs"] = [int(ii) for ii in lines[6].split()] - scale = float(lines[1]) - cell = [] - move_flags = [] - for ii in range(2, 5): - boxv = [float(jj) for jj in lines[ii].split()] - boxv = np.array(boxv) * scale - cell.append(boxv) - system["cells"] = [np.array(cell)] - natoms = sum(system["atom_numbs"]) - coord = [] - for ii in range(8, 8 + natoms): - tmp = lines[ii].split() - tmpv = [float(jj) for jj in tmp[:3]] - if cartesian: - tmpv = np.array(tmpv) * scale - else: - tmpv = np.matmul(np.array(tmpv), system["cells"][0]) - coord.append(tmpv) - if selective_dynamics: - if len(tmp) == 6: - move_flags.append(list(map(move_flag_mapper, tmp[3:]))) - else: - raise RuntimeError( - f"Invalid move flags, should be 6 columns, got 
{tmp}" - ) - - system["coords"] = [np.array(coord)] - system["orig"] = np.zeros(3) - atom_types = [] - for idx, ii in enumerate(system["atom_numbs"]): - for jj in range(ii): - atom_types.append(idx) - system["atom_types"] = np.array(atom_types, dtype=int) - system["cells"] = np.array(system["cells"]) - system["coords"] = np.array(system["coords"]) - if move_flags: - move_flags = np.array(move_flags, dtype=bool) - move_flags = move_flags.reshape((1, natoms, 3)) - system["move"] = np.array(move_flags, dtype=bool) - return system - - -def to_system_data(lines): - # remove the line that has 'selective dynamics' - selective_dynamics = False - if lines[7][0] == "S" or lines[7][0] == "s": - selective_dynamics = True - lines.pop(7) - is_cartesian = lines[7][0] in ["C", "c", "K", "k"] - if not is_cartesian: - if lines[7][0] not in ["d", "D"]: - raise RuntimeError( - "seem not to be a valid POSCAR of vasp 5.x, may be a POSCAR of vasp 4.x?" - ) - return _to_system_data_lower(lines, is_cartesian, selective_dynamics) - - -def from_system_data(system, f_idx=0, skip_zeros=True): - ret = "" - for ii, name in zip(system["atom_numbs"], system["atom_names"]): - if ii == 0: - continue - ret += "%s%d " % (name, ii) # noqa: UP031 - ret += "\n" - ret += "1.0\n" - for ii in system["cells"][f_idx]: - for jj in ii: - ret += f"{jj:.16e} " - ret += "\n" - for idx, ii in enumerate(system["atom_names"]): - if system["atom_numbs"][idx] == 0: - continue - ret += f"{ii} " - ret += "\n" - for ii in system["atom_numbs"]: - if ii == 0: - continue - ret += "%d " % ii # noqa: UP031 - ret += "\n" - move = system.get("move", None) - if move is not None and len(move) > 0: - ret += "Selective Dynamics\n" - - # should use Cartesian for VESTA software - ret += "Cartesian\n" - atype = system["atom_types"] - posis = system["coords"][f_idx] - # atype_idx = [[idx,tt] for idx,tt in enumerate(atype)] - # sort_idx = np.argsort(atype, kind = 'mergesort') - sort_idx = np.lexsort((np.arange(len(atype)), atype)) - 
atype = atype[sort_idx] - posis = posis[sort_idx] - if move is not None and len(move) > 0: - move = move[f_idx][sort_idx] - - if isinstance(move, np.ndarray): - move = move.tolist() - - posi_list = [] - for idx in range(len(posis)): - ii_posi = posis[idx] - line = f"{ii_posi[0]:15.10f} {ii_posi[1]:15.10f} {ii_posi[2]:15.10f}" - if move is not None and len(move) > 0: - move_flags = move[idx] - if not isinstance(move_flags, list) or len(move_flags) != 3: - raise RuntimeError( - f"Invalid move flags: {move_flags}, should be a list of 3 bools" - ) - line += " " + " ".join("T" if flag else "F" for flag in move_flags) - - posi_list.append(line) - - posi_list.append("") - ret += "\n".join(posi_list) - return ret +from dpdata.formats.vasp.poscar import * # noqa: F403 diff --git a/dpdata/vasp/xml.py b/dpdata/vasp/xml.py old mode 100755 new mode 100644 index 1b407c254..808ea7adb --- a/dpdata/vasp/xml.py +++ b/dpdata/vasp/xml.py @@ -1,176 +1,3 @@ -#!/usr/bin/env python3 from __future__ import annotations -import xml.etree.ElementTree as ET -from typing import Any - -import numpy as np - - -def check_name(item, name): - assert item.attrib["name"] == name, ( - "item attrib '{}' dose not math required '{}'".format(item.attrib["name"], name) - ) - - -def get_varray(varray): - array = [] - for vv in varray.findall("v"): - array.append([float(ii) for ii in vv.text.split()]) - return np.array(array) - - -def analyze_atominfo(atominfo_xml): - check_name(atominfo_xml.find("array"), "atoms") - eles = [] - types = [] - visited = set() - for ii in atominfo_xml.find("array").find("set"): - atom_type = int(ii.findall("c")[1].text) - if atom_type not in visited: - eles.append(ii.findall("c")[0].text.strip()) - visited.add(atom_type) - types.append(atom_type) - return eles, types - - -def analyze_calculation( - cc: Any, - nelm: int | None, -) -> tuple[np.ndarray, np.ndarray, float, np.ndarray, np.ndarray | None, bool | None]: - """Analyze a calculation block. 
- - Parameters - ---------- - cc : xml.etree.ElementTree.Element - The xml element for a ion step calculation - nelm : Optional[int] - The number nelm, if it is not None, convergence check is performed. - - Returns - ------- - posi : np.ndarray - The positions - cell : np.ndarray - The cell - ener : float - The energy - force : np.ndarray - The forces - str : Optional[np.ndarray] - The stress - is_converged: Optional[bool] - If the scf calculation is converged. Only return boolean when - nelm is not None. Otherwise return None. - - """ - structure_xml = cc.find("structure") - check_name(structure_xml.find("crystal").find("varray"), "basis") - check_name(structure_xml.find("varray"), "positions") - cell = get_varray(structure_xml.find("crystal").find("varray")) - posi = get_varray(structure_xml.find("varray")) - strs = None - is_converged = None - if nelm is not None: - niter = len(cc.findall(".//scstep")) - is_converged = niter < nelm - for vv in cc.findall("varray"): - if vv.attrib["name"] == "forces": - forc = get_varray(vv) - elif vv.attrib["name"] == "stress": - strs = get_varray(vv) - for ii in cc.find("energy").findall("i"): - if ii.attrib["name"] == "e_fr_energy": - ener = float(ii.text) - return posi, cell, ener, forc, strs, is_converged - - -def formulate_config(eles, types, posi, cell, ener, forc, strs_): - strs = strs_ / 1602 - natoms = len(types) - ntypes = len(eles) - ret = "" - ret += "#N %d %d\n" % (natoms, ntypes - 1) # noqa: UP031 - ret += "#C " - for ii in eles: - ret += " " + ii - ret += "\n" - ret += "##\n" - ret += f"#X {cell[0][0]:13.8f} {cell[0][1]:13.8f} {cell[0][2]:13.8f}\n" - ret += f"#Y {cell[1][0]:13.8f} {cell[1][1]:13.8f} {cell[1][2]:13.8f}\n" - ret += f"#Z {cell[2][0]:13.8f} {cell[2][1]:13.8f} {cell[2][2]:13.8f}\n" - ret += "#W 1.0\n" - ret += "#E %.10f\n" % (ener / natoms) - ret += f"#S {strs[0][0]:.9e} {strs[1][1]:.9e} {strs[2][2]:.9e} {strs[0][1]:.9e} {strs[1][2]:.9e} {strs[0][2]:.9e}\n" - ret += "#F\n" - for ii in range(natoms): - 
sp = np.matmul(cell.T, posi[ii]) - ret += "%d" % (types[ii] - 1) # noqa: UP031 - ret += f" {sp[0]:12.6f} {sp[1]:12.6f} {sp[2]:12.6f}" - ret += f" {forc[ii][0]:12.6f} {forc[ii][1]:12.6f} {forc[ii][2]:12.6f}" - ret += "\n" - return ret - - -def analyze(fname, type_idx_zero=False, begin=0, step=1, convergence_check=True): - """Deal with broken xml file.""" - all_posi = [] - all_cell = [] - all_ener = [] - all_forc = [] - all_strs = [] - cc = 0 - if convergence_check: - tree = ET.parse(fname) - root = tree.getroot() - parameters = root.find(".//parameters") - nelm = parameters.find(".//i[@name='NELM']") - # will check convergence - nelm = int(nelm.text) - else: - # not checking convergence - nelm = None - try: - for event, elem in ET.iterparse(fname): - if elem.tag == "atominfo": - eles, types = analyze_atominfo(elem) - types = np.array(types, dtype=int) - if type_idx_zero: - types = types - 1 - if elem.tag == "calculation": - posi, cell, ener, forc, strs, is_converged = analyze_calculation( - elem, nelm - ) - # record when not checking convergence or is_converged - # and the step criteria is satisfied - if ( - (nelm is None or is_converged) - and cc >= begin - and (cc - begin) % step == 0 - ): - all_posi.append(posi) - all_cell.append(cell) - all_ener.append(ener) - all_forc.append(forc) - if strs is not None: - all_strs.append(strs) - cc += 1 - except ET.ParseError: - return ( - eles, - types, - np.array(all_cell), - np.array(all_posi), - np.array(all_ener), - np.array(all_forc), - np.array(all_strs), - ) - return ( - eles, - types, - np.array(all_cell), - np.array(all_posi), - np.array(all_ener), - np.array(all_forc), - np.array(all_strs), - ) +from dpdata.formats.vasp.xml import * # noqa: F403 diff --git a/dpdata/xyz/__init__.py b/dpdata/xyz/__init__.py index e69de29bb..c43228e18 100644 --- a/dpdata/xyz/__init__.py +++ b/dpdata/xyz/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from dpdata.formats.xyz import * # noqa: F403 diff --git 
a/dpdata/xyz/quip_gap_xyz.py b/dpdata/xyz/quip_gap_xyz.py index 71e976de6..4eca6dc75 100644 --- a/dpdata/xyz/quip_gap_xyz.py +++ b/dpdata/xyz/quip_gap_xyz.py @@ -1,250 +1,3 @@ -#!/usr/bin/env python3 -# %% from __future__ import annotations -import re -from collections import OrderedDict - -import numpy as np - -from dpdata.periodic_table import Element - - -class QuipGapxyzSystems: - """deal with QuipGapxyzFile.""" - - def __init__(self, file_name): - self.file_object = open(file_name) - self.block_generator = self.get_block_generator() - - def __iter__(self): - return self - - def __next__(self): - return self.handle_single_xyz_frame(next(self.block_generator)) - - def __del__(self): - self.file_object.close() - - def get_block_generator(self): - p3 = re.compile(r"^\s*(\d+)\s*") - while True: - line = self.file_object.readline() - if not line: - break - if p3.match(line): - atom_num = int(p3.match(line).group(1)) - lines = [] - lines.append(line) - for ii in range(atom_num + 1): - lines.append(self.file_object.readline()) - if not lines[-1]: - raise RuntimeError( - f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}" - ) - yield lines - - @staticmethod - def handle_single_xyz_frame(lines): - atom_num = int(lines[0].strip("\n").strip()) - if len(lines) != atom_num + 2: - raise RuntimeError( - f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2" - ) - data_format_line = lines[1].strip("\n").strip() + " " - field_value_pattern = re.compile( - r"(?P\S+)=(?P[\'\"]?)(?P.*?)(?P=quote)\s+" - ) - prop_pattern = re.compile( - r"(?P\w+?):(?P[a-zA-Z]):(?P\d+)" - ) - - data_format_list = [ - kv_dict.groupdict() - for kv_dict in field_value_pattern.finditer(data_format_line) - ] - field_dict = {} - for item in data_format_list: - field_dict[item["key"]] = item["value"] - - Properties = field_dict["Properties"] - prop_list = [ - kv_dict.groupdict() for kv_dict in prop_pattern.finditer(Properties) - ] - - data_lines = [] - for line in lines[2:]: 
- data_lines.append(list(filter(bool, line.strip().split()))) - data_array = np.array(data_lines) - used_colomn = 0 - - type_array = None - coords_array = None - Z_array = None - force_array = None - virials = None - for kv_dict in prop_list: - if kv_dict["key"] == "species": - if kv_dict["datatype"] != "S": - raise RuntimeError( - "datatype for species must be 'S' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - type_array = data_array[ - :, used_colomn : used_colomn + field_length - ].flatten() - used_colomn += field_length - continue - elif kv_dict["key"] == "pos": - if kv_dict["datatype"] != "R": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - coords_array = data_array[:, used_colomn : used_colomn + field_length] - used_colomn += field_length - continue - elif kv_dict["key"] == "Z": - if kv_dict["datatype"] != "I": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - Z_array = data_array[ - :, used_colomn : used_colomn + field_length - ].flatten() - used_colomn += field_length - continue - elif kv_dict["key"] == "force": - if kv_dict["datatype"] != "R": - raise RuntimeError( - "datatype for pos must be 'R' instead of {}".format( - kv_dict["datatype"] - ) - ) - field_length = int(kv_dict["value"]) - force_array = data_array[:, used_colomn : used_colomn + field_length] - used_colomn += field_length - continue - else: - raise RuntimeError("unknown field {}".format(kv_dict["key"])) - - type_num_dict = OrderedDict() - atom_type_list = [] - type_map = {} - temp_atom_max_index = 0 - if type_array is None: - raise RuntimeError("type_array can't be None type, check .xyz file") - for ii in type_array: - if ii not in type_map: - type_map[ii] = temp_atom_max_index - temp_atom_max_index += 1 - temp_atom_index = type_map[ii] - 
atom_type_list.append(temp_atom_index) - type_num_dict[ii] = 1 - else: - temp_atom_index = type_map[ii] - atom_type_list.append(temp_atom_index) - type_num_dict[ii] += 1 - type_num_list = [] - for atom_type, atom_num in type_num_dict.items(): - type_num_list.append((atom_type, atom_num)) - type_num_array = np.array(type_num_list) - if field_dict.get("virial", None): - virials = np.array( - [ - np.array( - list(filter(bool, field_dict["virial"].split(" "))) - ).reshape(3, 3) - ] - ).astype(np.float64) - else: - virials = None - - info_dict = {} - info_dict["atom_names"] = list(type_num_array[:, 0]) - info_dict["atom_numbs"] = list(type_num_array[:, 1].astype(int)) - info_dict["atom_types"] = np.array(atom_type_list).astype(int) - info_dict["cells"] = np.array( - [ - np.array(list(filter(bool, field_dict["Lattice"].split(" ")))).reshape( - 3, 3 - ) - ] - ).astype(np.float64) - info_dict["coords"] = np.array([coords_array]).astype(np.float64) - info_dict["energies"] = np.array([field_dict["energy"]]).astype(np.float64) - info_dict["forces"] = np.array([force_array]).astype(np.float64) - if virials is not None: - info_dict["virials"] = virials - info_dict["orig"] = np.zeros(3) - return info_dict - - -def format_single_frame(data, frame_idx): - """Format a single frame of system data into QUIP/GAP XYZ format lines. 
- - Parameters - ---------- - data : dict - system data - frame_idx : int - frame index - - Returns - ------- - list[str] - lines for the frame - """ - # Number of atoms - natoms = len(data["atom_types"]) - - # Build header line with metadata - header_parts = [] - - # Energy - energy = data["energies"][frame_idx] - header_parts.append(f"energy={energy:.12e}") - - # Virial (if present) - if "virials" in data: - virial = data["virials"][frame_idx] - virial_str = " ".join(f"{v:.12e}" for v in virial.flatten()) - header_parts.append(f'virial="{virial_str}"') - - # Lattice - cell = data["cells"][frame_idx] - lattice_str = " ".join(f"{c:.12e}" for c in cell.flatten()) - header_parts.append(f'Lattice="{lattice_str}"') - - # Properties - header_parts.append("Properties=species:S:1:pos:R:3:Z:I:1:force:R:3") - - header_line = " ".join(header_parts) - - # Format atom lines - atom_lines = [] - coords = data["coords"][frame_idx] - forces = data["forces"][frame_idx] - atom_names = np.array(data["atom_names"]) - atom_types = data["atom_types"] - - for i in range(natoms): - atom_type_idx = atom_types[i] - species = atom_names[atom_type_idx] - x, y, z = coords[i] - fx, fy, fz = forces[i] - atomic_number = Element(species).Z - - atom_line = f"{species} {x:.11e} {y:.11e} {z:.11e} {atomic_number} {fx:.11e} {fy:.11e} {fz:.11e}" - atom_lines.append(atom_line) - - # Combine all lines for this frame - frame_lines = [str(natoms), header_line] + atom_lines - return frame_lines +from dpdata.formats.xyz.quip_gap_xyz import * # noqa: F403 diff --git a/dpdata/xyz/xyz.py b/dpdata/xyz/xyz.py index 0c36ac32b..9db695c2a 100644 --- a/dpdata/xyz/xyz.py +++ b/dpdata/xyz/xyz.py @@ -1,59 +1,3 @@ from __future__ import annotations -import numpy as np - - -def coord_to_xyz(coord: np.ndarray, types: list) -> str: - """Convert coordinates and types to xyz format. 
- - Parameters - ---------- - coord : np.ndarray - coordinates, Nx3 array - types : list - list of types - - Returns - ------- - str - xyz format string - - Examples - -------- - >>> coord_to_xyz(np.ones((1,3)), ["C"]) - 1 - - C 1.000000 1.000000 1.000000 - """ - buff = [str(len(types)), ""] - for at, cc in zip(types, coord): - buff.append("{} {:.6f} {:.6f} {:.6f}".format(at, *cc)) - return "\n".join(buff) - - -def xyz_to_coord(xyz: str) -> tuple[np.ndarray, list]: - """Convert xyz format to coordinates and types. - - Parameters - ---------- - xyz : str - xyz format string - - Returns - ------- - coords : np.ndarray - coordinates, Nx3 array - types : list - list of types - """ - symbols = [] - coords = [] - for ii, line in enumerate(xyz.split("\n")): - if ii == 0: - natoms = int(line.strip()) - elif 2 <= ii <= 1 + natoms: - # symbol x y z - symbol, x, y, z = line.split() - coords.append((float(x), float(y), float(z))) - symbols.append(symbol) - return np.array(coords), symbols +from dpdata.formats.xyz.xyz import * # noqa: F403 diff --git a/tests/context.py b/tests/context.py index 3214e28ea..85d7c33cf 100644 --- a/tests/context.py +++ b/tests/context.py @@ -5,7 +5,7 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import dpdata -import dpdata.gaussian.gjf # noqa: F401 +import dpdata.formats.gaussian.gjf # noqa: F401 import dpdata.md.msd # noqa: F401 import dpdata.md.water # noqa: F401 import dpdata.stat # noqa: F401 diff --git a/tests/test_abacus_stru_dump.py b/tests/test_abacus_stru_dump.py index cf071920d..7b4317c37 100644 --- a/tests/test_abacus_stru_dump.py +++ b/tests/test_abacus_stru_dump.py @@ -7,7 +7,7 @@ from context import dpdata from test_vasp_poscar_dump import myfilecmp -from dpdata.abacus.stru import parse_pos_oneline +from dpdata.formats.abacus.stru import parse_pos_oneline class TestStruDump(unittest.TestCase): diff --git a/tests/test_cell_to_low_triangle.py b/tests/test_cell_to_low_triangle.py index 
34d0a90ae..93a128263 100644 --- a/tests/test_cell_to_low_triangle.py +++ b/tests/test_cell_to_low_triangle.py @@ -3,12 +3,13 @@ import unittest import numpy as np -from context import dpdata + +from dpdata.formats.cp2k.cell import cell_to_low_triangle class TestCellToLowTriangle(unittest.TestCase): def test_func1(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2 ) cell_2 = np.asarray([[6, 0, 0], [0, 6, 0], [0, 0, 6]]) @@ -17,7 +18,7 @@ def test_func1(self): self.assertAlmostEqual(cell_1[ii, jj], cell_2[ii, jj], places=6) def test_func2(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 6, 6, np.pi * 1 / 3, np.pi * 1 / 3, np.pi * 1 / 3 ) cell_2 = np.asarray( @@ -28,7 +29,7 @@ def test_func2(self): self.assertAlmostEqual(cell_1[ii, jj], cell_2[ii, jj], places=6) def test_func3(self): - cell_1 = dpdata.cp2k.cell.cell_to_low_triangle( + cell_1 = cell_to_low_triangle( 6, 7, 8, np.pi * 133 / 180, np.pi * 84 / 180, np.pi * 69 / 180 ) cell_2 = np.asarray( @@ -45,21 +46,17 @@ def test_func3(self): def test_func4(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( - 0.1, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2 - ) + cell_to_low_triangle(0.1, 6, 6, np.pi * 1 / 2, np.pi * 1 / 2, np.pi * 1 / 2) self.assertTrue("A==0.1" in str(c.exception)) def test_func5(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( - 6, 6, 6, np.pi * 3 / 180, np.pi * 1 / 2, np.pi * 1 / 2 - ) + cell_to_low_triangle(6, 6, 6, np.pi * 3 / 180, np.pi * 1 / 2, np.pi * 1 / 2) self.assertTrue("alpha" in str(c.exception)) def test_func6(self): with self.assertRaises(Exception) as c: - dpdata.cp2k.cell.cell_to_low_triangle( + cell_to_low_triangle( 6, 7, 8, np.pi * 153 / 180, np.pi * 84 / 180, np.pi * 69 / 180 ) self.assertTrue("lz^2" in str(c.exception)) diff --git a/tests/test_gaussian_driver.py 
b/tests/test_gaussian_driver.py index ff1638488..3c28738c1 100644 --- a/tests/test_gaussian_driver.py +++ b/tests/test_gaussian_driver.py @@ -9,6 +9,8 @@ from comp_sys import CompSys, IsNoPBC from context import dpdata +from dpdata.formats.gaussian.gjf import detect_multiplicity + @unittest.skipIf(shutil.which("g16") is None, "g16 is not installed") @unittest.skipIf( @@ -83,9 +85,7 @@ def test_detect_multiplicity(self): self._check_multiplicity(["C", "H"], 2) def _check_multiplicity(self, symbols, multiplicity): - self.assertEqual( - dpdata.gaussian.gjf.detect_multiplicity(np.array(symbols)), multiplicity - ) + self.assertEqual(detect_multiplicity(np.array(symbols)), multiplicity) def tearDown(self): if os.path.exists("gaussian/tmp.gjf"): diff --git a/tests/test_lammps_lmp_dump.py b/tests/test_lammps_lmp_dump.py index a717c6cfc..86e4b3cdd 100644 --- a/tests/test_lammps_lmp_dump.py +++ b/tests/test_lammps_lmp_dump.py @@ -8,7 +8,7 @@ from context import dpdata from poscars.poscar_ref_oh import TestPOSCARoh -from dpdata.lammps.lmp import rotate_to_lower_triangle +from dpdata.formats.lammps.lmp import rotate_to_lower_triangle TEST_DIR = os.path.dirname(__file__) POSCAR_CONF_LMP = os.path.join(TEST_DIR, "poscars", "conf.lmp") diff --git a/tests/test_lammps_spin.py b/tests/test_lammps_spin.py index d3d58920e..bcb3442b7 100644 --- a/tests/test_lammps_spin.py +++ b/tests/test_lammps_spin.py @@ -7,7 +7,7 @@ import numpy as np from context import dpdata -from dpdata.lammps.dump import get_spin +from dpdata.formats.lammps.dump import get_spin TRAJ_NO_ID = """ITEM: TIMESTEP 0 diff --git a/tests/test_lmdb.py b/tests/test_lmdb.py index ee651edce..bc0fdeecc 100644 --- a/tests/test_lmdb.py +++ b/tests/test_lmdb.py @@ -17,7 +17,7 @@ ) from context import dpdata -from dpdata.lmdb.format import LMDBFrameError, LMDBMetadataError +from dpdata.formats.lmdb.format import LMDBFrameError, LMDBMetadataError class TestLMDBLabeledSystem(unittest.TestCase, CompLabeledSys, IsPBC): diff --git 
a/tests/test_msd.py b/tests/test_msd.py index 5d26db645..9d53ba0fc 100644 --- a/tests/test_msd.py +++ b/tests/test_msd.py @@ -5,6 +5,8 @@ import numpy as np from context import dpdata +from dpdata.md.msd import msd as calc_msd + class TestMSD(unittest.TestCase): def setUp(self): @@ -22,9 +24,9 @@ def setUp(self): def test_msd(self): # print(self.system['atom_types'] == 0) - msd = dpdata.md.msd.msd(self.system) - msd0 = dpdata.md.msd.msd(self.system, self.system["atom_types"] == 0) - msd1 = dpdata.md.msd.msd(self.system, self.system["atom_types"] == 1) + msd = calc_msd(self.system) + msd0 = calc_msd(self.system, self.system["atom_types"] == 0) + msd1 = calc_msd(self.system, self.system["atom_types"] == 1) # print(msd) ncomp = msd.shape[0] for ii in range(ncomp): diff --git a/tests/test_qe_cp_traj.py b/tests/test_qe_cp_traj.py index d6403ff67..a670bd4d0 100644 --- a/tests/test_qe_cp_traj.py +++ b/tests/test_qe_cp_traj.py @@ -5,6 +5,8 @@ import numpy as np from context import dpdata +from dpdata.formats.qe.traj import convert_celldm + bohr2ang = dpdata.unit.LengthConversion("bohr", "angstrom").value() @@ -61,7 +63,7 @@ def setUp(self): class TestConverCellDim(unittest.TestCase): def test_case_null(self): - cell = dpdata.qe.traj.convert_celldm(8, [1, 1, 1]) + cell = convert_celldm(8, [1, 1, 1]) ref = np.eye(3) for ii in range(3): for jj in range(3): diff --git a/tests/test_water_ions.py b/tests/test_water_ions.py index 40c1c143c..34ab21279 100644 --- a/tests/test_water_ions.py +++ b/tests/test_water_ions.py @@ -5,6 +5,13 @@ from context import dpdata +from dpdata.md.water import ( + compute_bonds, + compute_bonds_ase, + compute_bonds_naive, + find_ions, +) + try: import ase # noqa: F401 import ase.neighborlist # noqa: F401 @@ -20,16 +27,14 @@ def setUp(self): self.system.from_lammps_lmp( os.path.join("poscars", "conf.waterion.lmp"), type_map=["O", "H"] ) - self.bonds = dpdata.md.water.compute_bonds( + self.bonds = compute_bonds( self.system.data["cells"][0], 
self.system.data["coords"][0], self.system.data["atom_types"], ) def test_ions_count(self): - no, noh, noh2, noh3, nh = dpdata.md.water.find_ions( - self.system.data["atom_types"], self.bonds - ) + no, noh, noh2, noh3, nh = find_ions(self.system.data["atom_types"], self.bonds) self.assertEqual(len(no), 0) self.assertEqual(len(noh), 1) self.assertEqual(len(noh2), 125) @@ -46,12 +51,12 @@ def setUp(self): self.system.from_lammps_lmp( os.path.join("poscars", "conf.waterion.lmp"), type_map=["O", "H"] ) - self.bonds = dpdata.md.water.compute_bonds_naive( + self.bonds = compute_bonds_naive( self.system.data["cells"][0], self.system.data["coords"][0], self.system.data["atom_types"], ) - self.bonds_ase = dpdata.md.water.compute_bonds_ase( + self.bonds_ase = compute_bonds_ase( self.system.data["cells"][0], self.system.data["coords"][0], self.system.data["atom_types"],