diff --git a/docs/documentation/case.md b/docs/documentation/case.md index 5acd85a7a3..dbd127f23c 100644 --- a/docs/documentation/case.md +++ b/docs/documentation/case.md @@ -638,7 +638,7 @@ To restart the simulation from $k$-th time step, see @ref running "Restarting Ca | `alpha_wrt(i)` | Logical | Add the volume fraction of fluid $i$ to the database | | `gamma_wrt` | Logical | Add the specific heat ratio function to the database | | `heat_ratio_wrt` | Logical | Add the specific heat ratio to the database | -| `ib_state_wrt` | Logical | Write IB state and loads to a datafile at each time step | +| `ib_state_wrt` | Logical | Parameter to handle writing IB state on saves and outputting the state as a point mesh to SILO files. | | `pi_inf_wrt` | Logical | Add the liquid stiffness function to the database | | `pres_inf_wrt` | Logical | Add the liquid stiffness to the formatted database | | `c_wrt` | Logical | Add the sound speed to the database | @@ -706,7 +706,7 @@ If `file_per_process` is true, then pre_process, simulation, and post_process mu - `probe_wrt` activates the output of state variables at coordinates specified by `probe(i)%[x;y,z]`. -- `ib_state_wrt` activates the output of data specified by patch_ib(i)%force(:) (and torque, vel, angular_vel, angles, [x,y,z]_centroid) into a single binary datafile for all IBs at all timesteps. During post_processing, this file is converted into separate time histories for each IB. +- `ib_state_wrt` is used to trigger post-processing of the IB state to be written out as a point mesh in the SILO files. When no IBs are moving, it also triggers force and torque calculation so that those values may be written to the output state files. - `output_partial_domain` activates the output of part of the domain specified by `[x,y,z]_output%%beg` and `[x,y,z]_output%%end`. This is useful for large domains where only a portion of the domain is of interest. 
diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index cacc50f528..d66fb10614 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -25,6 +25,7 @@ module m_constants integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation integer, parameter :: num_patches_max = 10 !< Maximum number of IC patches integer, parameter :: num_ib_patches_max = 50000 !< Maximum number of immersed boundary patches (patch_ib) + integer, parameter :: num_local_ibs_max = 2000 !< Maximum number of immersed boundary patches (patch_ib) integer, parameter :: num_bc_patches_max = 10 !< Maximum number of boundary condition patches integer, parameter :: max_2d_fourier_modes = 10 !< Max Fourier mode index for 2D modal patch (geometry 13) integer, parameter :: max_sph_harm_degree = 5 !< Max degree L for 3D spherical harmonic patch (geometry 14) diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 6ee2e836a5..c4e5873ead 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -202,12 +202,10 @@ module m_derived_types type :: t_model_array ! Original CPU-side fields (unchanged) - type(t_model), allocatable :: model !< STL/OBJ geometry model - real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices - real(wp), allocatable, dimension(:,:) :: interpolated_boundary_v !< Interpolated boundary vertices - integer :: boundary_edge_count !< Number of boundary edges - integer :: total_vertices !< Total vertex count - integer :: interpolate !< Interpolation flag + type(t_model), allocatable :: model !< STL/OBJ geometry model + real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices + integer :: boundary_edge_count !< Number of boundary edges + integer :: total_vertices !< Total vertex count ! 
GPU-friendly flattened arrays integer :: ntrs !< Copy of model%ntrs @@ -272,9 +270,10 @@ module m_derived_types end type ic_patch_parameters type ib_patch_parameters - integer :: geometry !< Type of geometry for the patch + integer :: gbl_patch_id real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch + !> Centroid locations of intermediate steps in the time_stepper module real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid real(wp), dimension(1:3) :: centroid_offset !< offset of center of mass from computed cell center for odd-shaped IBs diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp index 0dab036f9b..f02474f8d0 100644 --- a/src/common/m_model.fpp +++ b/src/common/m_model.fpp @@ -983,10 +983,11 @@ contains dx_local = minval(dx); dy_local = minval(dy) if (p /= 0) dz_local = minval(dz) - allocate (stl_bounding_boxes(num_ibs,1:3,1:3)) + allocate (stl_bounding_boxes(num_gbl_ibs,1:3,1:3)) do patch_id = 1, num_ibs if (patch_ib(patch_id)%geometry == 5 .or. 
patch_ib(patch_id)%geometry == 12) then + print *, proc_rank, patch_id, num_ibs, patch_ib(patch_id)%geometry allocate (models(patch_id)%model) print *, " * Reading model: " // trim(patch_ib(patch_id)%model_filepath) diff --git a/src/post_process/m_data_output.fpp b/src/post_process/m_data_output.fpp index aece31ff8d..3a3a7bb10c 100644 --- a/src/post_process/m_data_output.fpp +++ b/src/post_process/m_data_output.fpp @@ -1396,48 +1396,42 @@ contains end do close (file_unit) - end if - - call MPI_BCAST(ib_data, nBodies*NFIELDS_PER_IB, mpi_p, 0, MPI_COMM_WORLD, ierr) - - do i = 1, nBodies - force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) - torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) - vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) - omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) - angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) - px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) - ib_diameter(i) = ib_data(i, 20)*2.0_wp - end do - if (proc_rank == 0) then - do i = 1, num_procs - write (meshnames(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:ib_bodies' - meshtypes(i) = DB_POINTMESH + do i = 1, nBodies + force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) + torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) + vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) + omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) + angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) + px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) + ib_diameter(i) = ib_data(i, 20)*2.0_wp end do + + write (meshnames(1), '(A,I0,A)') '../p0/', t_step, '.silo:ib_bodies' + meshtypes(1) = DB_POINTMESH err = 
DBSET2DSTRLEN(len(meshnames(1))) - err = DBPUTMMESH(dbroot, 'ib_bodies', 16, num_procs, meshnames, len_trim(meshnames), meshtypes, DB_F77NULL, ierr) + err = DBPUTMMESH(dbroot, 'ib_bodies', 16, 1, meshnames, len_trim(meshnames), meshtypes, DB_F77NULL, ierr) + + err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) + + call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) + call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) + call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) + call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) + call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) + call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) + call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) + call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) + call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) + call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) + call s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) + call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) + call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) + call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) + call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) + call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) end if - err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) - - call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) - call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) - call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) - call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) - call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) - call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) - call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) - 
call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) - call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) - call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) - call s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) - call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) - call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) - call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) - call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) - call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) - deallocate (ib_data, px, py, pz, force_x, force_y, force_z) deallocate (torque_x, torque_y, torque_z, vel_x, vel_y, vel_z) deallocate (omega_x, omega_y, omega_z, angle_x, angle_y, angle_z) @@ -1450,23 +1444,18 @@ contains !> Write a single IB point-variable to the Silo database slave and master files. subroutine s_write_ib_variable(varname, t_step, data, nBodies) - character(len=*), intent(in) :: varname - integer, intent(in) :: t_step - real(wp), dimension(:), intent(in) :: data - integer, intent(in) :: nBodies - character(len=4*name_len), dimension(num_procs) :: var_names - integer, dimension(num_procs) :: var_types - integer :: ierr, i - - if (proc_rank == 0) then - do i = 1, num_procs - write (var_names(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:' // trim(varname) - var_types(i) = DB_POINTVAR - end do - err = DBSET2DSTRLEN(len(var_names(1))) - err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), num_procs, var_names, len_trim(var_names), var_types, & - & DB_F77NULL, ierr) - end if + character(len=*), intent(in) :: varname + integer, intent(in) :: t_step + real(wp), dimension(:), intent(in) :: data + integer, intent(in) :: nBodies + character(len=4*name_len) :: var_name_entry + integer :: var_type_entry, ierr + + write (var_name_entry, '(A,I0,A)') '../p0/', t_step, '.silo:' // trim(varname) + var_type_entry = DB_POINTVAR + err = 
DBSET2DSTRLEN(len(var_name_entry)) + err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), 1, var_name_entry, len_trim(var_name_entry), var_type_entry, & + & DB_F77NULL, ierr) err = DBPUTPV1(dbfile, trim(varname), len_trim(varname), 'ib_bodies', 9, data, nBodies, DB_DOUBLE, DB_F77NULL, ierr) diff --git a/src/simulation/m_collisions.fpp b/src/simulation/m_collisions.fpp index fe860cb055..c871496fd9 100644 --- a/src/simulation/m_collisions.fpp +++ b/src/simulation/m_collisions.fpp @@ -19,7 +19,8 @@ module m_collisions implicit none - private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module + private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module, & + & f_local_rank_owns_collision ! overlap distances for computing collisions integer, allocatable, dimension(:,:) :: collision_lookup real(wp), allocatable, dimension(:,:) :: wall_overlap_distances @@ -405,35 +406,35 @@ contains logical :: owns_collision real(wp), dimension(3) :: projected_location - #:if defined('MFC_MPI') - if (num_procs == 1) then - owns_collision = .true. - else - projected_location(:) = collision_location(:) - - ! catch the edge case where th collision lies just outside the computational domain - #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] - if (num_dims >= ${ID}$) then - if (ib_bc_${X}$%beg /= BC_PERIODIC) then - ! if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it - if (collision_location(${ID}$) < ${X}$_domain%beg) then - projected_location(${ID}$) = ${X}$_domain%beg - else if (${X}$_domain%end < collision_location(${ID}$)) then - projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp - end if +#ifdef MFC_MPI + if (num_procs == 1) then + owns_collision = .true. + else + projected_location(:) = collision_location(:) + + ! 
catch the edge case where th collision lies just outside the computational domain + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + if (ib_bc_${X}$%beg /= BC_PERIODIC) then + ! if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it + if (collision_location(${ID}$) < ${X}$_domain%beg) then + projected_location(${ID}$) = ${X}$_domain%beg + else if (${X}$_domain%end < collision_location(${ID}$)) then + projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp end if end if - #:endfor + end if + #:endfor - ! the object that contains the collision location owns the collisions - owns_collision = x_cb(-1) <= projected_location(1) .and. projected_location(1) < x_cb(m) - owns_collision = owns_collision .and. y_cb(-1) <= projected_location(2) .and. projected_location(2) < y_cb(n) - if (num_dims == 3) owns_collision = owns_collision .and. z_cb(-1) <= projected_location(3) & - & .and. projected_location(3) < z_cb(p) - end if - #:else - owns_collision = .true. - #:endif + ! the object that contains the collision location owns the collisions + owns_collision = x_cb(-1) <= projected_location(1) .and. projected_location(1) < x_cb(m) + owns_collision = owns_collision .and. y_cb(-1) <= projected_location(2) .and. projected_location(2) < y_cb(n) + if (num_dims == 3) owns_collision = owns_collision .and. z_cb(-1) <= projected_location(3) .and. projected_location(3) & + & < z_cb(p) + end if +#else + owns_collision = .true. 
+#endif end function f_local_rank_owns_collision diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index a5fdb3ef81..effc020325 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -909,7 +909,7 @@ contains integer :: ifile, ierr integer, dimension(MPI_STATUS_SIZE) :: status logical :: file_exist - integer :: i + integer :: i, ib_idx integer, parameter :: NFIELDS_PER_IB = 20 real(wp) :: ib_buf(NFIELDS_PER_IB) @@ -923,19 +923,6 @@ contains end if call s_mpi_barrier() - ! Divide num_ibs across num_procs - nibs_per_rank = num_ibs/num_procs - remainder = mod(num_ibs, num_procs) - - ! Ranks < remainder get one extra IB - if (proc_rank < remainder) then - ib_start = proc_rank*(nibs_per_rank + 1) + 1 - ib_end = ib_start + nibs_per_rank ! nibs_per_rank + 1 total - else - ib_start = remainder*(nibs_per_rank + 1) + (proc_rank - remainder)*nibs_per_rank + 1 - ib_end = ib_start + nibs_per_rank - 1 - end if - write (file_loc, '(A,I0,A)') '/restart_data/ib_state_', t_step, '.dat' file_loc = trim(case_dir) // trim(file_loc) @@ -947,20 +934,21 @@ contains call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) - do i = ib_start, ib_end + do i = 1, num_local_ibs + ib_idx = local_ib_patch_ids(i) ib_buf(1) = mytime - ib_buf(2:4) = patch_ib(i)%force(1:3) - ib_buf(5:7) = patch_ib(i)%torque(1:3) - ib_buf(8:10) = patch_ib(i)%vel(1:3) - ib_buf(11:13) = patch_ib(i)%angular_vel(1:3) - ib_buf(14:16) = patch_ib(i)%angles(1:3) - ib_buf(17) = patch_ib(i)%x_centroid - ib_buf(18) = patch_ib(i)%y_centroid - ib_buf(19) = patch_ib(i)%z_centroid - ib_buf(20) = patch_ib(i)%radius + ib_buf(2:4) = patch_ib(ib_idx)%force(1:3) + ib_buf(5:7) = patch_ib(ib_idx)%torque(1:3) + ib_buf(8:10) = patch_ib(ib_idx)%vel(1:3) + ib_buf(11:13) = patch_ib(ib_idx)%angular_vel(1:3) + ib_buf(14:16) = patch_ib(ib_idx)%angles(1:3) + ib_buf(17) = patch_ib(ib_idx)%x_centroid + ib_buf(18) = 
patch_ib(ib_idx)%y_centroid + ib_buf(19) = patch_ib(ib_idx)%z_centroid + ib_buf(20) = patch_ib(ib_idx)%radius ! Global IB index (i) determines position in file - disp = int(i - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK + disp = int(patch_ib(ib_idx)%gbl_patch_id - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK call MPI_FILE_WRITE_AT(ifile, disp, ib_buf, NFIELDS_PER_IB, mpi_p, status, ierr) end do diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 7fb3353e2c..446a3c024c 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -229,7 +229,8 @@ module m_global_parameters $:GPU_DECLARE(create='[ib_bc_x, ib_bc_y, ib_bc_z]') #endif type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') + type(bounds_info) :: neighbor_domain_x, neighbor_domain_y, neighbor_domain_z + $:GPU_DECLARE(create='[x_domain, y_domain, z_domain, neighbor_domain_x, neighbor_domain_y, neighbor_domain_z]') real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b logical :: parallel_io !< Format of the data files @@ -336,18 +337,22 @@ module m_global_parameters !> @name Immersed Boundaries !> @{ - logical :: ib - integer :: num_ibs - integer :: collision_model - real(wp) :: coefficient_of_restitution - real(wp) :: collision_time - real(wp) :: ib_coefficient_of_friction - logical :: ib_state_wrt - type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib !< Immersed boundary patch parameters - type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l - integer :: Np - - $:GPU_DECLARE(create='[ib, num_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l]') + logical :: ib + integer :: num_ibs !< number of IBs that the current processor is aware of + integer :: num_gbl_ibs !< number of IBs in the overall simulation + integer :: num_local_ibs !< number of IBs that lie inside the processor domain + integer :: 
collision_model + real(wp) :: coefficient_of_restitution + real(wp) :: collision_time + real(wp) :: ib_coefficient_of_friction + logical :: ib_state_wrt + type(ib_patch_parameters), allocatable, dimension(:) :: patch_ib !< Immersed boundary patch parameters + integer, dimension(num_local_ibs_max) :: local_ib_patch_ids !< lookup table of IBs in the local compute domain + integer, dimension(-1:1,-1:1,-1:1) :: ib_neighbor_ranks !< MPI ranks of all 26 neighbor domains + type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l + integer :: Np + + $:GPU_DECLARE(create='[ib, num_ibs, num_gbl_ibs, num_local_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l, local_ib_patch_ids]') $:GPU_DECLARE(create='[ib_coefficient_of_friction]') !> @} @@ -787,7 +792,9 @@ contains relativity = .false. #:endif + allocate (patch_ib(num_ib_patches_max)) do i = 1, num_ib_patches_max + patch_ib(i)%gbl_patch_id = i patch_ib(i)%geometry = dflt_int patch_ib(i)%x_centroid = 0._wp patch_ib(i)%y_centroid = 0._wp diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index cf3ad9ecfa..1770e9c271 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -109,7 +109,7 @@ contains radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -221,7 +221,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -376,7 +376,7 @@ contains end if ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -466,7 +466,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -510,16 +510,12 @@ contains real(wp) :: radius real(wp), dimension(1:3) :: center - ! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. (2013) - - ! Transferring spherical patch's radius, centroid, smoothing patch identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) radius = patch_ib(patch_id)%radius - ! completely skip particles no in the domain + ! completely skip particles not in the domain if (center(1) - radius > x_cc(m + gp_layers + 1) .or. center(1) + radius < x_cc(-gp_layers - 1) .or. center(2) & & - radius > y_cc(n + gp_layers + 1) .or. center(2) + radius < y_cc(-gp_layers - 1) .or. center(3) - radius > z_cc(p & & + gp_layers + 1) .or. center(3) + radius < z_cc(-gp_layers - 1)) then @@ -527,7 +523,7 @@ contains end if ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -542,18 +538,12 @@ contains ! Checking whether the sphere covers a particular cell in the domain and verifying whether the current patch has permission ! to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to this cell. - $:GPU_PARALLEL_LOOP(private='[i, j, k, cart_y, cart_z]', copyin='[encoded_patch_id, center, radius]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, k]', copyin='[encoded_patch_id, center, radius]', collapse=3) do k = kl, kr do j = jl, jr do i = il, ir - if (grid_geometry == 3) then - call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k)) - else - cart_y = y_cc(j) - cart_z = z_cc(k) - end if ! Updating the patch identities bookkeeping variable - if (((x_cc(i) - center(1))**2 + (cart_y - center(2))**2 + (cart_z - center(3))**2 <= radius**2)) then + if (((x_cc(i) - center(1))**2 + (y_cc(j) - center(2))**2 + (z_cc(k) - center(3))**2 <= radius**2)) then ib_markers%sf(i, j, k) = encoded_patch_id end if end do @@ -586,7 +576,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -656,7 +646,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -724,7 +714,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -781,7 +771,7 @@ contains threshold = patch_ib(patch_id)%model_threshold ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -858,7 +848,7 @@ contains rotation(:,:) = patch_ib(patch_id)%rotation_matrix(:,:) ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -1039,7 +1029,7 @@ contains temp_y_per = y_periodicity; if (y_periodicity == -1) temp_y_per = 2 temp_z_per = z_periodicity; if (z_periodicity == -1) temp_z_per = 2 - offset = (num_ibs + 1)*temp_x_per + 3*(num_ibs + 1)*temp_y_per + 9*(num_ibs + 1)*temp_z_per + offset = (num_gbl_ibs + 1)*temp_x_per + 3*(num_gbl_ibs + 1)*temp_y_per + 9*(num_gbl_ibs + 1)*temp_z_per encoded_patch_id = patch_id + offset end subroutine s_encode_patch_periodicity @@ -1054,7 +1044,7 @@ contains integer, intent(out), optional :: x_periodicity, y_periodicity, z_periodicity integer :: offset, remainder, xp, yp, zp, base - base = num_ibs + 1 + base = num_gbl_ibs + 1 patch_id = mod(encoded_patch_id - 1, base) + 1 offset = (encoded_patch_id - patch_id)/base diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 5f865cffeb..7c393155a6 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -71,11 +71,9 @@ contains call nvtxStartRange("SETUP-IBM-MODULE") ! do all set up for moving immersed boundaries - moving_immersed_boundary_flag = .false. do i = 1, num_ibs if (patch_ib(i)%moving_ibm /= 0) then call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) - moving_immersed_boundary_flag = .true. end if call s_update_ib_rotation_matrix(i) end do @@ -123,6 +121,8 @@ contains call nvtxEndRange + ! print *, proc_rank, num_local_ibs, num_ibs, num_gbl_ibs + end subroutine s_ibm_setup !> Update the conservative variables at the ghost points @@ -1010,9 +1010,8 @@ contains call s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques) - ! 
reduce the forces across all MPI ranks - call s_mpi_allreduce_vectors_sum(forces, forces, num_ibs, 3) - call s_mpi_allreduce_vectors_sum(torques, torques, num_ibs, 3) + ! reduce the forces across local neighborhood ranks + call s_communicate_ib_forces(forces, torques) ! consider body forces after reducing to avoid double counting do i = 1, num_ibs @@ -1225,4 +1224,392 @@ contains end subroutine s_wrap_periodic_ibs + !> @brief reasseses ownership of IBs and passes ownership of IBs to neighbor processors + !> Reduces forces and torques across the local neighborhood without a global allreduce. Accumulation phase: 2 passes per + !! dimension receiving from the low-index (-X) neighbor. Pass 1: add received values; save what was received as recv_snap. Pass + !! 2: send current (post-pass-1) values; add received; subtract recv_snap to remove double-counting of the direct contribution + !! already added in pass 1. Back-propagation phase: 2 passes per dimension receiving from the high-index (+X) neighbor, each + !! overwriting local forces with the neighbor's accumulated total. + subroutine s_communicate_ib_forces(forces, torques) + + real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + +#ifdef MFC_MPI + integer :: i, j, pack_pos, unpack_pos, buf_size, ierr + integer :: send_neighbor, recv_neighbor, recv_count, pid + real(wp), dimension(3) :: fval, tval + real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:) + character(len=1), allocatable :: send_buf(:), recv_buf(:) + + if (num_procs == 1) return + + buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib) + allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3)) + + ! Accumulation phase: propagate contributions toward the high-index corner. 
+ #:for X, ID, TAG1, TAG2 in [('x', 1, 300, 302), ('y', 2, 304, 306), ('z', 3, 308, 310)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + + ! Pass 1: send current forces to +${X}$ neighbor; receive from -${X}$ neighbor and add. Save what was received as + ! recv_snap for double-count removal in pass 2. + recv_forces_snap = 0._wp + recv_torques_snap = 0._wp + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG1}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG1}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + recv_forces_snap(j,:) = fval(:) + recv_torques_snap(j,:) = tval(:) + forces(j,:) = forces(j,:) + fval(:) + torques(j,:) = torques(j,:) + tval(:) + exit + end if + end do + end do + end if + + ! Pass 2: send post-pass-1 forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then + ! 
subtract recv_snap to remove the pass-1 contribution that was already counted, leaving only the 2-hop delta. + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG2}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG2}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = forces(j,:) + fval(:) - recv_forces_snap(j,:) + torques(j,:) = torques(j,:) + tval(:) - recv_torques_snap(j,:) + exit + end if + end do + end do + end if + end if + #:endfor + + ! Back-propagation phase: for each dimension, 2 passes receiving from the high-index neighbor. Each pass overwrites local + ! forces with the neighbor's accumulated total. Two passes ensure the total reaches 2 hops back, covering the full + ! neighborhood. 
+ #:for X, ID, TAG1, TAG2 in [('x', 1, 312, 314), ('y', 2, 316, 318), ('z', 3, 320, 322)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + + #:for TAG in [TAG1, TAG2] + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = fval(:) + torques(j,:) = tval(:) + exit + end if + end do + end do + end if + #:endfor + end if + #:endfor + + deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap) +#endif + + end subroutine s_communicate_ib_forces + + !> Alternative force reduction using two non-blocking all-to-neighbor broadcasts. Phase 1: every rank sends its full force array + !! to all 26 neighborhood neighbors simultaneously. After MPI_WAITALL, each rank sums contributions from neighbors for its owned + !! particles. 
Phase 2: each rank sends its finalized owned-particle forces (by gbl_patch_id) back to all neighbors + !! simultaneously. After MPI_WAITALL, each rank overwrites ghost-particle forces with the authoritative values from the owning + !! rank. Not currently called - available for benchmarking against s_communicate_ib_forces. + ! subroutine s_communicate_ib_forces_scatter(forces, torques) + + ! real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + + ! #ifdef MFC_MPI integer, parameter :: max_nbrs = 26 integer :: i, j, k, nbr_idx, nreqs, pack_pos, unpack_pos integer :: + ! buf_size, entry_bytes, ierr, recv_count, pid integer :: send_neighbor, recv_neighbor, dx, dy, dz, tag integer, dimension(3) :: + ! nbr_coords logical :: is_owned real(wp), dimension(3) :: fval, tval real(wp), dimension(num_ibs, 3) :: forces_total, + ! torques_total integer, dimension(max_nbrs) :: recv_neighbor_list integer, dimension(2*max_nbrs) :: requests character(len=1), + ! allocatable :: send_buf(:), recv_bufs(:,:) character(len=1), allocatable :: owned_send_buf(:), owned_recv_bufs(:,:) integer :: + ! owned_buf_size + + ! if (num_procs == 1) return + + ! ! Buffer sized to hold count + (gbl_patch_id, forces, torques) per particle entry_bytes = storage_size(0)/8 + + ! 6*storage_size(0._wp)/8 buf_size = storage_size(0)/8 + entry_bytes*num_ibs owned_buf_size = storage_size(0)/8 + + ! entry_bytes*num_local_ibs_max allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs), owned_send_buf(owned_buf_size), & & + ! owned_recv_bufs(owned_buf_size, max_nbrs)) + + ! ! Phase 1: pack full local force array and broadcast to all neighborhood neighbors. pack_pos = 0 call MPI_PACK(num_ibs, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) do i = 1, num_ibs call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) fval(:) = forces(i,:); tval(:) = torques(i,:) call + ! 
MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, + ! pack_pos, MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & & + ! requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! == 0) cycle tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr) + ! end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Local reduction: for each owned particle, sum contributions from all neighbors. forces_total = forces torques_total = + ! torques do nbr_idx = 1, merge(26, 8, num_dims == 3) if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 + ! call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) do i = 1, + ! recv_count call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! 
MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only accumulate for particles + ! this rank owns do k = 1, num_local_ibs j = local_ib_patch_ids(k) if (patch_ib(j)%gbl_patch_id == pid) then forces_total(j,:) = + ! forces_total(j,:) + fval(:) torques_total(j,:) = torques_total(j,:) + tval(:) exit end if end do end do end do + + ! ! Write totals back for owned particles only do k = 1, num_local_ibs j = local_ib_patch_ids(k) forces(j,:) = forces_total(j,:) + ! torques(j,:) = torques_total(j,:) end do + + ! ! Phase 2: pack finalized owned-particle forces and back-broadcast to all neighbors. pack_pos = 0 call MPI_PACK(num_local_ibs, + ! 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) do k = 1, num_local_ibs j = + ! local_ib_patch_ids(k) call MPI_PACK(patch_ib(j)%gbl_patch_id, 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, + ! MPI_COMM_WORLD, ierr) fval(:) = forces(j,:); tval(:) = torques(j,:) call MPI_PACK(fval, 3, mpi_p, owned_send_buf, + ! owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, owned_send_buf, owned_buf_size, pack_pos, + ! MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(owned_recv_bufs(:,nbr_idx), owned_buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & + ! & requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! 
== 0) cycle tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(owned_send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), + ! ierr) end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Overwrite ghost-particle forces with authoritative values from the owning rank. do nbr_idx = 1, merge(26, 8, num_dims == 3) + ! if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 call MPI_UNPACK(owned_recv_bufs(:,nbr_idx), + ! owned_buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, & & ierr) do i = 1, recv_count call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only overwrite + ! ghost particles (not owned ones - this rank's total is authoritative) do j = 1, num_ibs if (patch_ib(j)%gbl_patch_id == pid) + ! then is_owned = .false. do k = 1, num_local_ibs if (local_ib_patch_ids(k) == j) then is_owned = .true. exit end if end do if + ! (.not. is_owned) then forces(j,:) = fval(:) torques(j,:) = tval(:) end if exit end if end do end do end do + + ! deallocate (send_buf, recv_bufs, owned_send_buf, owned_recv_bufs) #endif + + ! 
end subroutine s_communicate_ib_forces_scatter
+
+    !> Re-establishes IB patch ownership after moving IBs have been advanced. Drops patches whose centroid left this
+    !! rank's neighborhood, compacts patch_ib in place, recomputes which surviving patches are locally owned, and
+    !! broadcasts newly-owned patches to all neighborhood neighbors (26 in 3D, 8 in 2D) so every rank's awareness list
+    !! stays consistent. No-op for single-rank runs.
+    subroutine s_handoff_ib_ownership()
+
+        integer :: i, j, k, output_idx, local_output_idx
+        integer :: old_num_local_ibs
+        integer :: new_count, recv_count
+        integer :: pack_pos, unpack_pos, buf_size, patch_bytes
+        integer :: send_neighbor, recv_neighbor, ierr
+        integer :: dx, dy, dz, tag, nbr_idx, nreqs
+        real(wp), dimension(3) :: centroid
+        logical :: is_new, already_known
+        type(ib_patch_parameters) :: tmp_patch
+        integer, dimension(num_local_ibs_max) :: local_ib_idx_old
+        ! 26 neighbors max in 3D (8 in 2D); each gets its own recv buffer
+        integer, parameter :: max_nbrs = 26
+        character(len=1), allocatable :: send_buf(:), recv_bufs(:,:)
+        integer, dimension(2*max_nbrs) :: requests
+        integer, dimension(max_nbrs) :: recv_neighbor_list
+
+#ifdef MFC_MPI
+        if (num_procs > 1) then
+            ! Save each locally-owned patch's global id so newly-acquired patches can be identified after the rebuild.
+            local_ib_idx_old = 0
+            old_num_local_ibs = num_local_ibs
+            do i = 1, num_local_ibs
+                local_ib_idx_old(i) = patch_ib(local_ib_patch_ids(i))%gbl_patch_id
+            end do
+
+            ! Delete any patches that no longer need to be tracked and coalesce the array in place.
+            output_idx = 0
+            local_output_idx = 0
+            do i = 1, num_ibs
+                ! Drop the patch if its centroid left the neighborhood (this rank's subdomain plus one neighbor each way).
+                #:for X, ID in [('x', 1), ('y', 2), ('z', 3)]
+                    if (patch_ib(i)%${X}$_centroid < neighbor_domain_${X}$%beg .or. neighbor_domain_${X}$%end < patch_ib(i) &
+                        & %${X}$_centroid) then
+                        cycle
+                    end if
+                #:endfor
+                output_idx = output_idx + 1
+                if (i /= output_idx) patch_ib(output_idx) = patch_ib(i)
+
+                ! Record the patch as locally owned when its centroid falls in this rank's subdomain.
+                ! NOTE(review): no overflow guard on local_ib_patch_ids - assumed sized >= num_local_ibs_max; confirm.
+                centroid = [patch_ib(i)%x_centroid, patch_ib(i)%y_centroid, 0._wp]
+                if (num_dims == 3) centroid(3) = patch_ib(i)%z_centroid
+                if (f_local_rank_owns_collision(centroid)) then
+                    local_output_idx = local_output_idx + 1
+                    local_ib_patch_ids(local_output_idx) = output_idx
+                end if
+            end do
+            num_ibs = output_idx
+            num_local_ibs = local_output_idx
+
+            ! Broadcast newly-owned patches to all neighborhood neighbors (including corners/edges).
+            patch_bytes = storage_size(tmp_patch)/8
+            buf_size = storage_size(0)/8 + patch_bytes*num_local_ibs_max
+            allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs))
+
+            ! Write a placeholder count at position 0; it is overwritten once the new patches have been counted.
+            pack_pos = 0
+            call MPI_PACK(0, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+
+            ! Pack every patch that was not locally owned before the rebuild (i.e., newly acquired by this rank).
+            new_count = 0
+            do i = 1, num_local_ibs
+                k = local_ib_patch_ids(i)
+                is_new = .true.
+                do j = 1, old_num_local_ibs
+                    if (patch_ib(k)%gbl_patch_id == local_ib_idx_old(j)) then
+                        is_new = .false.
+                        exit
+                    end if
+                end do
+                if (is_new) then
+                    print *, proc_rank, " New Owner ", patch_ib(k)%gbl_patch_id ! NOTE(review): debug print; remove or guard for production
+                    call MPI_PACK(patch_ib(k), patch_bytes, MPI_BYTE, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+                    new_count = new_count + 1
+                end if
+            end do
+
+            ! Overwrite the placeholder with the real count, then restore pack_pos to the end of the packed data.
+            ! NOTE(review): assumes MPI packs an integer into exactly storage_size(0)/8 bytes; portable code would use
+            ! MPI_PACK_SIZE to compute packed extents - TODO confirm.
+            pack_pos = 0
+            call MPI_PACK(new_count, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+            pack_pos = storage_size(0)/8 + new_count*patch_bytes
+
+            ! Post all receives first, then sends, using the pre-built ib_neighbor_ranks lookup.
+            ! Tags: 200 + (dx+1)*9 + (dy+1)*3 + (dz+1)
+            nreqs = 0
+            nbr_idx = 0
+            do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3)
+                do dy = -1, 1
+                    do dx = -1, 1
+                        if (dx == 0 .and. dy == 0 .and. dz == 0) cycle
+                        nbr_idx = nbr_idx + 1
+                        tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1)
+                        recv_neighbor = ib_neighbor_ranks(-dx, -dy, -dz)
+                        recv_neighbor_list(nbr_idx) = MPI_PROC_NULL
+                        if (recv_neighbor < 0) cycle
+                        recv_neighbor_list(nbr_idx) = recv_neighbor
+                        nreqs = nreqs + 1
+                        call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, &
+                            & requests(nreqs), ierr)
+                    end do
+                end do
+            end do
+
+            do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3)
+                do dy = -1, 1
+                    do dx = -1, 1
+                        if (dx == 0 .and. dy == 0 .and. dz == 0) cycle
+                        tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1)
+                        send_neighbor = ib_neighbor_ranks(dx, dy, dz)
+                        if (send_neighbor < 0) cycle
+                        nreqs = nreqs + 1
+                        call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr)
+                    end do
+                end do
+            end do
+
+            call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr)
+
+            ! Unpack all received buffers, appending only patches this rank does not already track.
+            do nbr_idx = 1, merge(26, 8, num_dims == 3)
+                if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle
+                unpack_pos = 0
+                call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
+                do i = 1, recv_count
+                    call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tmp_patch, patch_bytes, MPI_BYTE, MPI_COMM_WORLD, &
+                        & ierr)
+                    already_known = .false.
+                    do j = 1, num_ibs
+                        if (patch_ib(j)%gbl_patch_id == tmp_patch%gbl_patch_id) then
+                            already_known = .true.
+                            exit
+                        end if
+                    end do
+                    if (.not. already_known) then
+                        num_ibs = num_ibs + 1
+                        @:ASSERT(num_ibs <= size(patch_ib), 'patch_ib overflow in neighborhood handoff')
+                        patch_ib(num_ibs) = tmp_patch
+                    end if
+                end do
+            end do
+
+            deallocate (send_buf, recv_bufs)
+        end if
+#endif
+
+    end subroutine s_handoff_ib_ownership
+
 end module m_ibm
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index d544324c8a..967afb3032 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -39,6 +39,7 @@ module m_start_up
 
     use m_nvtx
     use m_ibm
+    use m_collisions
     use m_compile_specific
     use m_checker_common
     use m_checker
@@ -915,6 +916,7 @@ contains
         if (model_eqns == 3) call s_initialize_internal_energy_equations(q_cons_ts(1)%vf)
         if (ib) then
             if (t_step_start > 0) call s_read_ib_restart_data(t_step_start)
+            call s_reduce_ib_patch_array()
             call s_ibm_setup()
             if (t_step_start == 0) then
                 call s_write_ib_data_file(0)
@@ -1010,6 +1012,8 @@ contains
 #else
             "on CPUs"
 #endif
+        else
+            allocate (patch_ib(num_ib_patches_max))
         end if
 
         call
s_mpi_bcast_user_inputs()
@@ -1192,4 +1196,208 @@ contains
 
     end subroutine s_read_ib_restart_data
 
+    !> @brief Takes the patch_ib struct array that contains all global IB patches and reduces it to only the patches
+    !!        that lie in (or just outside) the local computational domain's neighborhood.
+    subroutine s_reduce_ib_patch_array()
+
+        type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib_gbl
+        real(wp) :: position
+        integer :: i
+        integer :: num_aware_ibs
+        logical :: is_in_neighborhood, is_local
+
+        ! Flag whether any IB moves so time stepping knows to propagate IBs and hand off ownership.
+        moving_immersed_boundary_flag = .false.
+        do i = 1, num_ibs
+            if (patch_ib(i)%moving_ibm /= 0) then
+                moving_immersed_boundary_flag = .true.
+                exit
+            end if
+        end do
+
+        patch_ib_gbl(:) = patch_ib(:)
+        call get_neighbor_bounds() ! make sure the bounds of the neighbors are correctly set up
+        call s_compute_ib_neighbor_ranks() ! build lookup of all 26 neighbor MPI ranks
+
+        ! Shrink patch_ib to the "awareness" size: the most patches this rank and its direct neighbors could own.
+        deallocate (patch_ib)
+        if (num_dims == 3) then
+            num_aware_ibs = min(num_local_ibs_max*27, num_ib_patches_max)
+        else
+            num_aware_ibs = min(num_local_ibs_max*9, num_ib_patches_max)
+        end if
+        allocate (patch_ib(num_aware_ibs))
+
+        num_gbl_ibs = num_ibs
+
+#ifdef MFC_MPI
+        ! fallback for 1-rank case
+        ! NOTE(review): this branch keeps the original gbl_patch_id values and does not set num_local_ibs - confirm
+        ! serial runs never reach the ownership/force-communication paths that rely on them.
+        if (num_procs == 1) then
+            patch_ib(:) = patch_ib_gbl(1:num_aware_ibs)
+        else
+            ! Determine the set of patches owned by the local rank.
+            num_local_ibs = 0
+            num_ibs = 0
+            do i = 1, num_gbl_ibs
+                ! catch the edge case where the collision lies just outside the computational domain
+                is_in_neighborhood = .true.
+                is_local = .true.
+
+                #:for X, ID, DIM in [('x', 1, 'm'), ('y', 2, 'n'), ('z', 3, 'p')]
+                    if (num_dims >= ${ID}$) then
+                        position = patch_ib_gbl(i)%${X}$_centroid
+                        if (neighbor_domain_${X}$%beg > position .or. position > neighbor_domain_${X}$%end) then
+                            is_in_neighborhood = .false.
+                            is_local = .false.
+                        else if (${X}$_cb(-1) > position .or. position > ${X}$_cb(${DIM}$)) then
+                            is_local = .false.
+                        end if
+                    end if
+                #:endfor
+
+                if (is_in_neighborhood) then
+                    num_ibs = num_ibs + 1
+                    patch_ib(num_ibs) = patch_ib_gbl(i)
+                    patch_ib(num_ibs)%gbl_patch_id = i
+                    if (is_local) then
+                        num_local_ibs = num_local_ibs + 1
+                        local_ib_patch_ids(num_local_ibs) = num_ibs
+                    end if
+                end if
+            end do
+        end if
+#else
+        ! reduce the size of the array for local simulation in no-MPI case
+        patch_ib(:) = patch_ib_gbl(1:num_aware_ibs)
+#endif
+
+    end subroutine s_reduce_ib_patch_array
+
+    !> Build ib_neighbor_ranks(-1:1,-1:1,-1:1): MPI ranks of all 26 neighbor domains. Uses two rounds of MPI_SENDRECV
+    !! cascades - face neighbors are known from bc_*, edge neighbors are obtained in round 1, and (3D) corner neighbors
+    !! in round 2.
+    subroutine s_compute_ib_neighbor_ranks()
+
+#ifdef MFC_MPI
+        integer :: ierr
+        integer, dimension(4) :: buf4, rbuf4
+        integer, dimension(2) :: buf2, rbuf2
+
+        ib_neighbor_ranks = MPI_PROC_NULL
+        ib_neighbor_ranks(0, 0, 0) = proc_rank
+
+        ! Face neighbors - already known from domain decomposition
+        ib_neighbor_ranks(-1, 0, 0) = bc_x%beg
+        ib_neighbor_ranks(+1, 0, 0) = bc_x%end
+        if (num_dims >= 2) then
+            ib_neighbor_ranks(0, -1, 0) = bc_y%beg
+            ib_neighbor_ranks(0, +1, 0) = bc_y%end
+        end if
+        if (num_dims == 3) then
+            ib_neighbor_ranks(0, 0, -1) = bc_z%beg
+            ib_neighbor_ranks(0, 0, +1) = bc_z%end
+        end if
+
+        if (num_dims >= 2) then
+            ! Round 1a: exchange y/z face ranks with +/-x face neighbors -> xy and xz edge ranks. The reply is received
+            ! into the separate rbuf4: the MPI standard forbids aliasing the send and receive buffers of MPI_SENDRECV.
+            buf4 = [bc_y%beg, bc_y%end, bc_z%beg, bc_z%end]
+
+            ! Send to -x, receive from +x -> edges (+1,+/-1,0) and (+1,0,+/-1)
+            call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 310, rbuf4, 4, MPI_INTEGER, &
+                & merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 310, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_x%end >= 0) then
+                ib_neighbor_ranks(+1, -1, 0) = rbuf4(1)
+                ib_neighbor_ranks(+1, +1, 0) = rbuf4(2)
+                ib_neighbor_ranks(+1, 0, -1) = rbuf4(3)
+                ib_neighbor_ranks(+1, 0, +1) = rbuf4(4)
+            end if
+
+            ! Send to +x, receive from -x -> edges (-1,+/-1,0) and (-1,0,+/-1); buf4 is untouched by the first exchange
+            call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 311, rbuf4, 4, MPI_INTEGER, &
+                & merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 311, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_x%beg >= 0) then
+                ib_neighbor_ranks(-1, -1, 0) = rbuf4(1)
+                ib_neighbor_ranks(-1, +1, 0) = rbuf4(2)
+                ib_neighbor_ranks(-1, 0, -1) = rbuf4(3)
+                ib_neighbor_ranks(-1, 0, +1) = rbuf4(4)
+            end if
+        end if
+
+        if (num_dims == 3) then
+            ! Round 1b: exchange z face ranks with +/-y face neighbors -> yz edge ranks
+            buf2 = [bc_z%beg, bc_z%end]
+
+            call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 312, rbuf2, 2, MPI_INTEGER, &
+                & merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 312, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_y%end >= 0) then
+                ib_neighbor_ranks(0, +1, -1) = rbuf2(1)
+                ib_neighbor_ranks(0, +1, +1) = rbuf2(2)
+            end if
+
+            call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 313, rbuf2, 2, MPI_INTEGER, &
+                & merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 313, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_y%beg >= 0) then
+                ib_neighbor_ranks(0, -1, -1) = rbuf2(1)
+                ib_neighbor_ranks(0, -1, +1) = rbuf2(2)
+            end if
+
+            ! Round 2: exchange z face ranks with xy-diagonal edge neighbors -> corner ranks. Each of the 4 xy diagonals
+            ! gives 2 corners (the +/-z variants). Pattern: send buf2 to the mirror diagonal, receive from this diagonal
+            ! -> that edge neighbor's z face ranks.
+            #:for DX, DY, MDX, MDY, TAG in [(1,1,-1,-1,320), (1,-1,-1,1,321), (-1,1,1,-1,322), (-1,-1,1,1,323)]
+                call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(ib_neighbor_ranks(${MDX}$, ${MDY}$, 0), MPI_PROC_NULL, &
+                    & ib_neighbor_ranks(${MDX}$, ${MDY}$, 0) >= 0), ${TAG}$, rbuf2, 2, MPI_INTEGER, &
+                    & merge(ib_neighbor_ranks(${DX}$, ${DY}$, 0), MPI_PROC_NULL, ib_neighbor_ranks(${DX}$, ${DY}$, &
+                    & 0) >= 0), ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                if (ib_neighbor_ranks(${DX}$, ${DY}$, 0) >= 0) then
+                    ib_neighbor_ranks(${DX}$, ${DY}$, -1) = rbuf2(1)
+                    ib_neighbor_ranks(${DX}$, ${DY}$, +1) = rbuf2(2)
+                end if
+            #:endfor
+        end if
+#endif
+
+    end subroutine s_compute_ib_neighbor_ranks
+
+    !> Exchange subdomain edge coordinates with face neighbors so each rank knows the spatial extent of its neighborhood
+    !! (its own subdomain plus one neighbor in each direction). Directions without a neighbor stay unbounded (+/-huge).
+    subroutine get_neighbor_bounds()
+
+        real(wp) :: send_val, recv_val
+        integer :: send_neighbor, recv_neighbor, ierr
+
+        ! Default: no neighbor in a direction leaves the neighborhood unbounded on that side
+        neighbor_domain_x%beg = -huge(0._wp)
+        neighbor_domain_x%end = huge(0._wp)
+        neighbor_domain_y%beg = -huge(0._wp)
+        neighbor_domain_y%end = huge(0._wp)
+        neighbor_domain_z%beg = -huge(0._wp)
+        neighbor_domain_z%end = huge(0._wp)
+
+#ifdef MFC_MPI
+        #:for X, ID, TAG, DIM in [('x', 1, 100, 'm'), ('y', 2, 102, 'n'), ('z', 3, 104, 'p')]
+            if (num_dims >= ${ID}$) then
+                ! Step 1: broadcast left edge (-1 face) rightward; receive left neighbor's left edge -> neighbor_domain_${X}$%beg
+                send_val = ${X}$_cb(-1)
+                send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
+                recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
+                recv_val = -huge(0._wp)
+                call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG}$, &
+                    & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                neighbor_domain_${X}$%beg = recv_val
+
+                ! Step 2: broadcast right edge (${DIM}$ face) leftward; receive right neighbor's right edge ->
+                ! neighbor_domain_${X}$%end
+                send_val = ${X}$_cb(${DIM}$)
+                send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
+                recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
+                recv_val = huge(0._wp)
+                call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG + 1}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG + 1}$, &
+                    & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                neighbor_domain_${X}$%end = recv_val
+            end if
+        #:endfor
+#endif
+
+    end subroutine get_neighbor_bounds
+
 end module m_start_up
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index ce0ae48c4b..eb25dd9309 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -552,7 +552,6 @@ contains
         ! check if any IBMS are moving, and if so, update the markers, ghost points, levelsets, and levelset norms
         if (moving_immersed_boundary_flag) then
             call s_propagate_immersed_boundaries(s)
-            ! compute ib forces for fixed immersed boundaries if requested for output
         end if
 
         ! update the ghost fluid properties point values based on IB state
@@ -564,10 +563,11 @@
             end if
         end do
 
-        ! if (ib) then
-        if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs()
-        if (ib_state_wrt .and. (.not. moving_immersed_boundary_flag)) then
+        if (moving_immersed_boundary_flag) then
+            call s_wrap_periodic_ibs() ! wraps the positions of IBs to the local proc
+            call s_handoff_ib_ownership() ! recomputes which ranks own which IBs and communicate to neighbors
+        else if (ib_state_wrt) then
             call s_compute_ib_forces(q_prim_vf, fluid_pp)
         end if
         end if
@@ -714,6 +714,8 @@ contains
 
         forces_computed = .false.
 
+        if (moving_immersed_boundary_flag) call s_compute_ib_forces(q_prim_vf, fluid_pp)
+
         do i = 1, num_ibs
             if (s == 1) then
                 patch_ib(i)%step_vel = patch_ib(i)%vel
@@ -726,11 +728,6 @@
         ! Compute forces BEFORE the RK velocity blend so the device copy of patch_ib%vel matches the host (pre-blend) when
         !
velocity-dependent collision damping forces are evaluated on the GPU. - if (patch_ib(i)%moving_ibm == 2 .and. .not. forces_computed) then - call s_compute_ib_forces(q_prim_vf, fluid_pp) - forces_computed = .true. - end if - if (patch_ib(i)%moving_ibm > 0) then patch_ib(i)%vel = (rk_coef(s, 1)*patch_ib(i)%step_vel + rk_coef(s, 2)*patch_ib(i)%vel)/rk_coef(s, 4) patch_ib(i)%angular_vel = (rk_coef(s, 1)*patch_ib(i)%step_angular_vel + rk_coef(s, &