diff --git a/docs/documentation/case.md b/docs/documentation/case.md index 5acd85a7a3..dbd127f23c 100644 --- a/docs/documentation/case.md +++ b/docs/documentation/case.md @@ -638,7 +638,7 @@ To restart the simulation from $k$-th time step, see @ref running "Restarting Ca | `alpha_wrt(i)` | Logical | Add the volume fraction of fluid $i$ to the database | | `gamma_wrt` | Logical | Add the specific heat ratio function to the database | | `heat_ratio_wrt` | Logical | Add the specific heat ratio to the database | -| `ib_state_wrt` | Logical | Write IB state and loads to a datafile at each time step | +| `ib_state_wrt` | Logical | Parameter to handle writing IB state on saves and outputting the state as a point mesh to SILO files. | | `pi_inf_wrt` | Logical | Add the liquid stiffness function to the database | | `pres_inf_wrt` | Logical | Add the liquid stiffness to the formatted database | | `c_wrt` | Logical | Add the sound speed to the database | @@ -706,7 +706,7 @@ If `file_per_process` is true, then pre_process, simulation, and post_process mu - `probe_wrt` activates the output of state variables at coordinates specified by `probe(i)%[x;y,z]`. -- `ib_state_wrt` activates the output of data specified by patch_ib(i)%force(:) (and torque, vel, angular_vel, angles, [x,y,z]_centroid) into a single binary datafile for all IBs at all timesteps. During post_processing, this file is converted into separate time histories for each IB. +- `ib_state_wrt` is used to trigger post-processing of the IB state to be written out as a point mesh in the SILO files. When no IBs are moving, it also triggers force and torque calculation so that those values may be written to the output state files. - `output_partial_domain` activates the output of part of the domain specified by `[x,y,z]_output%%beg` and `[x,y,z]_output%%end`. This is useful for large domains where only a portion of the domain is of interest. 
diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index cacc50f528..d66fb10614 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -25,6 +25,7 @@ module m_constants integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation integer, parameter :: num_patches_max = 10 !< Maximum number of IC patches integer, parameter :: num_ib_patches_max = 50000 !< Maximum number of immersed boundary patches (patch_ib) + integer, parameter :: num_local_ibs_max = 2000 !< Maximum number of immersed boundary patches (patch_ib) integer, parameter :: num_bc_patches_max = 10 !< Maximum number of boundary condition patches integer, parameter :: max_2d_fourier_modes = 10 !< Max Fourier mode index for 2D modal patch (geometry 13) integer, parameter :: max_sph_harm_degree = 5 !< Max degree L for 3D spherical harmonic patch (geometry 14) diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 6ee2e836a5..c4e5873ead 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -202,12 +202,10 @@ module m_derived_types type :: t_model_array ! Original CPU-side fields (unchanged) - type(t_model), allocatable :: model !< STL/OBJ geometry model - real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices - real(wp), allocatable, dimension(:,:) :: interpolated_boundary_v !< Interpolated boundary vertices - integer :: boundary_edge_count !< Number of boundary edges - integer :: total_vertices !< Total vertex count - integer :: interpolate !< Interpolation flag + type(t_model), allocatable :: model !< STL/OBJ geometry model + real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices + integer :: boundary_edge_count !< Number of boundary edges + integer :: total_vertices !< Total vertex count ! 
GPU-friendly flattened arrays integer :: ntrs !< Copy of model%ntrs @@ -272,9 +270,10 @@ module m_derived_types end type ic_patch_parameters type ib_patch_parameters - integer :: geometry !< Type of geometry for the patch + integer :: gbl_patch_id real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch + !> Centroid locations of intermediate steps in the time_stepper module real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid real(wp), dimension(1:3) :: centroid_offset !< offset of center of mass from computed cell center for odd-shaped IBs diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp index 0dab036f9b..f02474f8d0 100644 --- a/src/common/m_model.fpp +++ b/src/common/m_model.fpp @@ -983,10 +983,11 @@ contains dx_local = minval(dx); dy_local = minval(dy) if (p /= 0) dz_local = minval(dz) - allocate (stl_bounding_boxes(num_ibs,1:3,1:3)) + allocate (stl_bounding_boxes(num_gbl_ibs,1:3,1:3)) do patch_id = 1, num_ibs if (patch_ib(patch_id)%geometry == 5 .or. 
patch_ib(patch_id)%geometry == 12) then + print *, proc_rank, patch_id, num_ibs, patch_ib(patch_id)%geometry allocate (models(patch_id)%model) print *, " * Reading model: " // trim(patch_ib(patch_id)%model_filepath) diff --git a/src/post_process/m_data_output.fpp b/src/post_process/m_data_output.fpp index aece31ff8d..3a3a7bb10c 100644 --- a/src/post_process/m_data_output.fpp +++ b/src/post_process/m_data_output.fpp @@ -1396,48 +1396,42 @@ contains end do close (file_unit) - end if - - call MPI_BCAST(ib_data, nBodies*NFIELDS_PER_IB, mpi_p, 0, MPI_COMM_WORLD, ierr) - - do i = 1, nBodies - force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) - torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) - vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) - omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) - angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) - px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) - ib_diameter(i) = ib_data(i, 20)*2.0_wp - end do - if (proc_rank == 0) then - do i = 1, num_procs - write (meshnames(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:ib_bodies' - meshtypes(i) = DB_POINTMESH + do i = 1, nBodies + force_x(i) = ib_data(i, 2); force_y(i) = ib_data(i, 3); force_z(i) = ib_data(i, 4) + torque_x(i) = ib_data(i, 5); torque_y(i) = ib_data(i, 6); torque_z(i) = ib_data(i, 7) + vel_x(i) = ib_data(i, 8); vel_y(i) = ib_data(i, 9); vel_z(i) = ib_data(i, 10) + omega_x(i) = ib_data(i, 11); omega_y(i) = ib_data(i, 12); omega_z(i) = ib_data(i, 13) + angle_x(i) = ib_data(i, 14); angle_y(i) = ib_data(i, 15); angle_z(i) = ib_data(i, 16) + px(i) = ib_data(i, 17); py(i) = ib_data(i, 18); pz(i) = ib_data(i, 19) + ib_diameter(i) = ib_data(i, 20)*2.0_wp end do + + write (meshnames(1), '(A,I0,A)') '../p0/', t_step, '.silo:ib_bodies' + meshtypes(1) = DB_POINTMESH err = 
DBSET2DSTRLEN(len(meshnames(1))) - err = DBPUTMMESH(dbroot, 'ib_bodies', 16, num_procs, meshnames, len_trim(meshnames), meshtypes, DB_F77NULL, ierr) + err = DBPUTMMESH(dbroot, 'ib_bodies', 16, 1, meshnames, len_trim(meshnames), meshtypes, DB_F77NULL, ierr) + + err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) + + call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) + call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) + call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) + call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) + call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) + call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) + call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) + call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) + call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) + call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) + call s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) + call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) + call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) + call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) + call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) + call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) end if - err = DBPUTPM(dbfile, 'ib_bodies', 9, 3, px, py, pz, nBodies, DB_DOUBLE, DB_F77NULL, ierr) - - call s_write_ib_variable('ib_force_x', t_step, force_x, nBodies) - call s_write_ib_variable('ib_force_y', t_step, force_y, nBodies) - call s_write_ib_variable('ib_force_z', t_step, force_z, nBodies) - call s_write_ib_variable('ib_torque_x', t_step, torque_x, nBodies) - call s_write_ib_variable('ib_torque_y', t_step, torque_y, nBodies) - call s_write_ib_variable('ib_torque_z', t_step, torque_z, nBodies) - call s_write_ib_variable('ib_vel_x', t_step, vel_x, nBodies) - 
call s_write_ib_variable('ib_vel_y', t_step, vel_y, nBodies) - call s_write_ib_variable('ib_vel_z', t_step, vel_z, nBodies) - call s_write_ib_variable('ib_omega_x', t_step, omega_x, nBodies) - call s_write_ib_variable('ib_omega_y', t_step, omega_y, nBodies) - call s_write_ib_variable('ib_omega_z', t_step, omega_z, nBodies) - call s_write_ib_variable('ib_angle_x', t_step, angle_x, nBodies) - call s_write_ib_variable('ib_angle_y', t_step, angle_y, nBodies) - call s_write_ib_variable('ib_angle_z', t_step, angle_z, nBodies) - call s_write_ib_variable('ib_diameter', t_step, ib_diameter, nBodies) - deallocate (ib_data, px, py, pz, force_x, force_y, force_z) deallocate (torque_x, torque_y, torque_z, vel_x, vel_y, vel_z) deallocate (omega_x, omega_y, omega_z, angle_x, angle_y, angle_z) @@ -1450,23 +1444,18 @@ contains !> Write a single IB point-variable to the Silo database slave and master files. subroutine s_write_ib_variable(varname, t_step, data, nBodies) - character(len=*), intent(in) :: varname - integer, intent(in) :: t_step - real(wp), dimension(:), intent(in) :: data - integer, intent(in) :: nBodies - character(len=4*name_len), dimension(num_procs) :: var_names - integer, dimension(num_procs) :: var_types - integer :: ierr, i - - if (proc_rank == 0) then - do i = 1, num_procs - write (var_names(i), '(A,I0,A,I0,A)') '../p', i - 1, '/', t_step, '.silo:' // trim(varname) - var_types(i) = DB_POINTVAR - end do - err = DBSET2DSTRLEN(len(var_names(1))) - err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), num_procs, var_names, len_trim(var_names), var_types, & - & DB_F77NULL, ierr) - end if + character(len=*), intent(in) :: varname + integer, intent(in) :: t_step + real(wp), dimension(:), intent(in) :: data + integer, intent(in) :: nBodies + character(len=4*name_len) :: var_name_entry + integer :: var_type_entry, ierr + + write (var_name_entry, '(A,I0,A)') '../p0/', t_step, '.silo:' // trim(varname) + var_type_entry = DB_POINTVAR + err = 
DBSET2DSTRLEN(len(var_name_entry)) + err = DBPUTMVAR(dbroot, trim(varname), len_trim(varname), 1, var_name_entry, len_trim(var_name_entry), var_type_entry, & + & DB_F77NULL, ierr) err = DBPUTPV1(dbfile, trim(varname), len_trim(varname), 'ib_bodies', 9, data, nBodies, DB_DOUBLE, DB_F77NULL, ierr) diff --git a/src/simulation/m_collisions.fpp b/src/simulation/m_collisions.fpp index fe860cb055..c871496fd9 100644 --- a/src/simulation/m_collisions.fpp +++ b/src/simulation/m_collisions.fpp @@ -19,7 +19,8 @@ module m_collisions implicit none - private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module + private; public :: s_apply_collision_forces, s_initialize_collisions_module, s_finalize_collisions_module, & + & f_local_rank_owns_collision ! overlap distances for computing collisions integer, allocatable, dimension(:,:) :: collision_lookup real(wp), allocatable, dimension(:,:) :: wall_overlap_distances @@ -405,35 +406,35 @@ contains logical :: owns_collision real(wp), dimension(3) :: projected_location - #:if defined('MFC_MPI') - if (num_procs == 1) then - owns_collision = .true. - else - projected_location(:) = collision_location(:) - - ! catch the edge case where th collision lies just outside the computational domain - #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] - if (num_dims >= ${ID}$) then - if (ib_bc_${X}$%beg /= BC_PERIODIC) then - ! if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it - if (collision_location(${ID}$) < ${X}$_domain%beg) then - projected_location(${ID}$) = ${X}$_domain%beg - else if (${X}$_domain%end < collision_location(${ID}$)) then - projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp - end if +#ifdef MFC_MPI + if (num_procs == 1) then + owns_collision = .true. + else + projected_location(:) = collision_location(:) + + ! 
catch the edge case where th collision lies just outside the computational domain + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + if (ib_bc_${X}$%beg /= BC_PERIODIC) then + ! if it is outside the domain in one direction, project it somewhere inside so at least one rank owns it + if (collision_location(${ID}$) < ${X}$_domain%beg) then + projected_location(${ID}$) = ${X}$_domain%beg + else if (${X}$_domain%end < collision_location(${ID}$)) then + projected_location(${ID}$) = ${X}$_domain%end - 1.0e-10_wp end if end if - #:endfor + end if + #:endfor - ! the object that contains the collision location owns the collisions - owns_collision = x_cb(-1) <= projected_location(1) .and. projected_location(1) < x_cb(m) - owns_collision = owns_collision .and. y_cb(-1) <= projected_location(2) .and. projected_location(2) < y_cb(n) - if (num_dims == 3) owns_collision = owns_collision .and. z_cb(-1) <= projected_location(3) & - & .and. projected_location(3) < z_cb(p) - end if - #:else - owns_collision = .true. - #:endif + ! the object that contains the collision location owns the collisions + owns_collision = x_cb(-1) <= projected_location(1) .and. projected_location(1) < x_cb(m) + owns_collision = owns_collision .and. y_cb(-1) <= projected_location(2) .and. projected_location(2) < y_cb(n) + if (num_dims == 3) owns_collision = owns_collision .and. z_cb(-1) <= projected_location(3) .and. projected_location(3) & + & < z_cb(p) + end if +#else + owns_collision = .true. 
+#endif end function f_local_rank_owns_collision diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index a5fdb3ef81..effc020325 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -909,7 +909,7 @@ contains integer :: ifile, ierr integer, dimension(MPI_STATUS_SIZE) :: status logical :: file_exist - integer :: i + integer :: i, ib_idx integer, parameter :: NFIELDS_PER_IB = 20 real(wp) :: ib_buf(NFIELDS_PER_IB) @@ -923,19 +923,6 @@ contains end if call s_mpi_barrier() - ! Divide num_ibs across num_procs - nibs_per_rank = num_ibs/num_procs - remainder = mod(num_ibs, num_procs) - - ! Ranks < remainder get one extra IB - if (proc_rank < remainder) then - ib_start = proc_rank*(nibs_per_rank + 1) + 1 - ib_end = ib_start + nibs_per_rank ! nibs_per_rank + 1 total - else - ib_start = remainder*(nibs_per_rank + 1) + (proc_rank - remainder)*nibs_per_rank + 1 - ib_end = ib_start + nibs_per_rank - 1 - end if - write (file_loc, '(A,I0,A)') '/restart_data/ib_state_', t_step, '.dat' file_loc = trim(case_dir) // trim(file_loc) @@ -947,20 +934,21 @@ contains call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) - do i = ib_start, ib_end + do i = 1, num_local_ibs + ib_idx = local_ib_patch_ids(i) ib_buf(1) = mytime - ib_buf(2:4) = patch_ib(i)%force(1:3) - ib_buf(5:7) = patch_ib(i)%torque(1:3) - ib_buf(8:10) = patch_ib(i)%vel(1:3) - ib_buf(11:13) = patch_ib(i)%angular_vel(1:3) - ib_buf(14:16) = patch_ib(i)%angles(1:3) - ib_buf(17) = patch_ib(i)%x_centroid - ib_buf(18) = patch_ib(i)%y_centroid - ib_buf(19) = patch_ib(i)%z_centroid - ib_buf(20) = patch_ib(i)%radius + ib_buf(2:4) = patch_ib(ib_idx)%force(1:3) + ib_buf(5:7) = patch_ib(ib_idx)%torque(1:3) + ib_buf(8:10) = patch_ib(ib_idx)%vel(1:3) + ib_buf(11:13) = patch_ib(ib_idx)%angular_vel(1:3) + ib_buf(14:16) = patch_ib(ib_idx)%angles(1:3) + ib_buf(17) = patch_ib(ib_idx)%x_centroid + ib_buf(18) = 
patch_ib(ib_idx)%y_centroid + ib_buf(19) = patch_ib(ib_idx)%z_centroid + ib_buf(20) = patch_ib(ib_idx)%radius ! Global IB index (i) determines position in file - disp = int(i - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK + disp = int(patch_ib(ib_idx)%gbl_patch_id - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK call MPI_FILE_WRITE_AT(ifile, disp, ib_buf, NFIELDS_PER_IB, mpi_p, status, ierr) end do diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 7fb3353e2c..446a3c024c 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -229,7 +229,8 @@ module m_global_parameters $:GPU_DECLARE(create='[ib_bc_x, ib_bc_y, ib_bc_z]') #endif type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') + type(bounds_info) :: neighbor_domain_x, neighbor_domain_y, neighbor_domain_z + $:GPU_DECLARE(create='[x_domain, y_domain, z_domain, neighbor_domain_x, neighbor_domain_y, neighbor_domain_z]') real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b logical :: parallel_io !< Format of the data files @@ -336,18 +337,22 @@ module m_global_parameters !> @name Immersed Boundaries !> @{ - logical :: ib - integer :: num_ibs - integer :: collision_model - real(wp) :: coefficient_of_restitution - real(wp) :: collision_time - real(wp) :: ib_coefficient_of_friction - logical :: ib_state_wrt - type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib !< Immersed boundary patch parameters - type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l - integer :: Np - - $:GPU_DECLARE(create='[ib, num_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l]') + logical :: ib + integer :: num_ibs !< number of IBs that the current processor is aware of + integer :: num_gbl_ibs !< number of IBs in the overall simulation + integer :: num_local_ibs !< number of IBs that lie inside the processor domain + integer :: 
collision_model + real(wp) :: coefficient_of_restitution + real(wp) :: collision_time + real(wp) :: ib_coefficient_of_friction + logical :: ib_state_wrt + type(ib_patch_parameters), allocatable, dimension(:) :: patch_ib !< Immersed boundary patch parameters + integer, dimension(num_local_ibs_max) :: local_ib_patch_ids !< lookup table of IBs in the local compute domain + integer, dimension(-1:1,-1:1,-1:1) :: ib_neighbor_ranks !< MPI ranks of all 26 neighbor domains + type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l + integer :: Np + + $:GPU_DECLARE(create='[ib, num_ibs, num_gbl_ibs, num_local_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l, local_ib_patch_ids]') $:GPU_DECLARE(create='[ib_coefficient_of_friction]') !> @} @@ -787,7 +792,9 @@ contains relativity = .false. #:endif + allocate (patch_ib(num_ib_patches_max)) do i = 1, num_ib_patches_max + patch_ib(i)%gbl_patch_id = i patch_ib(i)%geometry = dflt_int patch_ib(i)%x_centroid = 0._wp patch_ib(i)%y_centroid = 0._wp diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index cf3ad9ecfa..1770e9c271 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -109,7 +109,7 @@ contains radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -221,7 +221,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -376,7 +376,7 @@ contains end if ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -466,7 +466,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -510,16 +510,12 @@ contains real(wp) :: radius real(wp), dimension(1:3) :: center - ! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. (2013) - - ! Transferring spherical patch's radius, centroid, smoothing patch identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) radius = patch_ib(patch_id)%radius - ! completely skip particles no in the domain + ! completely skip particles not in the domain if (center(1) - radius > x_cc(m + gp_layers + 1) .or. center(1) + radius < x_cc(-gp_layers - 1) .or. center(2) & & - radius > y_cc(n + gp_layers + 1) .or. center(2) + radius < y_cc(-gp_layers - 1) .or. center(3) - radius > z_cc(p & & + gp_layers + 1) .or. center(3) + radius < z_cc(-gp_layers - 1)) then @@ -527,7 +523,7 @@ contains end if ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -542,18 +538,12 @@ contains ! Checking whether the sphere covers a particular cell in the domain and verifying whether the current patch has permission ! to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to this cell. - $:GPU_PARALLEL_LOOP(private='[i, j, k, cart_y, cart_z]', copyin='[encoded_patch_id, center, radius]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, k]', copyin='[encoded_patch_id, center, radius]', collapse=3) do k = kl, kr do j = jl, jr do i = il, ir - if (grid_geometry == 3) then - call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k)) - else - cart_y = y_cc(j) - cart_z = z_cc(k) - end if ! Updating the patch identities bookkeeping variable - if (((x_cc(i) - center(1))**2 + (cart_y - center(2))**2 + (cart_z - center(3))**2 <= radius**2)) then + if (((x_cc(i) - center(1))**2 + (y_cc(j) - center(2))**2 + (z_cc(k) - center(3))**2 <= radius**2)) then ib_markers%sf(i, j, k) = encoded_patch_id end if end do @@ -586,7 +576,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -656,7 +646,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -724,7 +714,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -781,7 +771,7 @@ contains threshold = patch_ib(patch_id)%model_threshold ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -858,7 +848,7 @@ contains rotation(:,:) = patch_ib(patch_id)%rotation_matrix(:,:) ! 
encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -1039,7 +1029,7 @@ contains temp_y_per = y_periodicity; if (y_periodicity == -1) temp_y_per = 2 temp_z_per = z_periodicity; if (z_periodicity == -1) temp_z_per = 2 - offset = (num_ibs + 1)*temp_x_per + 3*(num_ibs + 1)*temp_y_per + 9*(num_ibs + 1)*temp_z_per + offset = (num_gbl_ibs + 1)*temp_x_per + 3*(num_gbl_ibs + 1)*temp_y_per + 9*(num_gbl_ibs + 1)*temp_z_per encoded_patch_id = patch_id + offset end subroutine s_encode_patch_periodicity @@ -1054,7 +1044,7 @@ contains integer, intent(out), optional :: x_periodicity, y_periodicity, z_periodicity integer :: offset, remainder, xp, yp, zp, base - base = num_ibs + 1 + base = num_gbl_ibs + 1 patch_id = mod(encoded_patch_id - 1, base) + 1 offset = (encoded_patch_id - patch_id)/base diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 5f865cffeb..7c393155a6 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -71,11 +71,9 @@ contains call nvtxStartRange("SETUP-IBM-MODULE") ! do all set up for moving immersed boundaries - moving_immersed_boundary_flag = .false. do i = 1, num_ibs if (patch_ib(i)%moving_ibm /= 0) then call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) - moving_immersed_boundary_flag = .true. end if call s_update_ib_rotation_matrix(i) end do @@ -123,6 +121,8 @@ contains call nvtxEndRange + ! print *, proc_rank, num_local_ibs, num_ibs, num_gbl_ibs + end subroutine s_ibm_setup !> Update the conservative variables at the ghost points @@ -1010,9 +1010,8 @@ contains call s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques) - ! 
reduce the forces across all MPI ranks - call s_mpi_allreduce_vectors_sum(forces, forces, num_ibs, 3) - call s_mpi_allreduce_vectors_sum(torques, torques, num_ibs, 3) + ! reduce the forces across local neighborhood ranks + call s_communicate_ib_forces(forces, torques) ! consider body forces after reducing to avoid double counting do i = 1, num_ibs @@ -1225,4 +1224,392 @@ contains end subroutine s_wrap_periodic_ibs + !> @brief reasseses ownership of IBs and passes ownership of IBs to neighbor processors + !> Reduces forces and torques across the local neighborhood without a global allreduce. Accumulation phase: 2 passes per + !! dimension receiving from the low-index (-X) neighbor. Pass 1: add received values; save what was received as recv_snap. Pass + !! 2: send current (post-pass-1) values; add received; subtract recv_snap to remove double-counting of the direct contribution + !! already added in pass 1. Back-propagation phase: 2 passes per dimension receiving from the high-index (+X) neighbor, each + !! overwriting local forces with the neighbor's accumulated total. + subroutine s_communicate_ib_forces(forces, torques) + + real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + +#ifdef MFC_MPI + integer :: i, j, pack_pos, unpack_pos, buf_size, ierr + integer :: send_neighbor, recv_neighbor, recv_count, pid + real(wp), dimension(3) :: fval, tval + real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:) + character(len=1), allocatable :: send_buf(:), recv_buf(:) + + if (num_procs == 1) return + + buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib) + allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3)) + + ! Accumulation phase: propagate contributions toward the high-index corner. 
+ #:for X, ID, TAG1, TAG2 in [('x', 1, 300, 302), ('y', 2, 304, 306), ('z', 3, 308, 310)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + + ! Pass 1: send current forces to +${X}$ neighbor; receive from -${X}$ neighbor and add. Save what was received as + ! recv_snap for double-count removal in pass 2. + recv_forces_snap = 0._wp + recv_torques_snap = 0._wp + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG1}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG1}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + recv_forces_snap(j,:) = fval(:) + recv_torques_snap(j,:) = tval(:) + forces(j,:) = forces(j,:) + fval(:) + torques(j,:) = torques(j,:) + tval(:) + exit + end if + end do + end do + end if + + ! Pass 2: send post-pass-1 forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then + ! 
subtract recv_snap to remove the pass-1 contribution that was already counted, leaving only the 2-hop delta. + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG2}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG2}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = forces(j,:) + fval(:) - recv_forces_snap(j,:) + torques(j,:) = torques(j,:) + tval(:) - recv_torques_snap(j,:) + exit + end if + end do + end do + end if + end if + #:endfor + + ! Back-propagation phase: for each dimension, 2 passes receiving from the high-index neighbor. Each pass overwrites local + ! forces with the neighbor's accumulated total. Two passes ensure the total reaches 2 hops back, covering the full + ! neighborhood. 
+ #:for X, ID, TAG1, TAG2 in [('x', 1, 312, 314), ('y', 2, 316, 318), ('z', 3, 320, 322)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + + #:for TAG in [TAG1, TAG2] + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = fval(:) + torques(j,:) = tval(:) + exit + end if + end do + end do + end if + #:endfor + end if + #:endfor + + deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap) +#endif + + end subroutine s_communicate_ib_forces + + !> Alternative force reduction using two non-blocking all-to-neighbor broadcasts. Phase 1: every rank sends its full force array + !! to all 26 neighborhood neighbors simultaneously. After MPI_WAITALL, each rank sums contributions from neighbors for its owned + !! particles. 
Phase 2: each rank sends its finalized owned-particle forces (by gbl_patch_id) back to all neighbors + !! simultaneously. After MPI_WAITALL, each rank overwrites ghost-particle forces with the authoritative values from the owning + !! rank. Not currently called - available for benchmarking against s_communicate_ib_forces. + ! subroutine s_communicate_ib_forces_scatter(forces, torques) + + ! real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + + ! #ifdef MFC_MPI integer, parameter :: max_nbrs = 26 integer :: i, j, k, nbr_idx, nreqs, pack_pos, unpack_pos integer :: + ! buf_size, entry_bytes, ierr, recv_count, pid integer :: send_neighbor, recv_neighbor, dx, dy, dz, tag integer, dimension(3) :: + ! nbr_coords logical :: is_owned real(wp), dimension(3) :: fval, tval real(wp), dimension(num_ibs, 3) :: forces_total, + ! torques_total integer, dimension(max_nbrs) :: recv_neighbor_list integer, dimension(2*max_nbrs) :: requests character(len=1), + ! allocatable :: send_buf(:), recv_bufs(:,:) character(len=1), allocatable :: owned_send_buf(:), owned_recv_bufs(:,:) integer :: + ! owned_buf_size + + ! if (num_procs == 1) return + + ! ! Buffer sized to hold count + (gbl_patch_id, forces, torques) per particle entry_bytes = storage_size(0)/8 + + ! 6*storage_size(0._wp)/8 buf_size = storage_size(0)/8 + entry_bytes*num_ibs owned_buf_size = storage_size(0)/8 + + ! entry_bytes*num_local_ibs_max allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs), owned_send_buf(owned_buf_size), & & + ! owned_recv_bufs(owned_buf_size, max_nbrs)) + + ! ! Phase 1: pack full local force array and broadcast to all neighborhood neighbors. pack_pos = 0 call MPI_PACK(num_ibs, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) do i = 1, num_ibs call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) fval(:) = forces(i,:); tval(:) = torques(i,:) call + ! 
MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, + ! pack_pos, MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & & + ! requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! == 0) cycle tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr) + ! end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Local reduction: for each owned particle, sum contributions from all neighbors. forces_total = forces torques_total = + ! torques do nbr_idx = 1, merge(26, 8, num_dims == 3) if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 + ! call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) do i = 1, + ! recv_count call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! 
MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only accumulate for particles + ! this rank owns do k = 1, num_local_ibs j = local_ib_patch_ids(k) if (patch_ib(j)%gbl_patch_id == pid) then forces_total(j,:) = + ! forces_total(j,:) + fval(:) torques_total(j,:) = torques_total(j,:) + tval(:) exit end if end do end do end do + + ! ! Write totals back for owned particles only do k = 1, num_local_ibs j = local_ib_patch_ids(k) forces(j,:) = forces_total(j,:) + ! torques(j,:) = torques_total(j,:) end do + + ! ! Phase 2: pack finalized owned-particle forces and back-broadcast to all neighbors. pack_pos = 0 call MPI_PACK(num_local_ibs, + ! 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) do k = 1, num_local_ibs j = + ! local_ib_patch_ids(k) call MPI_PACK(patch_ib(j)%gbl_patch_id, 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, + ! MPI_COMM_WORLD, ierr) fval(:) = forces(j,:); tval(:) = torques(j,:) call MPI_PACK(fval, 3, mpi_p, owned_send_buf, + ! owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, owned_send_buf, owned_buf_size, pack_pos, + ! MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(owned_recv_bufs(:,nbr_idx), owned_buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & + ! & requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! 
== 0) cycle tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(owned_send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), + ! ierr) end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Overwrite ghost-particle forces with authoritative values from the owning rank. do nbr_idx = 1, merge(26, 8, num_dims == 3) + ! if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 call MPI_UNPACK(owned_recv_bufs(:,nbr_idx), + ! owned_buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, & & ierr) do i = 1, recv_count call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only overwrite + ! ghost particles (not owned ones - this rank's total is authoritative) do j = 1, num_ibs if (patch_ib(j)%gbl_patch_id == pid) + ! then is_owned = .false. do k = 1, num_local_ibs if (local_ib_patch_ids(k) == j) then is_owned = .true. exit end if end do if + ! (.not. is_owned) then forces(j,:) = fval(:) torques(j,:) = tval(:) end if exit end if end do end do end do + + ! deallocate (send_buf, recv_bufs, owned_send_buf, owned_recv_bufs) #endif + + ! 
end subroutine s_communicate_ib_forces_scatter
+
+    !> Re-establishes IB patch ownership after moving IBs have been advanced. Drops patches whose centroid left this
+    !! rank's neighborhood, compacts patch_ib in place, recomputes which surviving patches are locally owned, and
+    !! broadcasts newly-owned patches to all neighborhood neighbors (26 in 3D, 8 in 2D) so every rank's awareness list
+    !! stays consistent. No-op for single-rank runs.
+    subroutine s_handoff_ib_ownership()
+
+        integer :: i, j, k, output_idx, local_output_idx
+        integer :: old_num_local_ibs
+        integer :: new_count, recv_count
+        integer :: pack_pos, unpack_pos, buf_size, patch_bytes
+        integer :: send_neighbor, recv_neighbor, ierr
+        integer :: dx, dy, dz, tag, nbr_idx, nreqs
+        real(wp), dimension(3) :: centroid
+        logical :: is_new, already_known
+        type(ib_patch_parameters) :: tmp_patch
+        integer, dimension(num_local_ibs_max) :: local_ib_idx_old
+        ! 26 neighbors max in 3D (8 in 2D); each gets its own recv buffer
+        integer, parameter :: max_nbrs = 26
+        character(len=1), allocatable :: send_buf(:), recv_bufs(:,:)
+        integer, dimension(2*max_nbrs) :: requests
+        integer, dimension(max_nbrs) :: recv_neighbor_list
+
+#ifdef MFC_MPI
+        if (num_procs > 1) then
+            ! Save each locally-owned patch's global id so newly-acquired patches can be identified after the rebuild.
+            local_ib_idx_old = 0
+            old_num_local_ibs = num_local_ibs
+            do i = 1, num_local_ibs
+                local_ib_idx_old(i) = patch_ib(local_ib_patch_ids(i))%gbl_patch_id
+            end do
+
+            ! Delete any patches that no longer need to be tracked and coalesce the array in place.
+            output_idx = 0
+            local_output_idx = 0
+            do i = 1, num_ibs
+                ! Drop the patch if its centroid left the neighborhood (this rank's subdomain plus one neighbor each way).
+                #:for X, ID in [('x', 1), ('y', 2), ('z', 3)]
+                    if (patch_ib(i)%${X}$_centroid < neighbor_domain_${X}$%beg .or. neighbor_domain_${X}$%end < patch_ib(i) &
+                        & %${X}$_centroid) then
+                        cycle
+                    end if
+                #:endfor
+                output_idx = output_idx + 1
+                if (i /= output_idx) patch_ib(output_idx) = patch_ib(i)
+
+                ! Record the patch as locally owned when its centroid falls in this rank's subdomain.
+                ! NOTE(review): no overflow guard on local_ib_patch_ids - assumed sized >= num_local_ibs_max; confirm.
+                centroid = [patch_ib(i)%x_centroid, patch_ib(i)%y_centroid, 0._wp]
+                if (num_dims == 3) centroid(3) = patch_ib(i)%z_centroid
+                if (f_local_rank_owns_collision(centroid)) then
+                    local_output_idx = local_output_idx + 1
+                    local_ib_patch_ids(local_output_idx) = output_idx
+                end if
+            end do
+            num_ibs = output_idx
+            num_local_ibs = local_output_idx
+
+            ! Broadcast newly-owned patches to all neighborhood neighbors (including corners/edges).
+            patch_bytes = storage_size(tmp_patch)/8
+            buf_size = storage_size(0)/8 + patch_bytes*num_local_ibs_max
+            allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs))
+
+            ! Write a placeholder count at position 0; it is overwritten once the new patches have been counted.
+            pack_pos = 0
+            call MPI_PACK(0, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+
+            ! Pack every patch that was not locally owned before the rebuild (i.e., newly acquired by this rank).
+            new_count = 0
+            do i = 1, num_local_ibs
+                k = local_ib_patch_ids(i)
+                is_new = .true.
+                do j = 1, old_num_local_ibs
+                    if (patch_ib(k)%gbl_patch_id == local_ib_idx_old(j)) then
+                        is_new = .false.
+                        exit
+                    end if
+                end do
+                if (is_new) then
+                    print *, proc_rank, " New Owner ", patch_ib(k)%gbl_patch_id ! NOTE(review): debug print; remove or guard for production
+                    call MPI_PACK(patch_ib(k), patch_bytes, MPI_BYTE, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+                    new_count = new_count + 1
+                end if
+            end do
+
+            ! Overwrite the placeholder with the real count, then restore pack_pos to the end of the packed data.
+            ! NOTE(review): assumes MPI packs an integer into exactly storage_size(0)/8 bytes; portable code would use
+            ! MPI_PACK_SIZE to compute packed extents - TODO confirm.
+            pack_pos = 0
+            call MPI_PACK(new_count, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr)
+            pack_pos = storage_size(0)/8 + new_count*patch_bytes
+
+            ! Post all receives first, then sends, using the pre-built ib_neighbor_ranks lookup.
+            ! Tags: 200 + (dx+1)*9 + (dy+1)*3 + (dz+1)
+            nreqs = 0
+            nbr_idx = 0
+            do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3)
+                do dy = -1, 1
+                    do dx = -1, 1
+                        if (dx == 0 .and. dy == 0 .and. dz == 0) cycle
+                        nbr_idx = nbr_idx + 1
+                        tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1)
+                        recv_neighbor = ib_neighbor_ranks(-dx, -dy, -dz)
+                        recv_neighbor_list(nbr_idx) = MPI_PROC_NULL
+                        if (recv_neighbor < 0) cycle
+                        recv_neighbor_list(nbr_idx) = recv_neighbor
+                        nreqs = nreqs + 1
+                        call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, &
+                            & requests(nreqs), ierr)
+                    end do
+                end do
+            end do
+
+            do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3)
+                do dy = -1, 1
+                    do dx = -1, 1
+                        if (dx == 0 .and. dy == 0 .and. dz == 0) cycle
+                        tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1)
+                        send_neighbor = ib_neighbor_ranks(dx, dy, dz)
+                        if (send_neighbor < 0) cycle
+                        nreqs = nreqs + 1
+                        call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr)
+                    end do
+                end do
+            end do
+
+            call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr)
+
+            ! Unpack all received buffers, appending only patches this rank does not already track.
+            do nbr_idx = 1, merge(26, 8, num_dims == 3)
+                if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle
+                unpack_pos = 0
+                call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr)
+                do i = 1, recv_count
+                    call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tmp_patch, patch_bytes, MPI_BYTE, MPI_COMM_WORLD, &
+                        & ierr)
+                    already_known = .false.
+                    do j = 1, num_ibs
+                        if (patch_ib(j)%gbl_patch_id == tmp_patch%gbl_patch_id) then
+                            already_known = .true.
+                            exit
+                        end if
+                    end do
+                    if (.not. already_known) then
+                        num_ibs = num_ibs + 1
+                        @:ASSERT(num_ibs <= size(patch_ib), 'patch_ib overflow in neighborhood handoff')
+                        patch_ib(num_ibs) = tmp_patch
+                    end if
+                end do
+            end do
+
+            deallocate (send_buf, recv_bufs)
+        end if
+#endif
+
+    end subroutine s_handoff_ib_ownership
+
 end module m_ibm
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index d544324c8a..967afb3032 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -39,6 +39,7 @@ module m_start_up
 
     use m_nvtx
     use m_ibm
+    use m_collisions
     use m_compile_specific
     use m_checker_common
     use m_checker
@@ -915,6 +916,7 @@ contains
         if (model_eqns == 3) call s_initialize_internal_energy_equations(q_cons_ts(1)%vf)
         if (ib) then
             if (t_step_start > 0) call s_read_ib_restart_data(t_step_start)
+            call s_reduce_ib_patch_array()
             call s_ibm_setup()
             if (t_step_start == 0) then
                 call s_write_ib_data_file(0)
@@ -1010,6 +1012,8 @@ contains
 #else
             "on CPUs"
 #endif
+        else
+            allocate (patch_ib(num_ib_patches_max))
         end if
 
         call
s_mpi_bcast_user_inputs()
@@ -1192,4 +1196,208 @@ contains
 
     end subroutine s_read_ib_restart_data
 
+    !> @brief Takes the patch_ib struct array that contains all global IB patches and reduces it to only the patches
+    !!        that lie in (or just outside) the local computational domain's neighborhood.
+    subroutine s_reduce_ib_patch_array()
+
+        type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib_gbl
+        real(wp) :: position
+        integer :: i
+        integer :: num_aware_ibs
+        logical :: is_in_neighborhood, is_local
+
+        ! Flag whether any IB moves so time stepping knows to propagate IBs and hand off ownership.
+        moving_immersed_boundary_flag = .false.
+        do i = 1, num_ibs
+            if (patch_ib(i)%moving_ibm /= 0) then
+                moving_immersed_boundary_flag = .true.
+                exit
+            end if
+        end do
+
+        patch_ib_gbl(:) = patch_ib(:)
+        call get_neighbor_bounds() ! make sure the bounds of the neighbors are correctly set up
+        call s_compute_ib_neighbor_ranks() ! build lookup of all 26 neighbor MPI ranks
+
+        ! Shrink patch_ib to the "awareness" size: the most patches this rank and its direct neighbors could own.
+        deallocate (patch_ib)
+        if (num_dims == 3) then
+            num_aware_ibs = min(num_local_ibs_max*27, num_ib_patches_max)
+        else
+            num_aware_ibs = min(num_local_ibs_max*9, num_ib_patches_max)
+        end if
+        allocate (patch_ib(num_aware_ibs))
+
+        num_gbl_ibs = num_ibs
+
+#ifdef MFC_MPI
+        ! fallback for 1-rank case
+        ! NOTE(review): this branch keeps the original gbl_patch_id values and does not set num_local_ibs - confirm
+        ! serial runs never reach the ownership/force-communication paths that rely on them.
+        if (num_procs == 1) then
+            patch_ib(:) = patch_ib_gbl(1:num_aware_ibs)
+        else
+            ! Determine the set of patches owned by the local rank.
+            num_local_ibs = 0
+            num_ibs = 0
+            do i = 1, num_gbl_ibs
+                ! catch the edge case where the collision lies just outside the computational domain
+                is_in_neighborhood = .true.
+                is_local = .true.
+
+                #:for X, ID, DIM in [('x', 1, 'm'), ('y', 2, 'n'), ('z', 3, 'p')]
+                    if (num_dims >= ${ID}$) then
+                        position = patch_ib_gbl(i)%${X}$_centroid
+                        if (neighbor_domain_${X}$%beg > position .or. position > neighbor_domain_${X}$%end) then
+                            is_in_neighborhood = .false.
+                            is_local = .false.
+                        else if (${X}$_cb(-1) > position .or. position > ${X}$_cb(${DIM}$)) then
+                            is_local = .false.
+                        end if
+                    end if
+                #:endfor
+
+                if (is_in_neighborhood) then
+                    num_ibs = num_ibs + 1
+                    patch_ib(num_ibs) = patch_ib_gbl(i)
+                    patch_ib(num_ibs)%gbl_patch_id = i
+                    if (is_local) then
+                        num_local_ibs = num_local_ibs + 1
+                        local_ib_patch_ids(num_local_ibs) = num_ibs
+                    end if
+                end if
+            end do
+        end if
+#else
+        ! reduce the size of the array for local simulation in no-MPI case
+        patch_ib(:) = patch_ib_gbl(1:num_aware_ibs)
+#endif
+
+    end subroutine s_reduce_ib_patch_array
+
+    !> Build ib_neighbor_ranks(-1:1,-1:1,-1:1): MPI ranks of all 26 neighbor domains. Uses two rounds of MPI_SENDRECV
+    !! cascades - face neighbors are known from bc_*, edge neighbors are obtained in round 1, and (3D) corner neighbors
+    !! in round 2.
+    subroutine s_compute_ib_neighbor_ranks()
+
+#ifdef MFC_MPI
+        integer :: ierr
+        integer, dimension(4) :: buf4, rbuf4
+        integer, dimension(2) :: buf2, rbuf2
+
+        ib_neighbor_ranks = MPI_PROC_NULL
+        ib_neighbor_ranks(0, 0, 0) = proc_rank
+
+        ! Face neighbors - already known from domain decomposition
+        ib_neighbor_ranks(-1, 0, 0) = bc_x%beg
+        ib_neighbor_ranks(+1, 0, 0) = bc_x%end
+        if (num_dims >= 2) then
+            ib_neighbor_ranks(0, -1, 0) = bc_y%beg
+            ib_neighbor_ranks(0, +1, 0) = bc_y%end
+        end if
+        if (num_dims == 3) then
+            ib_neighbor_ranks(0, 0, -1) = bc_z%beg
+            ib_neighbor_ranks(0, 0, +1) = bc_z%end
+        end if
+
+        if (num_dims >= 2) then
+            ! Round 1a: exchange y/z face ranks with +/-x face neighbors -> xy and xz edge ranks. The reply is received
+            ! into the separate rbuf4: the MPI standard forbids aliasing the send and receive buffers of MPI_SENDRECV.
+            buf4 = [bc_y%beg, bc_y%end, bc_z%beg, bc_z%end]
+
+            ! Send to -x, receive from +x -> edges (+1,+/-1,0) and (+1,0,+/-1)
+            call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 310, rbuf4, 4, MPI_INTEGER, &
+                & merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 310, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_x%end >= 0) then
+                ib_neighbor_ranks(+1, -1, 0) = rbuf4(1)
+                ib_neighbor_ranks(+1, +1, 0) = rbuf4(2)
+                ib_neighbor_ranks(+1, 0, -1) = rbuf4(3)
+                ib_neighbor_ranks(+1, 0, +1) = rbuf4(4)
+            end if
+
+            ! Send to +x, receive from -x -> edges (-1,+/-1,0) and (-1,0,+/-1); buf4 is untouched by the first exchange
+            call MPI_SENDRECV(buf4, 4, MPI_INTEGER, merge(bc_x%end, MPI_PROC_NULL, bc_x%end >= 0), 311, rbuf4, 4, MPI_INTEGER, &
+                & merge(bc_x%beg, MPI_PROC_NULL, bc_x%beg >= 0), 311, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_x%beg >= 0) then
+                ib_neighbor_ranks(-1, -1, 0) = rbuf4(1)
+                ib_neighbor_ranks(-1, +1, 0) = rbuf4(2)
+                ib_neighbor_ranks(-1, 0, -1) = rbuf4(3)
+                ib_neighbor_ranks(-1, 0, +1) = rbuf4(4)
+            end if
+        end if
+
+        if (num_dims == 3) then
+            ! Round 1b: exchange z face ranks with +/-y face neighbors -> yz edge ranks
+            buf2 = [bc_z%beg, bc_z%end]
+
+            call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 312, rbuf2, 2, MPI_INTEGER, &
+                & merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 312, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_y%end >= 0) then
+                ib_neighbor_ranks(0, +1, -1) = rbuf2(1)
+                ib_neighbor_ranks(0, +1, +1) = rbuf2(2)
+            end if
+
+            call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(bc_y%end, MPI_PROC_NULL, bc_y%end >= 0), 313, rbuf2, 2, MPI_INTEGER, &
+                & merge(bc_y%beg, MPI_PROC_NULL, bc_y%beg >= 0), 313, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+            if (bc_y%beg >= 0) then
+                ib_neighbor_ranks(0, -1, -1) = rbuf2(1)
+                ib_neighbor_ranks(0, -1, +1) = rbuf2(2)
+            end if
+
+            ! Round 2: exchange z face ranks with xy-diagonal edge neighbors -> corner ranks. Each of the 4 xy diagonals
+            ! gives 2 corners (the +/-z variants). Pattern: send buf2 to the mirror diagonal, receive from this diagonal
+            ! -> that edge neighbor's z face ranks.
+            #:for DX, DY, MDX, MDY, TAG in [(1,1,-1,-1,320), (1,-1,-1,1,321), (-1,1,1,-1,322), (-1,-1,1,1,323)]
+                call MPI_SENDRECV(buf2, 2, MPI_INTEGER, merge(ib_neighbor_ranks(${MDX}$, ${MDY}$, 0), MPI_PROC_NULL, &
+                    & ib_neighbor_ranks(${MDX}$, ${MDY}$, 0) >= 0), ${TAG}$, rbuf2, 2, MPI_INTEGER, &
+                    & merge(ib_neighbor_ranks(${DX}$, ${DY}$, 0), MPI_PROC_NULL, ib_neighbor_ranks(${DX}$, ${DY}$, &
+                    & 0) >= 0), ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                if (ib_neighbor_ranks(${DX}$, ${DY}$, 0) >= 0) then
+                    ib_neighbor_ranks(${DX}$, ${DY}$, -1) = rbuf2(1)
+                    ib_neighbor_ranks(${DX}$, ${DY}$, +1) = rbuf2(2)
+                end if
+            #:endfor
+        end if
+#endif
+
+    end subroutine s_compute_ib_neighbor_ranks
+
+    !> Exchange subdomain edge coordinates with face neighbors so each rank knows the spatial extent of its neighborhood
+    !! (its own subdomain plus one neighbor in each direction). Directions without a neighbor stay unbounded (+/-huge).
+    subroutine get_neighbor_bounds()
+
+        real(wp) :: send_val, recv_val
+        integer :: send_neighbor, recv_neighbor, ierr
+
+        ! Default: no neighbor in a direction leaves the neighborhood unbounded on that side
+        neighbor_domain_x%beg = -huge(0._wp)
+        neighbor_domain_x%end = huge(0._wp)
+        neighbor_domain_y%beg = -huge(0._wp)
+        neighbor_domain_y%end = huge(0._wp)
+        neighbor_domain_z%beg = -huge(0._wp)
+        neighbor_domain_z%end = huge(0._wp)
+
+#ifdef MFC_MPI
+        #:for X, ID, TAG, DIM in [('x', 1, 100, 'm'), ('y', 2, 102, 'n'), ('z', 3, 104, 'p')]
+            if (num_dims >= ${ID}$) then
+                ! Step 1: broadcast left edge (-1 face) rightward; receive left neighbor's left edge -> neighbor_domain_${X}$%beg
+                send_val = ${X}$_cb(-1)
+                send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
+                recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
+                recv_val = -huge(0._wp)
+                call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG}$, &
+                    & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                neighbor_domain_${X}$%beg = recv_val
+
+                ! Step 2: broadcast right edge (${DIM}$ face) leftward; receive right neighbor's right edge ->
+                ! neighbor_domain_${X}$%end
+                send_val = ${X}$_cb(${DIM}$)
+                send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
+                recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
+                recv_val = huge(0._wp)
+                call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG + 1}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG + 1}$, &
+                    & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+                neighbor_domain_${X}$%end = recv_val
+            end if
+        #:endfor
+#endif
+
+    end subroutine get_neighbor_bounds
+
 end module m_start_up
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index ce0ae48c4b..eb25dd9309 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -552,7 +552,6 @@ contains
         ! check if any IBMS are moving, and if so, update the markers, ghost points, levelsets, and levelset norms
         if (moving_immersed_boundary_flag) then
             call s_propagate_immersed_boundaries(s)
-            ! compute ib forces for fixed immersed boundaries if requested for output
         end if
 
         ! update the ghost fluid properties point values based on IB state
@@ -564,10 +563,11 @@
             end if
         end do
 
-        ! if (ib) then
-        if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs()
-        if (ib_state_wrt .and. (.not. moving_immersed_boundary_flag)) then
+        if (moving_immersed_boundary_flag) then
+            call s_wrap_periodic_ibs() ! wraps the positions of IBs to the local proc
+            call s_handoff_ib_ownership() ! recomputes which ranks own which IBs and communicate to neighbors
+        else if (ib_state_wrt) then
             call s_compute_ib_forces(q_prim_vf, fluid_pp)
         end if
         end if
@@ -714,6 +714,8 @@ contains
 
         forces_computed = .false.
 
+        if (moving_immersed_boundary_flag) call s_compute_ib_forces(q_prim_vf, fluid_pp)
+
         do i = 1, num_ibs
             if (s == 1) then
                 patch_ib(i)%step_vel = patch_ib(i)%vel
@@ -726,11 +728,6 @@
         ! Compute forces BEFORE the RK velocity blend so the device copy of patch_ib%vel matches the host (pre-blend) when
         !
velocity-dependent collision damping forces are evaluated on the GPU. - if (patch_ib(i)%moving_ibm == 2 .and. .not. forces_computed) then - call s_compute_ib_forces(q_prim_vf, fluid_pp) - forces_computed = .true. - end if - if (patch_ib(i)%moving_ibm > 0) then patch_ib(i)%vel = (rk_coef(s, 1)*patch_ib(i)%step_vel + rk_coef(s, 2)*patch_ib(i)%vel)/rk_coef(s, 4) patch_ib(i)%angular_vel = (rk_coef(s, 1)*patch_ib(i)%step_angular_vel + rk_coef(s, &