DeepEP/third-party/nvshmem.patch
From 9e6cc27cceb3130784e4ea7b61ea3171156365fd Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 20 Dec 2024 10:57:12 +0800
Subject: [PATCH 1/4] Change QP creation order.
---
src/modules/transport/ibgda/ibgda.cpp | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index ef325cd..286132e 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -2936,17 +2936,20 @@ int nvshmemt_ibgda_connect_endpoints(nvshmem_transport_t t, int *selected_dev_id
INFO(ibgda_state->log_level, "Creating %d RC QPs", device->rc.num_eps_per_pe);
for (int i = 0; i < num_rc_eps; ++i) {
// Do not create loopback to self
- if (i / device->rc.num_eps_per_pe == mype) {
+ int dst_pe = (i + 1 + mype) % n_pes;
+ int offset = i / n_pes;
+ int mapped_i = dst_pe * device->rc.num_eps_per_pe + offset;
+ if (dst_pe == mype) {
continue;
}
- status = ibgda_create_qp(&device->rc.eps[i], device, portid, i,
+ status = ibgda_create_qp(&device->rc.eps[mapped_i], device, portid, mapped_i,
NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_create_dci failed on RC #%d.", i);
+    "ibgda_create_qp failed on RC #%d.", mapped_i);
- status = ibgda_get_rc_handle(&local_rc_handles[i], device->rc.eps[i], device);
+ status = ibgda_get_rc_handle(&local_rc_handles[mapped_i], device->rc.eps[mapped_i], device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out,
- "ibgda_get_rc_handle failed on RC #%d.", i);
+ "ibgda_get_rc_handle failed on RC #%d.", mapped_i);
}
if (num_rc_eps) {
--
2.25.1
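
Note on patch 1: before this change, RC QP i always targeted PE i / num_eps_per_pe (that is what the old loopback test `i / device->rc.num_eps_per_pe == mype` implies), so every rank connected to PE 0 first. The new mapping staggers creation: each rank starts with its next neighbor and round-robins across PEs, then moves to the next QP per destination. A minimal sketch of the remapping; the values n_pes = 4, mype = 1, num_eps_per_pe = 2 are illustrative only (the real ones come from the NVSHMEM runtime):

    #include <cstdio>

    int main() {
        const int n_pes = 4, mype = 1, num_eps_per_pe = 2; // illustrative only
        const int num_rc_eps = n_pes * num_eps_per_pe;
        for (int i = 0; i < num_rc_eps; ++i) {
            int dst_pe = (i + 1 + mype) % n_pes; // start at the next PE, wrap around
            int offset = i / n_pes;              // which of this destination's QPs
            int mapped_i = dst_pe * num_eps_per_pe + offset;
            if (dst_pe == mype) continue;        // still no loopback QP to self
            std::printf("step %d: create eps[%d] -> PE %d\n", i, mapped_i, dst_pe);
        }
    }

For mype = 1 this creates QPs toward PEs 2, 3, 0, 2, 3, 0: every rank begins with a different peer instead of all ranks hitting PE 0 at once.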
From b11d41e4f3727f2f6ccc00a8c852e59e2ee33c8a Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Fri, 10 Jan 2025 11:53:38 +0800
Subject: [PATCH 2/4] Add a recv queue and recv CQ for RC QPs.
Let the IBGDA RC QPs use a regular recv queue:
- Add a recv queue to the IBGDA device QP struct.
- Create and set up a recv CQ for each RC QP.
- Fix the recv queue bookkeeping and remove some unused indices.
- Use a longer recv queue.
---
.../nvshmem_common_ibgda.h | 19 +++++-
src/modules/transport/ibgda/ibgda.cpp | 65 ++++++++++++++++---
2 files changed, 71 insertions(+), 13 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 8b8a263..1be3dec 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -168,14 +168,17 @@ typedef struct {
uint64_t get_head; // last wqe idx + 1 with a "fetch" operation (g, get, amo_fetch)
uint64_t get_tail; // last wqe idx + 1 polled with cst; get_tail > get_head is possible
} tx_wq;
+ struct {
+ uint64_t resv_head; // last reserved wqe idx + 1
+ } rx_wq;
struct {
uint64_t head;
uint64_t tail;
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 96,
- "ibgda_device_qp_management_v1 must be 96 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
+ "ibgda_device_qp_management_v1 must be 104 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -199,9 +202,19 @@ typedef struct nvshmemi_ibgda_device_qp {
// May point to mvars.prod_idx or internal prod_idx
uint64_t *prod_idx;
} tx_wq;
+ struct {
+ uint16_t nwqes;
+ uint64_t tail;
+ void *wqe;
+ __be32 *dbrec;
+ void *bf;
+ nvshmemi_ibgda_device_cq_t *cq;
+ // May point to mvars.prod_idx or internal prod_idx
+ uint64_t *prod_idx;
+ } rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 184, "ibgda_device_qp_v1 must be 184 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index 286132e..e0b2d5c 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -198,6 +198,7 @@ struct ibgda_ep {
off_t dbr_offset;
struct ibgda_cq *send_cq;
+ struct ibgda_cq *recv_cq;
struct ibv_ah *ah;
uint32_t user_index;
@@ -1538,7 +1539,8 @@ static int ibgda_create_cq_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
struct ibv_context *context = device->context;
- unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes;
+    // Each RC QP has one send CQ and one recv CQ.
+ unsigned int num_cqs = device->dci.num_eps + device->rc.num_eps_per_pe * n_pes * 2;
assert(ibgda_qp_depth > 0);
size_t num_cqe = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
@@ -1701,7 +1703,8 @@ static int ibgda_create_qp_shared_objects(nvshmemt_ibgda_state_t *ibgda_state,
}
// Allocate and map WQ buffer for all QPs.
- wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB; // num_wqebb is always a power of 2
+    // TODO: reduce the size of the WQ buffer.
+ wq_buf_size_per_qp = num_wqebb * MLX5_SEND_WQE_BB * 2; // num_wqebb is always a power of 2
wq_buf_size = wq_buf_size_per_qp * num_eps;
status = ibgda_nic_control_alloc(&wq_mobject, wq_buf_size, IBGDA_GPAGE_SIZE);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "cannot allocate wq buf.\n");
@@ -1882,8 +1885,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
int cqe_version = 0;
struct ibgda_cq *send_cq = NULL;
+ struct ibgda_cq *recv_cq = NULL;
size_t num_wqebb = IBGDA_ROUND_UP_POW2_OR_0(ibgda_qp_depth);
+ size_t num_recv_wqe = ibgda_qp_depth;
+ size_t recv_wqe_size = 16;
int status = 0;
@@ -1911,6 +1917,11 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
status = ibgda_create_cq(&send_cq, device);
NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ status = ibgda_create_cq(&recv_cq, device);
+ NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "ibgda_create_cq failed.\n");
+ }
+
ep = (struct ibgda_ep *)calloc(1, sizeof(struct ibgda_ep));
NVSHMEMI_NULL_ERROR_JMP(ep, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out,
"Unable to allocate mem for ep.\n");
@@ -1939,12 +1950,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
DEVX_SET(qpc, qp_context, pd, device->qp_shared_object.pdn);
DEVX_SET(qpc, qp_context, uar_page, uar_mobject->uar->page_id); // BF register
- DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue
- DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
DEVX_SET(qpc, qp_context, cqn_snd, send_cq->cqn);
- DEVX_SET(qpc, qp_context, cqn_rcv, device->qp_shared_object.rcqn);
+ DEVX_SET(qpc, qp_context, cqn_rcv, qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC ? recv_cq->cqn : device->qp_shared_object.rcqn);
DEVX_SET(qpc, qp_context, log_sq_size, IBGDA_ILOG2_OR0(num_wqebb));
- DEVX_SET(qpc, qp_context, log_rq_size, 0);
DEVX_SET(qpc, qp_context, cs_req, 0); // Disable CS Request
DEVX_SET(qpc, qp_context, cs_res, 0); // Disable CS Response
DEVX_SET(qpc, qp_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE); // Enable dbr_umem_id
@@ -1953,6 +1961,15 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
DEVX_SET(qpc, qp_context, dbr_umem_id, dbr_umem->umem_id); // DBR buffer
DEVX_SET(qpc, qp_context, user_index, qp_idx);
DEVX_SET(qpc, qp_context, page_offset, 0);
+    if (qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+        DEVX_SET(qpc, qp_context, rq_type, 0); // Regular recv queue
+        DEVX_SET(qpc, qp_context, log_rq_size, IBGDA_ILOG2(num_recv_wqe)); // num_recv_wqe recv WQEs
+        DEVX_SET(qpc, qp_context, log_rq_stride, IBGDA_ILOG2(recv_wqe_size) - 4); // max recv wqe size = 16B
+ } else {
+ DEVX_SET(qpc, qp_context, rq_type, IBGDA_SRQ_TYPE_VALUE); // Shared Receive Queue, DC must use this.
+ DEVX_SET(qpc, qp_context, srqn_rmpn_xrqn, device->qp_shared_object.srqn);
+ DEVX_SET(qpc, qp_context, log_rq_size, 0);
+ }
ep->devx_qp = mlx5dv_devx_obj_create(context, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out));
NVSHMEMI_NULL_ERROR_JMP(ep->devx_qp, status, NVSHMEMX_ERROR_INTERNAL, out,
@@ -1962,9 +1979,9 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->portid = portid;
ep->sq_cnt = num_wqebb;
- ep->sq_buf_offset = 0;
+ ep->sq_buf_offset = num_recv_wqe * recv_wqe_size;
- ep->rq_cnt = 0;
+ ep->rq_cnt = num_recv_wqe;
ep->rq_buf_offset = 0;
ep->wq_mobject = device->qp_shared_object.wq_mobject;
@@ -1978,6 +1995,7 @@ static int ibgda_create_qp(struct ibgda_ep **ep_ptr, struct ibgda_device *device
ep->uar_mobject = uar_mobject;
ep->send_cq = send_cq;
+ ep->recv_cq = recv_cq;
ep->qp_type = qp_type;
@@ -1989,6 +2007,7 @@ out:
if (status) {
if (uar_mobject) ibgda_unmap_and_free_qp_uar(uar_mobject);
if (send_cq) ibgda_destroy_cq(send_cq);
+ if (recv_cq) ibgda_destroy_cq(recv_cq);
if (ep) free(ep);
}
@@ -2287,6 +2306,10 @@ static int ibgda_destroy_ep(struct ibgda_ep *ep) {
ibgda_destroy_cq(ep->send_cq);
}
+ if (ep->recv_cq) {
+ ibgda_destroy_cq(ep->recv_cq);
+ }
+
if (ep->ah) {
ftable.destroy_ah(ep->ah);
}
@@ -2318,7 +2341,7 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
dev_qp->qpn = ep->qpn;
assert(ep->wq_mobject->has_gpu_mapping);
- dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset);
+ dev_qp->tx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->sq_buf_offset);
if (ibgda_nic_handler == IBGDA_NIC_HANDLER_GPU) {
assert(ep->dbr_mobject->has_gpu_mapping);
@@ -2330,6 +2353,12 @@ static void ibgda_get_device_qp(nvshmemi_ibgda_device_qp_t *dev_qp, struct ibgda
}
dev_qp->tx_wq.nwqes = ep->sq_cnt;
+ if (ep->qp_type == NVSHMEMI_IBGDA_DEVICE_QP_TYPE_RC) {
+ dev_qp->rx_wq.nwqes = ep->rq_cnt;
+ dev_qp->rx_wq.wqe = (void *)((uintptr_t)ep->wq_mobject->aligned.gpu_ptr + ep->wq_offset + ep->rq_buf_offset);
+ dev_qp->rx_wq.dbrec = (__be32 *)((uintptr_t)ep->dbr_mobject->aligned.gpu_ptr + ep->dbr_offset);
+ dev_qp->rx_wq.bf = (void *)ep->uar_mobject->aligned.gpu_ptr;
+ }
ibuf_dci_start = (uintptr_t)device->qp_shared_object.internal_buf.mem_object->aligned.gpu_ptr;
ibuf_rc_start = ibuf_dci_start + (size_per_dci * device->dci.num_eps);
@@ -2379,6 +2408,9 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
nvshmemi_ibgda_device_cq_t *cq_d = NULL;
nvshmemi_ibgda_device_cq_t *cq_h = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_d = NULL;
+ nvshmemi_ibgda_device_cq_t *recv_cq_h = NULL;
+
uint8_t *qp_group_switches_d = NULL;
const size_t mvars_offset = offsetof(nvshmemi_ibgda_device_qp_t, mvars);
@@ -2386,6 +2418,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t cons_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.cons_idx);
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
+ const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2421,7 +2454,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
num_dct_handles += device->dct.num_eps * n_pes;
num_dci_handles += device->dci.num_eps;
num_rc_handles += device->rc.num_eps_per_pe * n_pes;
- num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1));
+ num_cq_handles += device->dci.num_eps + (device->rc.num_eps_per_pe * (n_pes - 1) * 2);
num_shared_dci_handles += device->dci.num_shared_eps;
}
assert(num_dci_handles - num_shared_dci_handles >= 0);
@@ -2456,6 +2489,10 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
for (int i = 0; i < num_cq_handles; i++) {
nvshmemi_init_ibgda_device_cq(cq_h[i]);
}
+
+ recv_cq_h = (nvshmemi_ibgda_device_cq_t *)calloc(1, sizeof(*recv_cq_h));
+ NVSHMEMI_NULL_ERROR_JMP(recv_cq_h, status, NVSHMEMX_ERROR_OUT_OF_MEMORY, out, "recv_cq calloc err.");
+ nvshmemi_init_ibgda_device_cq(recv_cq_h[0]);
/* allocate host memory for dct, rc, cq, dci end */
/* allocate device memory for dct, rc, cq, dci start */
@@ -2559,6 +2596,14 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
}
++cq_idx;
+
+ rc_h[arr_idx].rx_wq.cq = &cq_d[cq_idx];
+
+ ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
+ cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
+ cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
+ ++cq_idx;
}
}
}
--
2.25.1
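
Note on patch 2: each RC QP now owns a regular recv queue and a dedicated recv CQ instead of the shared SRQ and shared recv CQ (which DC QPs keep using). The per-QP WQ buffer is doubled so the RQ (num_recv_wqe WQEs of 16 bytes each) sits at offset 0 and the SQ starts at sq_buf_offset = num_recv_wqe * recv_wqe_size. The device QP struct grows 184 -> 248 bytes: 56 bytes for the new rx_wq block plus 8 bytes for rx_wq.resv_head in the management struct (96 -> 104). A standalone sketch of that accounting, using stand-in types for the NVSHMEM ones and assuming an LP64 target:

    #include <cstdint>

    struct dev_cq;          // stand-in for nvshmemi_ibgda_device_cq_t
    using be32 = uint32_t;  // stand-in for __be32

    // Mirrors the rx_wq block added to nvshmemi_ibgda_device_qp_v1.
    struct rx_wq_mirror {
        uint16_t nwqes;     // 2 bytes + 6 bytes padding before the uint64_t
        uint64_t tail;
        void *wqe;
        be32 *dbrec;
        void *bf;
        dev_cq *cq;
        uint64_t *prod_idx;
    };

    // 56 bytes of rx_wq + 8 bytes of rx_wq.resv_head account for the
    // 184 -> 248 growth asserted by the patch.
    static_assert(sizeof(rx_wq_mirror) == 56, "rx_wq block is 56 bytes on LP64");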
From af479f9f23103d4a1579fae38676d6b3022df887 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Sat, 8 Feb 2025 18:02:39 +0800
Subject: [PATCH 3/4] Maintain recv queue's cons_idx.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 5 +++--
src/modules/transport/ibgda/ibgda.cpp | 6 ++++--
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index 1be3dec..ea1e284 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -170,6 +170,7 @@ typedef struct {
} tx_wq;
struct {
uint64_t resv_head; // last reserved wqe idx + 1
+ uint64_t cons_idx; // polled wqe idx + 1 (consumer index + 1)
} rx_wq;
struct {
uint64_t head;
@@ -177,7 +178,7 @@ typedef struct {
} ibuf;
char padding[NVSHMEMI_IBGDA_QP_MANAGEMENT_PADDING];
} __attribute__((__aligned__(8))) nvshmemi_ibgda_device_qp_management_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 104,
-              "ibgda_device_qp_management_v1 must be 104 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_management_v1) == 112,
+              "ibgda_device_qp_management_v1 must be 112 bytes.");
typedef nvshmemi_ibgda_device_qp_management_v1 nvshmemi_ibgda_device_qp_management_t;
@@ -214,7 +215,7 @@ typedef struct nvshmemi_ibgda_device_qp {
} rx_wq;
nvshmemi_ibgda_device_qp_management_v1 mvars; // management variables
} nvshmemi_ibgda_device_qp_v1;
-static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 248, "ibgda_device_qp_v1 must be 248 bytes.");
+static_assert(sizeof(nvshmemi_ibgda_device_qp_v1) == 256, "ibgda_device_qp_v1 must be 256 bytes.");
typedef nvshmemi_ibgda_device_qp_v1 nvshmemi_ibgda_device_qp_t;
diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp
index e0b2d5c..bc339c5 100644
--- a/src/modules/transport/ibgda/ibgda.cpp
+++ b/src/modules/transport/ibgda/ibgda.cpp
@@ -1067,7 +1067,7 @@ static inline void ibgda_nic_control_free(struct ibgda_mem_object *mobject) {
ibgda_host_mem_free(mobject);
}
-static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device) {
+static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device, int cc = 1) {
int status = 0;
struct ibgda_cq *gcq = NULL;
@@ -1118,7 +1118,7 @@ static int ibgda_create_cq(struct ibgda_cq **pgcq, struct ibgda_device *device)
cq_context = DEVX_ADDR_OF(create_cq_in, cmd_in, cq_context);
DEVX_SET(cqc, cq_context, dbr_umem_valid, IBGDA_MLX5_UMEM_VALID_ENABLE);
DEVX_SET(cqc, cq_context, cqe_sz, MLX5_CQE_SIZE_64B);
- DEVX_SET(cqc, cq_context, cc, 0x1); // Use collapsed CQ
+    DEVX_SET(cqc, cq_context, cc, cc); // Collapsed CQ when cc == 1
DEVX_SET(cqc, cq_context, oi, 0x1); // Allow overrun
DEVX_SET(cqc, cq_context, dbr_umem_id, dbr_umem->umem_id);
DEVX_SET(cqc, cq_context, log_cq_size, IBGDA_ILOG2_OR0(num_cqe));
@@ -2419,6 +2419,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
const size_t wqe_h_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.resv_head);
const size_t wqe_t_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, tx_wq.ready_head);
const size_t rx_resv_head_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.resv_head);
+ const size_t rx_cons_offset = offsetof(nvshmemi_ibgda_device_qp_management_t, rx_wq.cons_idx);
nvshmemi_ibgda_device_qp_map_type_t rc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
nvshmemi_ibgda_device_qp_map_type_t dc_map_type = NVSHMEMI_IBGDA_DEVICE_QP_MAP_TYPE_INVALID;
@@ -2601,6 +2602,7 @@ static int ibgda_setup_gpu_state(nvshmem_transport_t t) {
ibgda_get_device_cq(&cq_h[cq_idx], device->rc.eps[i]->recv_cq);
cq_h[cq_idx].resv_head = (uint64_t *)(base_mvars_d_addr + rx_resv_head_offset);
+ cq_h[cq_idx].cons_idx = (uint64_t *)(base_mvars_d_addr + rx_cons_offset);
cq_h[cq_idx].qpn = rc_h[arr_idx].qpn;
cq_h[cq_idx].qp_type = rc_h[arr_idx].qp_type;
++cq_idx;
--
2.25.1
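
Note on patch 3: rx_wq.cons_idx complements rx_wq.resv_head — together they describe how many recv WQEs are posted but not yet completed, following the same monotonically-increasing index convention as the tx_wq counters (indices are reduced modulo nwqes only when addressing the ring). The patch also turns the collapsed-CQ flag into a parameter of ibgda_create_cq, defaulting to the old behavior (cc = 1), presumably so a caller can create a non-collapsed CQ where individual recv completions must be observed. A sketch of the arithmetic the two counters enable, assuming that index convention:

    #include <cstdint>

    struct rx_counters {
        uint64_t resv_head; // last reserved recv WQE idx + 1
        uint64_t cons_idx;  // last polled recv WQE idx + 1
    };

    // Recv WQEs posted but not yet consumed by a CQE.
    inline uint64_t rx_outstanding(const rx_counters &c) {
        return c.resv_head - c.cons_idx;
    }

    // Slots still free before posting would overrun the recv ring.
    inline uint64_t rx_free_slots(const rx_counters &c, uint16_t nwqes) {
        return nwqes - rx_outstanding(c);
    }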
From e0ba3fa21b4b633b481c6684c3ad04f2670c8df4 Mon Sep 17 00:00:00 2001
From: Shangyan Zhou <sy.zhou@deepseek.com>
Date: Tue, 11 Feb 2025 11:00:57 +0800
Subject: [PATCH 4/4] Init rx_wq counters.
---
src/include/device_host_transport/nvshmem_common_ibgda.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/include/device_host_transport/nvshmem_common_ibgda.h b/src/include/device_host_transport/nvshmem_common_ibgda.h
index ea1e284..e6640d6 100644
--- a/src/include/device_host_transport/nvshmem_common_ibgda.h
+++ b/src/include/device_host_transport/nvshmem_common_ibgda.h
@@ -46,6 +46,8 @@
qp_man.tx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.tx_wq.get_tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
+ qp_man.rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
qp_man.ibuf.tail = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
} while (0);
--
2.25.1
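
Note on patch 4: the new rx_wq counters join the existing do { ... } while (0) initialization macro, so they start at the same invalid sentinel as the tx_wq and ibuf counters rather than whatever the allocation happened to contain. A minimal reproduction of the idiom; the sentinel's value here is an assumption, only the name matches the patch:

    #include <cstdint>

    // Assumed value; the real NVSHMEMI_IBGDA_ULSCALAR_INVALID may differ.
    constexpr uint64_t NVSHMEMI_IBGDA_ULSCALAR_INVALID = UINT64_MAX;

    struct qp_management {
        struct { uint64_t resv_head, cons_idx; } rx_wq;
    };

    // Statement-like macro, usable as `INIT_RX_WQ(man);` inside any block.
    #define INIT_RX_WQ(qp_man)                                          \
        do {                                                            \
            (qp_man).rx_wq.resv_head = NVSHMEMI_IBGDA_ULSCALAR_INVALID; \
            (qp_man).rx_wq.cons_idx = NVSHMEMI_IBGDA_ULSCALAR_INVALID;  \
        } while (0)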