Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
// The RDMA network module: bundles the Network, QueuePair and RDMASocket
// machines so they can be composed as one unit.
module RDMANetwork = { Network, QueuePair, RDMASocket };

View File

@@ -0,0 +1,556 @@
// A byte string, modeled as a sequence of ints.
type tBytes = seq[int];

// Returns a sequence holding `size` copies of `value`.
// A `size` of zero or less yields an empty sequence.
fun InitBytes(size: int, value: int): tBytes {
    var filled: tBytes;
    // `filled` starts empty; append at the tail until the requested size is reached.
    while (sizeof(filled) < size) {
        filled += (sizeof(filled), value);
    }
    return filled;
}
// Returns the concatenation of `a` and `b`: all elements of `a`, followed by
// all elements of `b` in their original order.
fun Append(a: tBytes, b: tBytes): tBytes {
    var joined: tBytes;
    var next: int;
    joined = a;
    next = 0;
    while (next < sizeof(b)) {
        // Insert at the current end, i.e. append.
        joined += (sizeof(joined), b[next]);
        next = next + 1;
    }
    return joined;
}
// Opcode carried by a work request (tWorkRequest) and by packets on the wire.
// In this file, receive work requests are posted with WROpCode_INVALID, data
// sends use WROpCode_SEND, and flow-control acknowledgements use
// WROpCode_SEND_WITH_IMM with the acknowledged buffer count in `imm`.
enum tWROpCode {
WROpCode_INVALID = 100,
WROpCode_SEND = 101,
WROpCode_SEND_WITH_IMM = 102
}
// Opcode reported in a work completion (tWorkComplete).
// WCOpCode_SEND marks send completions; the RECV variants mark receive
// completions and are derived from the packet opcode by ConvertWRToWCOpCode.
enum tWCOpCode {
WCOpCode_INVALID = 200,
WCOpCode_SEND = 201,
WCOpCode_RECV = 202,
WCOpCode_RECV_WITH_IMM = 203 // no such opcode in ibv APIs, this is added to indicate a recv completion with wc_flags = IBV_WC_WITH_IMM
}
// Maps the opcode of a sent packet to the opcode of the receive completion it
// produces on the other side:
//   WROpCode_SEND          -> WCOpCode_RECV
//   WROpCode_SEND_WITH_IMM -> WCOpCode_RECV_WITH_IMM
//   anything else          -> WCOpCode_INVALID
fun ConvertWRToWCOpCode(opcode: tWROpCode): tWCOpCode {
    var completionOp: tWCOpCode;
    completionOp = WCOpCode_INVALID;
    if (opcode == WROpCode_SEND) {
        completionOp = WCOpCode_RECV;
    } else if (opcode == WROpCode_SEND_WITH_IMM) {
        completionOp = WCOpCode_RECV_WITH_IMM;
    }
    return completionOp;
}
// Result status attached to packets and work completions.
// The socket's completion handlers process Status_OK, silently ignore
// Status_AGAIN, and assert on any other value.
enum tStatus {
Status_OK,
Status_ERR,
Status_AGAIN
}
// A packet travelling between two queue pairs: the sender's opcode, the
// payload bytes, the number of valid payload bytes, and the immediate value.
type tXmitPacket = (opcode: tWROpCode, payload: tBytes, length: int, imm: int);
// Reply to eGetPacket: the queue pair that produced the packet, a status, and the packet.
type tGetPacketResp = (from: QueuePair, status: tStatus, packet: tXmitPacket);
// Network -> QueuePair: deliver this packet to the receiving queue pair.
event ePutPacket: tXmitPacket;
// Network -> QueuePair: ask for the next outbound packet; the payload is the requesting Network.
event eGetPacket: Network;
// QueuePair -> Network: the packet produced for an earlier eGetPacket.
event eGetPacketResp: tGetPacketResp;
// Sent to a Network by a machine that wants to know when the network is
// connected; the payload is the machine to notify.
event eWaitConnected: machine;
// Network -> waiting machine: the network is connected.
event eWaitConnectedResp;
// Network -> itself: re-enter ExchangePackets for another round.
event eNextExchangeIter;
// Models the link between two RDMA sockets. It first connects to both
// sockets to learn their queue pairs, then keeps pulling packets out of each
// queue pair (eGetPacket) and delivering every packet it gets back to the
// other queue pair (ePutPacket).
machine Network {
    // Queue pairs of the connected endpoints: index 0 is `sock`, index 1 is `peer`.
    var qps: seq[QueuePair];
    // Machine that asked via eWaitConnected to be told when the network is connected.
    var user: machine;

    start state Init {
        // Connects to `sock` and then `peer`, blocking on each eConnectResp,
        // and moves on to packet exchange once both queue pairs are known.
        entry (args: (sock: RDMASocket, peer: RDMASocket)) {
            send args.sock, eConnect, this;
            receive {
                case eConnectResp: (qp: QueuePair) { qps += (0, qp); }
            }
            send args.peer, eConnect, this;
            receive {
                case eConnectResp: (qp: QueuePair) { qps += (1, qp); }
            }
            print format("network connected {0}", qps);
            // Notify the waiter, if one has been registered.
            if (user != null)
                send user, eWaitConnectedResp;
            goto ExchangePackets;
        }

        // Remember who wants to be notified once the connection is up.
        on eWaitConnected do (from: machine) {
            user = from;
        }
    }

    state ExchangePackets {
        // Issues a fresh batch of packet requests to every queue pair.
        entry {
            var i: int;
            var n: int;
            i = 0;
            while (i < sizeof(qps)) {
                // Request a nondeterministic number of packets from this queue
                // pair: choose(3) yields 0..2, so n is between 1 and 3 inclusive.
                n = choose(3) + 1;
                while (n > 0) {
                    send qps[i], eGetPacket, this;
                    n = n - 1;
                }
                i = i + 1;
            }
        }

        // The network is already connected in this state: answer right away.
        on eWaitConnected do (from: machine) {
            send from, eWaitConnectedResp;
        }

        // Forwards a packet obtained from one queue pair to the other one,
        // then schedules the next exchange round.
        on eGetPacketResp do (resp: tGetPacketResp) {
            var i: int;
            if (resp.status == Status_OK) {
                // Find the first queue pair that is not the sender. The bound is
                // the actual number of queue pairs rather than a literal 2.
                i = 0;
                while (i < sizeof(qps)) {
                    if (qps[i] != resp.from)
                        break;
                    i = i + 1;
                }
                // Fail with a clear message instead of indexing past the end
                // when every known queue pair equals the sender.
                assert i < sizeof(qps), "no peer queue pair found for the packet sender";
                send qps[i], ePutPacket, resp.packet;
            }
            send this, eNextExchangeIter;
        }

        on eNextExchangeIter goto ExchangePackets;
    }
}
// A completion queue entry: the index of the completed work request, the
// completion opcode, the buffer contents, the number of valid bytes, the
// immediate value, and a status.
type tWorkComplete = (wrIdx: int, opcode: tWCOpCode, payload: tBytes, length: int, imm: int, status: tStatus);
// A work request posted to a queue pair: its index, opcode, buffer, buffer
// length (or number of bytes to send), and immediate value.
type tWorkRequest = (wrIdx: int, opcode: tWROpCode, payload: tBytes, length: int, imm: int);
// RDMASocket -> QueuePair: post a receive work request.
event ePostRecv: tWorkRequest;
// RDMASocket -> QueuePair: post a send work request.
event ePostSend: tWorkRequest;
// RDMASocket -> QueuePair: poll the receive completion queue; the payload is
// the socket that receives the ePollRecvCQReturn.
event ePollRecvCQ: RDMASocket;
// RDMASocket -> QueuePair: poll the send completion queue; the payload is
// the socket that receives the ePollSendCQReturn.
event ePollSendCQ: RDMASocket;
// Models one queue pair: it holds the work requests posted by its socket,
// turns posted sends into packets for the Network, turns packets delivered by
// the Network into receive completions, and hands completions to the socket
// when it polls. Requests that cannot be served yet are counted and served as
// soon as the missing work request or completion shows up.
machine QueuePair {
    // Upper bounds on the number of outstanding posted work requests.
    var maxNumSendWRs: int;
    var maxNumRecvWRs: int;
    // Posted work requests not yet consumed, oldest first.
    var postedRecvWRs: seq[tWorkRequest];
    var postedSendWRs: seq[tWorkRequest];
    // Completions not yet handed to a poller, oldest first.
    var sendCompQueue: seq[tWorkComplete];
    var recvCompQueue: seq[tWorkComplete];
    // Declared but not referenced anywhere in this machine.
    var outboundQueue: seq[tXmitPacket];
    var inboundQueue: seq[tXmitPacket];
    // Parties with unanswered requests ...
    var network: Network;
    var sockPollSendCQ: RDMASocket;
    var sockPollRecvCQ: RDMASocket;
    // ... and how many requests each of them still has outstanding.
    var pendingGetPkt: int;
    var pendingPollSend: int;
    var pendingPollRecv: int;

    // Consumes the oldest posted send work request: queues its send completion
    // (serving one parked send-CQ poll if any) and then ships the packet to `net`.
    fun PushPacketToNetwork(net: Network) {
        var sendWR: tWorkRequest;
        var completion: tWorkComplete;
        var outgoing: tXmitPacket;

        sendWR = postedSendWRs[0];
        postedSendWRs -= (0);
        print format("{0} -sizeof postedSendWRs {1}", this, sizeof(postedSendWRs));

        completion = (wrIdx = sendWR.wrIdx,
                      opcode = WCOpCode_SEND,
                      payload = sendWR.payload,
                      length = sendWR.length,
                      imm = sendWR.imm,
                      status = Status_OK);
        outgoing = (opcode = sendWR.opcode,
                    payload = sendWR.payload,
                    length = sendWR.length,
                    imm = sendWR.imm);

        sendCompQueue += (sizeof(sendCompQueue), completion);
        print format("{0} +sizeof sendCompQueue {1}", this, sizeof(sendCompQueue));

        // A socket is already waiting on the send CQ: answer one of its polls now.
        if (pendingPollSend > 0) {
            NotifySendCQ(sockPollSendCQ);
            pendingPollSend = pendingPollSend - 1;
            if (pendingPollSend == 0) {
                sockPollSendCQ = default(RDMASocket);
            }
        }

        send net, eGetPacketResp, (from = this, status = Status_OK, packet = outgoing);
    }

    // Pops the oldest receive completion and sends it to `sock`.
    fun NotifyRecvCQ(sock: RDMASocket) {
        var oldest: tWorkComplete;
        oldest = recvCompQueue[0];
        recvCompQueue -= (0);
        print format("{0} -sizeof recvCompQueue {1}", this, sizeof(recvCompQueue));
        send sock, ePollRecvCQReturn, oldest;
    }

    // Pops the oldest send completion and sends it to `sock`.
    fun NotifySendCQ(sock: RDMASocket) {
        var oldest: tWorkComplete;
        oldest = sendCompQueue[0];
        sendCompQueue -= (0);
        print format("{0} -sizeof sendCompQueue {1}", this, sizeof(sendCompQueue));
        send sock, ePollSendCQReturn, oldest;
    }

    start state Init {
        // Stores the queue capacities and starts serving events.
        entry (args: (maxNumSendWRs: int, maxNumRecvWRs: int)) {
            print format("qp init start {0}", this);
            maxNumRecvWRs = args.maxNumRecvWRs;
            maxNumSendWRs = args.maxNumSendWRs;
            print format("qp init done {0}", this);
            goto WaitForEvents;
        }
    }

    state WaitForEvents {
        // Queues a receive work request; the receive queue must have room.
        on ePostRecv do (wr: tWorkRequest) {
            assert sizeof(postedRecvWRs) < maxNumRecvWRs;
            postedRecvWRs += (sizeof(postedRecvWRs), wr);
            print format("{0} +sizeof postedRecvWRs {1}", this, sizeof(postedRecvWRs));
        }

        // Queues a send work request; if the network is already waiting for a
        // packet, the request is turned into a packet immediately.
        on ePostSend do (wr: tWorkRequest) {
            assert sizeof(postedSendWRs) < maxNumSendWRs;
            postedSendWRs += (sizeof(postedSendWRs), wr);
            print format("{0} +sizeof postedSendWRs {1}", this, sizeof(postedSendWRs));
            if (pendingGetPkt > 0) {
                PushPacketToNetwork(network);
                pendingGetPkt = pendingGetPkt - 1;
                if (pendingGetPkt == 0) {
                    network = default(Network);
                }
            }
        }

        // Receives a packet from the network into the oldest posted receive
        // buffer and queues the matching receive completion.
        on ePutPacket do (packet: tXmitPacket) {
            var recvWR: tWorkRequest;
            var completion: tWorkComplete;
            var pos: int;

            assert sizeof(postedRecvWRs) > 0, "error: receive not ready";
            recvWR = postedRecvWRs[0];
            postedRecvWRs -= (0);
            print format("{0} -sizeof postedRecvWRs {1}", this, sizeof(postedRecvWRs));

            // The packet must fit into the posted buffer.
            assert packet.length <= recvWR.length;
            completion = (wrIdx = recvWR.wrIdx,
                          opcode = ConvertWRToWCOpCode(packet.opcode),
                          payload = recvWR.payload,
                          length = packet.length,
                          imm = packet.imm,
                          status = Status_OK);
            // Overwrite the leading bytes of the posted buffer with the packet payload.
            pos = 0;
            while (pos < packet.length) {
                completion.payload[pos] = packet.payload[pos];
                pos = pos + 1;
            }

            recvCompQueue += (sizeof(recvCompQueue), completion);
            print format("{0} +sizeof recvCompQueue {1}", this, sizeof(recvCompQueue));

            // A socket is already waiting on the receive CQ: answer one of its polls now.
            if (pendingPollRecv > 0) {
                NotifyRecvCQ(sockPollRecvCQ);
                pendingPollRecv = pendingPollRecv - 1;
                if (pendingPollRecv == 0) {
                    sockPollRecvCQ = default(RDMASocket);
                }
            }
        }

        // The network asks for a packet: hand one over if a send is posted,
        // otherwise park the request (no Status_AGAIN reply is sent).
        on eGetPacket do (net: Network) {
            if (sizeof(postedSendWRs) > 0) {
                PushPacketToNetwork(net);
            } else {
                pendingGetPkt = pendingGetPkt + 1;
                if (pendingGetPkt <= 1) {
                    network = net;
                } else {
                    // All parked requests must come from the same network.
                    assert network == net;
                }
            }
        }

        // The socket polls the receive CQ: answer right away if a completion is
        // queued, otherwise park the poll.
        on ePollRecvCQ do (sock: RDMASocket) {
            if (sizeof(recvCompQueue) > 0) {
                NotifyRecvCQ(sock);
            } else {
                pendingPollRecv = pendingPollRecv + 1;
                if (pendingPollRecv <= 1) {
                    sockPollRecvCQ = sock;
                } else {
                    // All parked polls must come from the same socket.
                    assert sockPollRecvCQ == sock;
                }
            }
        }

        // The socket polls the send CQ: answer right away if a completion is
        // queued, otherwise park the poll.
        on ePollSendCQ do (sock: RDMASocket) {
            if (sizeof(sendCompQueue) > 0) {
                NotifySendCQ(sock);
            } else {
                pendingPollSend = pendingPollSend + 1;
                if (pendingPollSend <= 1) {
                    sockPollSendCQ = sock;
                } else {
                    // All parked polls must come from the same socket.
                    assert sockPollSendCQ == sock;
                }
            }
        }
    }
}
// A send buffer owned by a socket: its index, its contents, and its size.
type tTaggedBuffer = (bufIdx: int, payload: tBytes, length: int);
// QueuePair -> RDMASocket: a receive completion answering ePollRecvCQ.
event ePollRecvCQReturn: tWorkComplete;
// QueuePair -> RDMASocket: a send completion answering ePollSendCQ.
event ePollSendCQReturn: tWorkComplete;
// User -> RDMASocket: request `length` received bytes; `from` is the machine
// that gets the eRecvBytesResp.
event eRecvBytes: (from: machine, length: int);
// User -> RDMASocket: bytes to transmit to the remote side.
event eSendBytes: tBytes;
// RDMASocket -> user: the bytes requested by eRecvBytes.
event eRecvBytesResp: tBytes;
// Network -> RDMASocket: connect; the payload is the connecting Network.
event eConnect: Network;
// RDMASocket -> Network: reply to eConnect carrying the socket's queue pair.
event eConnectResp: QueuePair;
// RDMASocket -> itself: re-enter PollCQEvents.
event eNextPollCQIter;
// Models a byte-stream socket on top of a QueuePair with credit-based flow
// control. Bytes handed in via eSendBytes are cut into buffer-sized sends,
// limited by the number of receive buffers the remote side is believed to have
// posted. Received bytes are accumulated and handed to the user on eRecvBytes.
// After every `numRecvBeforeAck` data receives the socket sends a
// SEND_WITH_IMM acknowledgement that returns that many credits to the sender.
machine RDMASocket {
    // The queue pair created for this socket in Init.
    var qp: QueuePair;
    var sockId: int;
    // Size of each send/receive buffer.
    var bufSize: int;
    var bufNum: int; // assume the numbers of local and remote send/recv buffers are the same
    // Extra work requests reserved for flow-control messages.
    var flowCtrlBufNum: int;
    // Send buffers currently free for use.
    var unusedSendBufs: seq[tTaggedBuffer];
    // Send credits: decremented per data send, increased by the imm value of
    // every received flow-control message.
    var remotePostedBufNum: int;
    // Number of send buffers created in BeforeConnect.
    var effectiveSendBufNum: int;
    // Flow-control period and the number of data receives since the last acknowledgement.
    var numRecvBeforeAck: int;
    var numRecvSinceLastAck: int;
    // Bytes accepted from the user but not yet posted for sending.
    var bytesToSend: tBytes;
    // Bytes received from the remote side but not yet handed to a user.
    var bytesRecved: tBytes;
    // pending recv
    var userWaited: machine;
    var recvedData: tBytes;
    var recvLength: int;
    // Number of polls issued to the queue pair that have not been answered yet.
    var pendingPollSendCQ: int;
    var pendingPollRecvCQ: int;

    start state Init {
        // Creates the queue pair and records the configuration.
        entry (args: (sockId: int, bufSize: int, bufNum: int, numRecvBeforeAck: int)) {
            print format("socket init start {0}", this);
            // The value is used as a divisor just below and as the acknowledgement period.
            assert args.numRecvBeforeAck > 0, "numRecvBeforeAck must be positive";
            // bufNum / numRecvBeforeAck, rounded up.
            flowCtrlBufNum = (args.bufNum + args.numRecvBeforeAck - 1) / args.numRecvBeforeAck;
            qp = new QueuePair((
                maxNumSendWRs = args.bufNum + flowCtrlBufNum,
                maxNumRecvWRs = args.bufNum + flowCtrlBufNum));
            sockId = args.sockId;
            bufSize = args.bufSize;
            bufNum = args.bufNum;
            remotePostedBufNum = args.bufNum;
            effectiveSendBufNum = args.bufNum;
            numRecvBeforeAck = args.numRecvBeforeAck;
            numRecvSinceLastAck = 0;
            print format("socket init done {0} with qp {1}", this, qp);
            goto BeforeConnect;
        }
    }

    state BeforeConnect {
        // Posts all receive buffers (data plus flow control) and creates the send buffers.
        entry {
            var i: int;
            print format("post {0} recv buffers in {1}", bufNum, this);
            i = 0;
            while (i < bufNum + flowCtrlBufNum) {
                send qp, ePostRecv, (wrIdx = sockId * bufNum * 2 + i, opcode = WROpCode_INVALID, payload = InitBytes(bufSize, 0), length = bufSize, imm = 0);
                i = i + 1;
            }
            print format("create {0} send buffers", effectiveSendBufNum);
            i = 0;
            while (i < effectiveSendBufNum) {
                unusedSendBufs += (i, (bufIdx = sockId * bufNum * 2 + i, payload = InitBytes(bufSize, 0), length = bufSize));
                i = i + 1;
            }
            goto WaitConnect;
        }
    }

    state WaitConnect {
        // Hands the queue pair to the connecting network and starts polling.
        on eConnect do (net: Network) {
            print format("{0} connected to {1}", this, net);
            send net, eConnectResp, qp;
            goto PollCQEvents;
        }
    }

    state PollCQEvents {
        // Posts as many sends as data, free send buffers and credits allow, then
        // tops up the outstanding polls on both completion queues.
        entry {
            var i: int;
            var sendBuf: tTaggedBuffer;
            print format("{0} sizeof(bytesToSend) {1} && sizeof(unusedSendBufs) {2} && remotePostedBufNum {3}",
                this, sizeof(bytesToSend), sizeof(unusedSendBufs), remotePostedBufNum);
            if (sizeof(bytesToSend) > 0 && remotePostedBufNum == 0) {
                print format("{0}: remote side not posted any recv buffer", this);
            }
            if (sizeof(bytesToSend) > 0 && sizeof(unusedSendBufs) == 0) {
                print format("{0}: local side does not have send buffer", this);
            }
            while (sizeof(bytesToSend) > 0 && sizeof(unusedSendBufs) > 0 && remotePostedBufNum > 0) {
                // Each data send consumes one credit and one free send buffer.
                remotePostedBufNum = remotePostedBufNum - 1;
                sendBuf = unusedSendBufs[0];
                unusedSendBufs -= (0);
                // Move up to bufSize bytes from the head of bytesToSend into the buffer.
                i = 0;
                while (i < bufSize && sizeof(bytesToSend) > 0) {
                    sendBuf.payload[i] = bytesToSend[0];
                    bytesToSend -= (0);
                    i = i + 1;
                }
                // `i` is the number of bytes actually placed in the buffer.
                send qp, ePostSend, (wrIdx = sendBuf.bufIdx, opcode = WROpCode_SEND, payload = sendBuf.payload, length = i, imm = 0);
            }
            // Keep bufNum + flowCtrlBufNum polls outstanding on the receive CQ.
            while (pendingPollRecvCQ < bufNum + flowCtrlBufNum) {
                send qp, ePollRecvCQ, this;
                pendingPollRecvCQ = pendingPollRecvCQ + 1;
            }
            // Keep bufNum polls outstanding on the send CQ.
            while (pendingPollSendCQ < bufNum) {
                send qp, ePollSendCQ, this;
                pendingPollSendCQ = pendingPollSendCQ + 1;
            }
        }

        // Handles one receive completion: either a flow-control message that
        // returns credits, or data that is appended to bytesRecved and, if a
        // user is waiting, copied into its pending request.
        on ePollRecvCQReturn do (wc: tWorkComplete) {
            var i: int;
            if (wc.status == Status_OK) {
                assert wc.opcode == WCOpCode_RECV_WITH_IMM || wc.opcode == WCOpCode_RECV;
                if (wc.opcode == WCOpCode_RECV_WITH_IMM) {
                    remotePostedBufNum = remotePostedBufNum + wc.imm;
                    print format("{0} received flow control packet with imm {1}, remotePostedBufNum {2}", this, wc.imm, remotePostedBufNum);
                } else if (wc.opcode == WCOpCode_RECV) {
                    i = 0;
                    while (i < wc.length) {
                        bytesRecved += (sizeof(bytesRecved), wc.payload[i]);
                        i = i + 1;
                    }
                    print format("recv cq returned, user {0} waited", userWaited);
                    if (userWaited != null) {
                        print format("recvLength {0}, sizeof(recvedData) {1}, sizeof(bytesRecved) {2}",
                            recvLength, sizeof(recvedData), sizeof(bytesRecved));
                        // `i` only counts the bytes copied, for the log line below.
                        i = 0;
                        while (sizeof(recvedData) < recvLength && sizeof(bytesRecved) > 0) {
                            recvedData += (sizeof(recvedData), bytesRecved[0]);
                            bytesRecved -= (0);
                            i = i + 1;
                        }
                        print format("copy recv data, copy length {0}, recvLength {1}, recvedData {2}", i, recvLength, recvedData);
                        if (sizeof(recvedData) == recvLength) {
                            // Request satisfied: reply and clear the pending-receive state.
                            send userWaited, eRecvBytesResp, recvedData;
                            userWaited = default(machine);
                            recvedData = default(tBytes);
                            recvLength = 0;
                        }
                    }
                }
                // Give the buffer back to the queue pair for the next receive.
                send qp, ePostRecv, (wrIdx = wc.wrIdx, opcode = WROpCode_INVALID, payload = wc.payload, length = bufSize, imm = 0);
                if (wc.opcode == WCOpCode_RECV) {
                    numRecvSinceLastAck = numRecvSinceLastAck + 1;
                    if (numRecvSinceLastAck == numRecvBeforeAck) {
                        // Flow-control acknowledgement: wrIdx -1 marks a send that
                        // does not occupy a send buffer; imm carries the credit count.
                        send qp, ePostSend, (wrIdx = -1, opcode = WROpCode_SEND_WITH_IMM, payload = default(tBytes), length = 0, imm = numRecvSinceLastAck);
                        numRecvSinceLastAck = 0;
                    }
                }
                assert pendingPollRecvCQ > 0;
                pendingPollRecvCQ = pendingPollRecvCQ - 1;
                send this, eNextPollCQIter;
            } else if (wc.status != Status_AGAIN) {
                assert false, "Unexpected wc status";
            }
        }

        // Handles one send completion: returns the send buffer to the free list
        // unless the completion belongs to a flow-control message (wrIdx < 0).
        on ePollSendCQReturn do (wc: tWorkComplete) {
            var sendBuf: tTaggedBuffer;
            if (wc.status == Status_OK) {
                if (wc.opcode == WCOpCode_SEND) {
                    if (wc.wrIdx >= 0) {
                        sendBuf = (bufIdx = wc.wrIdx, payload = wc.payload, length = bufSize);
                        unusedSendBufs += (sizeof(unusedSendBufs), sendBuf);
                    }
                } else {
                    assert false, "Unexpected wc opcode";
                }
                assert pendingPollSendCQ > 0;
                pendingPollSendCQ = pendingPollSendCQ - 1;
                send this, eNextPollCQIter;
            } else if (wc.status != Status_AGAIN) {
                assert false, "Unexpected wc status";
            }
        }

        // A user asks for `args.length` bytes. Bytes already received are used
        // first; if they are not enough, the request stays pending and is
        // completed by the receive-completion handler.
        // NOTE(review): only one pending request is tracked (userWaited); a
        // second request arriving before the first completes replaces it.
        // Confirm that callers never overlap receives.
        on eRecvBytes do (args: (from: machine, length: int)) {
            print format("{0} requested to receive {1} bytes, sizeof(bytesRecved) {2}", args.from, args.length, sizeof(bytesRecved));
            // Append at the tail of recvedData, the same way the completion
            // handler does, so byte order is kept even if it is not empty.
            while (sizeof(recvedData) < args.length && sizeof(bytesRecved) > 0) {
                recvedData += (sizeof(recvedData), bytesRecved[0]);
                bytesRecved -= (0);
            }
            if (sizeof(recvedData) == args.length) {
                send args.from, eRecvBytesResp, recvedData;
                recvedData = default(tBytes);
            } else {
                userWaited = args.from;
                recvLength = args.length;
            }
            send this, eNextPollCQIter;
        }

        // A user hands over bytes to transmit; each value is reduced modulo 256
        // and queued, and the next PollCQEvents round posts the sends.
        on eSendBytes do (bytes: tBytes) {
            var i: int;
            i = 0;
            while (i < sizeof(bytes)) {
                bytesToSend += (sizeof(bytesToSend), bytes[i] % 256);
                i = i + 1;
            }
            send this, eNextPollCQIter;
        }

        on eNextPollCQIter goto PollCQEvents;
    }
}