使用SHM来传输tlm payload测试
我花了几天打算自己写一个 zero-copy 的版本,使用 boost 里现成的 managed_shared_memory、interprocess_semaphore、container/vector 等库,但压力测试一直有问题:有的传 100 多笔数据就出错,有的传 500 多笔数据才出错,都是 boost 底层的 rbtree 之类的内存布局出错,调不下去,所以放弃自己写,转而寻找开源的实现。
- cpp-ipc
- boost.interprocess.message_queue
- Flow-IPC
- iceoryx
传输tlm_generic_payload, 数据域长度会在64,128,256..4096之间随机变化。
随机64-4096 byte数据, 同size的byte enable, 值固定是0xff,
Latency (microseconds):
Min: 7.01
P50: 10.22
P90: 71.08
P99: 142.43
Max: 1294.71
Average: 24.83
对于小数据,recv 会把数据从共享内存(shm)拷贝到进程内新分配的 buffer;对于大数据,则直接返回指向共享内存的指针,不做拷贝。
随机32-4096 byte数据, 同size的byte enable, 值固定是0xff,
Latency (microseconds):
Min: 13.39
P50: 64.65
P90: 91.55
P99: 159.15
Max: 1268.10
Average: 72.43
需要额外起一个独立进程做路由(route);目前 v2 版本使用 Rust 实现,C/C++ 接口刚刚提供,使用起来还不方便。
返回的 buff_t
- 大消息的情况 (size > large_msg_limit):
// 大消息处理
if (msg.storage_) {
ipc::storage_id_t buf_id = *reinterpret_cast<ipc::storage_id_t*>(&msg.data_);
void* buf = find_storage(buf_id, inf, msg_size); // 获取共享内存中的数据指针
if (buf != nullptr) {
// 直接返回共享内存的指针,并设置回收函数
return ipc::buff_t{buf, msg_size, [](void* p_info, std::size_t size) {
auto r_info = static_cast<recycle_t *>(p_info);
// ...
// 在回收函数中释放共享内存
}, r_info};
- 小消息的情况 (size <= large_msg_limit):
// 小消息处理
if (msg_size <= ipc::data_length) {
return make_cache(msg.data_, msg_size); // 创建新的缓存
// make_cache的实现
inline ipc::buff_t make_cache(void* data, std::size_t size) {
auto cache_mem = ipc::mem::alloc(size); // 分配新内存
if (cache_mem == nullptr) {
return {};
std::memcpy(cache_mem, data, size); // 拷贝数据
return ipc::buff_t { cache_mem, size, [](void* ptr, std::size_t) {
ipc::mem::free(ptr); // 释放分配的内存
} };
- 大消息 (> large_msg_limit):
- 返回的是共享内存空间的指针
- 不会发生额外的内存拷贝
- buff_t 的析构函数负责回收共享内存
- 小消息 (<= large_msg_limit):
- 返回的是新分配的内存空间的指针
- 会将数据从共享内存拷贝到新分配的内存中
- buff_t 的析构函数负责释放这块新分配的内存
/// Receives a single message from the named cpp-ipc channel and inspects it.
///
/// @param channel_name  Name of the channel to attach to.
///
/// The returned ipc::buff_t owns the payload: as described in the notes above,
/// it either points straight into shared memory (large messages) or at a
/// private copy (small messages), and releases it when it goes out of scope.
void process_message(const char* channel_name) {
    // The channel must be opened in receiver mode; the mode argument defaults
    // to ipc::sender, and a sender-only connection cannot receive.
    ipc::channel channel{channel_name, ipc::receiver};

    // Receive one message; an empty buffer means nothing was delivered.
    auto buffer = channel.recv();
    if (buffer.empty()) {
        return;
    }

    // Use the data. The pointer stays valid only while `buffer` is alive.
    const auto* data = static_cast<const char*>(buffer.data());
    const auto size = buffer.size();
    (void)data;
    (void)size;

    // No manual release is needed: buff_t hands the memory back when it is
    // destroyed at the end of this scope.
}
- 不需要手动释放 buff_t 中的内存,它会在析构时自动处理
- 大消息直接使用共享内存可以提高性能,避免不必要的拷贝
- 小消息虽然会有一次拷贝,但考虑到数据量小,对性能影响不大
boost mq
template<mqblock_types Block, class TimePoint>
inline bool message_queue_t<VoidPointer>::do_receive(
void *buffer, size_type buffer_size,
size_type &recvd_size, unsigned int &priority,
const TimePoint &abs_time)
// ...
//There is at least one message ready to pick, get the top one
ipcdetail::msg_hdr_t<VoidPointer> &top_msg = p_hdr->top_msg();
//Get data from the message
recvd_size = top_msg.len;
priority = top_msg.priority;
//Some cleanup to ease debugging
top_msg.len = 0;
top_msg.priority = 0;
//Copy data to receiver's bufers
std::memcpy(buffer, top_msg.data(), recvd_size); // 这里进行内存拷贝
//Free top message and put it in the free message list
// ...
- 当消息队列中有消息时,会获取队列顶部的消息(top_msg)
- 使用std::memcpy将消息内容从共享内存(top_msg.data())拷贝到用户提供的buffer中
- 拷贝的大小为消息的实际长度(recvd_size)
- 拷贝完成后会清空原消息的长度和优先级信息,并将该消息放回空闲消息列表中
- 用户需要确保提供的buffer足够大,能容纳消息内容
- 消息的内容会被复制一份,而不是直接使用共享内存中的数据
- 一旦消息被接收并拷贝走,共享内存中的原始消息就会被清空并重用
- 要自己分配和销毁内存,确保内存足够大
#include <iostream>
#include <chrono>
#include <thread>
#include <vector>
#include <algorithm>
#include <libipc/ipc.h>
#include <libipc/shm.h>
#include <cstring>
#include <iomanip>
#include <numeric>
#include <csignal>
#include <atomic>
#include <random>
#include <array>
// Names of the request and response channels shared by producer and consumer.
const char* REQ_CHANNEL{"tlm_benchmark_req"};
const char* RESP_CHANNEL{"tlm_benchmark_resp"};
// Number of untimed warm-up round trips and of timed round trips.
const int WARMUP_COUNT{1000};
const int TEST_COUNT{10000};
// TLM-like payload header.
// A message on the wire is this header immediately followed by `data_length`
// bytes of data and then `byte_enable_length` bytes of byte-enable mask, so
// sizeof(TlmPayload) is used as the header size throughout the benchmark.
struct TlmPayload {
    uint64_t command;             // Command type
    uint64_t address;             // Target address
    uint32_t data_length;         // Length of data
    uint32_t byte_enable_length;  // Length of byte_enable
    uint64_t timestamp;           // Timestamp for latency measurement
    char data[0];                 // Variable length data followed by byte_enable
                                  // (zero-length array: a compiler extension)
};
// Test configuration.
// Kept as a plain aggregate because main() brace-initializes it positionally.
struct TestConfig {
    size_t data_size;         // Bytes of data in the fixed-size (warm-up) message
    size_t byte_enable_size;  // Bytes of byte-enable mask in that message
    size_t total_size;        // Including header and data
    size_t max_msg_size;      // Maximum possible message size
};
// Performance statistics gathered by the producer and printed by print_stats().
// Latencies are in microseconds; value-initialize (`PerfStats s{}`) to zero all fields.
struct PerfStats {
    double min_latency_us;
    double max_latency_us;
    double avg_latency_us;
    double p50_latency_us;
    double p90_latency_us;
    double p99_latency_us;
    double throughput_mbps;  // Printed with the unit "MB/s"
    size_t message_size;     // Printed as "Message Size" in bytes
    size_t sample_count;     // Number of latency samples behind the figures
};
// Global flag for graceful shutdown; the benchmark loops poll it each iteration.
std::atomic<bool> g_running{true};

// Special command value that tells the consumer to leave its receive loop.
const uint64_t EXIT_COMMAND = 0xFFFFFFFFFFFFFFFF;

// Available data sizes for random selection (bytes of data per message; the
// byte-enable mask that follows has the same length).
const std::array<size_t, 7> DATA_SIZES = {64, 128, 256, 512, 1024, 2048, 4096};

// Signal handler installed for SIGINT: only clears the run flag so the loops
// can finish their current iteration and exit.
void signal_handler(int) {
    g_running = false;
}
/// Prints a human-readable report of the collected statistics to std::cout.
///
/// @param stats  Figures to print; latencies are reported in microseconds with
///               two decimals.
///
/// The fixed/precision formatting is applied only for the duration of the
/// report; the stream's previous flags and precision are restored afterwards
/// so later output is not affected.
void print_stats(const PerfStats& stats) {
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const std::streamsize saved_precision = std::cout.precision();

    const std::string rule(80, '-');

    std::cout << "\nPerformance Test Results:\n";
    std::cout << rule << '\n';
    std::cout << std::fixed << std::setprecision(2);
    std::cout << "Message Size: " << stats.message_size << " bytes\n";
    std::cout << "Sample Count: " << stats.sample_count << "\n\n";
    std::cout << "Latency (microseconds):\n";
    std::cout << " Min: " << stats.min_latency_us << "\n";
    std::cout << " P50: " << stats.p50_latency_us << "\n";
    std::cout << " P90: " << stats.p90_latency_us << "\n";
    std::cout << " P99: " << stats.p99_latency_us << "\n";
    std::cout << " Max: " << stats.max_latency_us << "\n";
    std::cout << " Average: " << stats.avg_latency_us << "\n\n";
    std::cout << "Throughput: " << stats.throughput_mbps << " MB/s\n";
    std::cout << rule << '\n';

    std::cout.flags(saved_flags);
    std::cout.precision(saved_precision);
}
void run_producer(const TestConfig& config) {
signal(SIGINT, signal_handler);
// Create channels for sending requests and receiving responses
ipc::channel req_channel{REQ_CHANNEL, ipc::sender};
ipc::channel resp_channel{RESP_CHANNEL, ipc::receiver};
std::cout << "Waiting for consumer...\n";
// Prepare test data - use max_msg_size for buffer allocation
std::vector<char> send_buffer(config.max_msg_size);
auto* payload = reinterpret_cast<TlmPayload*>(send_buffer.data());
payload->command = 1;
payload->address = 0x1000;
payload->data_length = config.data_size;
payload->byte_enable_length = config.byte_enable_size;
// Initialize random number generators
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned char> data_dist(0, 255); // For random data
std::uniform_int_distribution<size_t> size_dist(0, DATA_SIZES.size() - 1); // For random size
// Warmup phase
std::cout << "Starting warmup...\n";
for (int i = 0; i < WARMUP_COUNT && g_running; i++) {
if (!req_channel.send(send_buffer.data(), config.total_size)) {
std::cerr << "Warmup send failed\n";
auto recv_data = resp_channel.recv();
if (recv_data.empty() || recv_data.size() != sizeof(int)) {
std::cerr << "Warmup receive failed\n";
if ((i + 1) % 100 == 0) {
std::cout << "Warmup progress: " << (i + 1) << "/" << WARMUP_COUNT << "\r" << std::flush;
std::cout << "\nWarmup completed\n";
// Performance test phase
std::vector<double> latencies;
std::cout << "Starting performance test...\n";
for (int i = 0; i < TEST_COUNT && g_running; i++) {
// Select random data size
size_t current_data_size = DATA_SIZES[size_dist(gen)];
// Update payload size information
payload->data_length = current_data_size;
payload->byte_enable_length = current_data_size;
// Fill with random data
char* data = payload->data;
for (size_t j = 0; j < current_data_size; ++j) {
data[j] = static_cast<char>(data_dist(gen));
for (size_t j = 0; j < current_data_size; ++j) {
data[current_data_size + j] = 0xff;
// Get current timestamp
payload->timestamp = std::chrono::duration_cast<std::chrono::nanoseconds>(
// Send and measure latency
auto start = std::chrono::high_resolution_clock::now();
if (!req_channel.send(send_buffer.data(), sizeof(TlmPayload) + 2 * current_data_size)) {
std::cerr << "Send failed at iteration " << i << "\n";
auto recv_data = resp_channel.recv();
if (recv_data.empty() || recv_data.size() != sizeof(int)) {
std::cerr << "Receive failed at iteration " << i << "\n";
auto end = std::chrono::high_resolution_clock::now();
double latency = std::chrono::duration<double, std::micro>(end - start).count();
if ((i + 1) % 1000 == 0) {
std::cout << "Progress: " << (i + 1) << "/" << TEST_COUNT << "\r" << std::flush;
std::cout << "\nTest completed\n";
// Calculate statistics
PerfStats stats{};
stats.message_size = config.total_size;
stats.sample_count = latencies.size();
if (!latencies.empty()) {
std::sort(latencies.begin(), latencies.end());
stats.min_latency_us = latencies.front();
stats.max_latency_us = latencies.back();
stats.p50_latency_us = latencies[latencies.size() * 50 / 100];
stats.p90_latency_us = latencies[latencies.size() * 90 / 100];
stats.p99_latency_us = latencies[latencies.size() * 99 / 100];
stats.avg_latency_us = std::accumulate(latencies.begin(), latencies.end(), 0.0) / latencies.size();
stats.throughput_mbps = (config.total_size * latencies.size()) / (stats.avg_latency_us);
// Send exit command to consumer
std::cout << "Sending exit command to consumer...\n";
payload->command = EXIT_COMMAND;
if (!req_channel.send(send_buffer.data(), config.total_size)) {
std::cerr << "Failed to send exit command\n";
// Clean up - disconnect will clean up the channels
void run_consumer(const TestConfig& config) {
signal(SIGINT, signal_handler);
// Create channels for receiving requests and sending responses
ipc::channel req_channel{REQ_CHANNEL, ipc::receiver};
ipc::channel resp_channel{RESP_CHANNEL, ipc::sender};
std::cout << "Consumer ready. Waiting for producer...\n";
// Wait for producer to connect
int response = 1; // Simple integer response
size_t messages_received = 0;
std::cout << "Connected to producer. Processing messages...\n";
// Process messages
while (g_running) {
// Receive request
auto recv_data = req_channel.recv();
if (recv_data.empty() || recv_data.size() < sizeof(TlmPayload)) {
if (g_running) { // Only report error if not shutting down
std::cerr << "Receive failed, size=" << recv_data.size() << "\n";
// Get payload and check for exit command
auto* payload = reinterpret_cast<const TlmPayload*>(recv_data.data());
if (payload->command == EXIT_COMMAND) {
std::cout << "\nReceived exit command\n";
// Send response immediately - just an integer
if (!resp_channel.send(&response, sizeof(response))) {
if (g_running) { // Only report error if not shutting down
std::cerr << "Send failed\n";
if (messages_received % 1000 == 0) {
std::cout << "Processed " << messages_received << " messages\r" << std::flush;
std::cout << "\nConsumer processed " << messages_received << " messages\n";
// Clean up - disconnect will clean up the channels
/// Writes the command-line usage text to std::cerr.
///
/// @param program  Executable name to show (normally argv[0]); a null pointer
///                 is replaced by a placeholder, since streaming a null
///                 `const char*` is not allowed.
void print_usage(const char* program) {
    const char* shown = program ? program : "<program>";
    std::cerr << "Usage: " << shown << " [producer|consumer]\n"
              << "Example:\n"
              << " Terminal 1: " << shown << " consumer\n"
              << " Terminal 2: " << shown << " producer\n";
}
int main(int argc, char* argv[]) {
if (argc != 2) {
return 1;
std::string mode(argv[1]);
TestConfig config{
4096, // data_size
4096, // byte_enable_size
sizeof(TlmPayload) + 2 * 4096, // total_size (header + data + byte_enable)
sizeof(TlmPayload) + 2 * 4096 // max_msg_size (使用最大可能的数据大小)
std::cout << "Configuration:\n"
<< " Mode: " << mode << "\n"
<< " Data size: " << config.data_size << " bytes\n"
<< " Byte enable size: " << config.byte_enable_size << " bytes\n"
<< " Total message size: " << config.total_size << " bytes\n\n";
try {
if (mode == "producer") {
else if (mode == "consumer") {
else {
return 1;
catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
return 0;
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <csignal>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <thread>
#include <vector>

#include <boost/interprocess/ipc/message_queue.hpp>
using namespace boost::interprocess;
// Names of the request and response message queues.
const char* REQ_QUEUE{"tlm_perf_req"};
const char* RESP_QUEUE{"tlm_perf_resp"};
// Number of untimed warm-up round trips and of timed round trips.
const int WARMUP_COUNT{1000};
const int TEST_COUNT{10000};
// Queue capacity (maximum number of queued messages).
const int MAX_MESSAGES{128};
// Data lengths, in bytes, that the producer picks from at random.
const std::array<size_t, 8> DATA_SIZES{32, 64, 128, 256, 512, 1024, 2048, 4096};
// TLM-like payload header (fixed-size header, variable-length body).
// A message on the wire is this header immediately followed by `data_length`
// bytes of data and then `byte_enable_length` bytes of byte-enable mask.
struct TlmPayload {
    uint64_t command;             // Command type
    uint64_t address;             // Target address
    uint32_t data_length;         // Length of data
    uint32_t byte_enable_length;  // Length of byte_enable
    uint64_t timestamp;           // Timestamp for latency measurement
    char data[0];                 // Variable length data followed by byte_enable
                                  // (zero-length array: a compiler extension)
};
// Performance statistics gathered by the producer and printed by print_stats().
// Latencies are in microseconds; value-initialize (`PerfStats s{}`) to zero all fields.
struct PerfStats {
    double min_latency_us;
    double max_latency_us;
    double avg_latency_us;
    double p50_latency_us;
    double p90_latency_us;
    double p99_latency_us;
    double throughput_mbps;  // Printed with the unit "MB/s"
    size_t message_size;     // Printed as "Message Size" in bytes
    size_t sample_count;     // Number of latency samples behind the figures
};
// Global flag for graceful shutdown; the benchmark loops poll it each iteration.
std::atomic<bool> g_running{true};

// Signal handler installed for SIGINT: only clears the run flag so the loops
// can finish their current iteration and exit.
void signal_handler(int) {
    g_running = false;
}
/// Prints a human-readable report of the collected statistics to std::cout.
///
/// @param stats  Figures to print; latencies are reported in microseconds with
///               two decimals.
///
/// The fixed/precision formatting is applied only for the duration of the
/// report; the stream's previous flags and precision are restored afterwards
/// so later output is not affected.
void print_stats(const PerfStats& stats) {
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const std::streamsize saved_precision = std::cout.precision();

    const std::string rule(80, '-');

    std::cout << "\nPerformance Test Results:\n";
    std::cout << rule << '\n';
    std::cout << std::fixed << std::setprecision(2);
    std::cout << "Message Size: " << stats.message_size << " bytes\n";
    std::cout << "Sample Count: " << stats.sample_count << "\n\n";
    std::cout << "Latency (microseconds):\n";
    std::cout << " Min: " << stats.min_latency_us << "\n";
    std::cout << " P50: " << stats.p50_latency_us << "\n";
    std::cout << " P90: " << stats.p90_latency_us << "\n";
    std::cout << " P99: " << stats.p99_latency_us << "\n";
    std::cout << " Max: " << stats.max_latency_us << "\n";
    std::cout << " Average: " << stats.avg_latency_us << "\n\n";
    std::cout << "Throughput: " << stats.throughput_mbps << " MB/s\n";
    std::cout << rule << '\n';

    std::cout.flags(saved_flags);
    std::cout.precision(saved_precision);
}
void cleanup_queues() {
void run_consumer(size_t msg_size) {
signal(SIGINT, signal_handler);
try {
// Create message queues
message_queue req_queue(create_only, REQ_QUEUE, MAX_MESSAGES, msg_size);
message_queue resp_queue(create_only, RESP_QUEUE, MAX_MESSAGES, sizeof(uint64_t));
std::cout << "Consumer ready. Processing messages...\n";
// Pre-allocate buffer
std::vector<char> recv_buffer(msg_size);
uint64_t response;
size_t messages_received = 0;
while (g_running) {
size_t recvd_size;
unsigned int priority;
// Receive request
req_queue.receive(recv_buffer.data(), msg_size, recvd_size, priority);
// Get timestamp and check for exit command
auto* payload = reinterpret_cast<const TlmPayload*>(recv_buffer.data());
if (payload->command == 0xFFFFFFFFFFFFFFFF) {
std::cout << "\nReceived exit command\n";
// Send response immediately
response = payload->timestamp;
resp_queue.send(&response, sizeof(response), 0);
if (messages_received % 1000 == 0) {
std::cout << "Processed " << messages_received << " messages\r" << std::flush;
std::cout << "\nConsumer processed " << messages_received << " messages\n";
catch (interprocess_exception& ex) {
std::cerr << "Consumer error: " << ex.what() << std::endl;
void run_producer(size_t data_size) {
signal(SIGINT, signal_handler);
try {
size_t msg_size = sizeof(TlmPayload) + 2 * data_size; // data + byte_enable
// Wait for consumer to create queues
std::cout << "Waiting for consumer...\n";
message_queue* req_queue = nullptr;
message_queue* resp_queue = nullptr;
while (g_running && (!req_queue || !resp_queue)) {
try {
if (!req_queue) req_queue = new message_queue(open_only, REQ_QUEUE);
if (!resp_queue) resp_queue = new message_queue(open_only, RESP_QUEUE);
catch (interprocess_exception&) {
std::cout << "." << std::flush;
if (!g_running) {
delete req_queue;
delete resp_queue;
std::cout << "\nConnected to consumer\n";
// Prepare test data
std::vector<char> send_buffer(msg_size);
auto* payload = reinterpret_cast<TlmPayload*>(send_buffer.data());
payload->command = 1;
payload->address = 0x1000;
payload->data_length = data_size;
payload->byte_enable_length = data_size;
// Initialize random number generators
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned char> data_dist(0, 255); // For random data
std::uniform_int_distribution<size_t> size_dist(0, DATA_SIZES.size() - 1); // For random size
uint64_t response;
// Warmup phase
std::cout << "Starting warmup...\n";
for (int i = 0; i < WARMUP_COUNT && g_running; i++) {
// Select random data size
size_t current_data_size = DATA_SIZES[size_dist(gen)];
// Update payload size information
payload->data_length = current_data_size;
payload->byte_enable_length = current_data_size;
// Fill with random data and fixed byte_enable
char* data = payload->data;
for (size_t j = 0; j < current_data_size; ++j) {
data[j] = static_cast<char>(data_dist(gen));
for (size_t j = 0; j < current_data_size; ++j) {
data[current_data_size + j] = 0xff; // Fixed byte_enable value
req_queue->send(send_buffer.data(), sizeof(TlmPayload) + 2 * current_data_size, 0);
size_t recvd_size;
unsigned int priority;
resp_queue->receive(&response, sizeof(response), recvd_size, priority);
if ((i + 1) % 100 == 0) {
std::cout << "Warmup progress: " << (i + 1) << "/" << WARMUP_COUNT << "\r" << std::flush;
if (!g_running) {
delete req_queue;
delete resp_queue;
std::cout << "\nWarmup completed\n";
// Performance test phase
std::vector<double> latencies;
std::cout << "Starting performance test...\n";
for (int i = 0; i < TEST_COUNT && g_running; i++) {
// Select random data size
size_t current_data_size = DATA_SIZES[size_dist(gen)];
// Update payload size information
payload->data_length = current_data_size;
payload->byte_enable_length = current_data_size;
// Fill with random data and fixed byte_enable
char* data = payload->data;
for (size_t j = 0; j < current_data_size; ++j) {
data[j] = static_cast<char>(data_dist(gen));
for (size_t j = 0; j < current_data_size; ++j) {
data[current_data_size + j] = 0xff; // Fixed byte_enable value
// Get timestamp before timing measurement
payload->timestamp = std::chrono::duration_cast<std::chrono::nanoseconds>(
auto start = std::chrono::high_resolution_clock::now();
req_queue->send(send_buffer.data(), sizeof(TlmPayload) + 2 * current_data_size, 0);
size_t recvd_size;
unsigned int priority;
resp_queue->receive(&response, sizeof(response), recvd_size, priority);
auto end = std::chrono::high_resolution_clock::now();
double latency = std::chrono::duration<double, std::micro>(end - start).count();
if ((i + 1) % 1000 == 0) {
std::cout << "Progress: " << (i + 1) << "/" << TEST_COUNT << "\r" << std::flush;
if (!g_running) {
delete req_queue;
delete resp_queue;
std::cout << "\nTest completed\n";
// Calculate statistics
PerfStats stats{};
stats.message_size = msg_size;
stats.sample_count = latencies.size();
if (!latencies.empty()) {
std::sort(latencies.begin(), latencies.end());
stats.min_latency_us = latencies.front();
stats.max_latency_us = latencies.back();
stats.p50_latency_us = latencies[latencies.size() * 50 / 100];
stats.p90_latency_us = latencies[latencies.size() * 90 / 100];
stats.p99_latency_us = latencies[latencies.size() * 99 / 100];
stats.avg_latency_us = std::accumulate(latencies.begin(), latencies.end(), 0.0) / latencies.size();
stats.throughput_mbps = (msg_size * latencies.size()) / (stats.avg_latency_us);
// Send exit command
std::cout << "Sending exit command to consumer...\n";
payload->command = 0xFFFFFFFFFFFFFFFF;
req_queue->send(send_buffer.data(), msg_size, 0);
delete req_queue;
delete resp_queue;
catch (interprocess_exception& ex) {
std::cerr << "Producer error: " << ex.what() << std::endl;
/// Writes the command-line usage text to std::cerr.
///
/// @param program  Executable name to show (normally argv[0]); a null pointer
///                 is replaced by a placeholder, since streaming a null
///                 `const char*` is not allowed.
void print_usage(const char* program) {
    const char* shown = program ? program : "<program>";
    std::cerr << "Usage: " << shown << " [producer|consumer]\n"
              << "Example:\n"
              << " Terminal 1: " << shown << " consumer\n"
              << " Terminal 2: " << shown << " producer\n";
}
int main(int argc, char* argv[]) {
if (argc != 2) {
return 1;
std::string mode(argv[1]);
try {
if (mode == "producer") {
run_producer(4096); // Use max size for buffer allocation
else if (mode == "consumer") {
size_t max_msg_size = sizeof(TlmPayload) + 2 * 4096; // Use max possible message size
else {
return 1;
catch (std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
return 0;
