gazelle/src/common/dpdk_common.h-代码预览-gazelle:基于 DPDK 与 LwIP 的用户态协议栈项目 - AtomGit

LLemmy Huangmempool: fix copy_mbuf_private
b189e235创建于 2025年7月5日历史提交
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* gazelle is licensed under the Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*     http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
* PURPOSE.
* See the Mulan PSL v2 for more details.
*/

#ifndef __GAZELLE_DPDK_COMMON_H__
#define __GAZELLE_DPDK_COMMON_H__

#include <stdbool.h>
#include <rte_mbuf.h>
#include <rte_ring.h>
#include <lwip/pbuf.h>
#include <lwip/dpdk_version.h>

#include "gazelle_dfx_msg.h"
#include "gazelle_opt.h"

#define GAZELLE_KNI_NAME                     "kni"   // will be removed during dpdk update

#define GAZELLE_LATENCY_RD      0
#define GAZELLE_LATENCY_WR      1

/* Layout:
 * | rte_mbuf | mbuf_private | payload |
 * |   128    |              |         |
 **/
/* Latency-measurement record; embedded as the `lt` member of struct mbuf_private. */
struct latency_timestamp {
        uint64_t stamp; // time stamp
        uint64_t check; // just for a later validity check; the time_stamp_into_*() helpers store ~stamp here
        uint16_t stamp_seg[GAZELLE_LATENCY_MAX]; // time stamp segment (GAZELLE_LATENCY_MAX is defined outside this header)
        uint16_t type;  // latency type: GAZELLE_LATENCY_RD or GAZELLE_LATENCY_WR
};
/* Private per-mbuf data; mbuf_to_private() locates it directly after struct rte_mbuf. */
struct mbuf_private {
    /* struct pbuf_custom must be the first member. Do not copy it in copy_mbuf_private() !!! */
    struct pbuf_custom pc;
    /* The stack to which the buf belongs. Do not copy it in copy_mbuf_private() !!! */
    int stack_id;

    /* Latency timestamps; the only member that copy_mbuf_private() copies. */
    struct latency_timestamp lt;
};

static __rte_always_inline struct mbuf_private *mbuf_to_private(const struct rte_mbuf *m)
{
    return (struct mbuf_private *)RTE_PTR_ADD(m, sizeof(struct rte_mbuf));
}
static __rte_always_inline struct pbuf *mbuf_to_pbuf(const struct rte_mbuf *m)
{
    return &mbuf_to_private(m)->pc.pbuf;
}
/* Return the private area of the mbuf that pbuf_to_mbuf() reports for pbuf p. */
static __rte_always_inline struct mbuf_private *pbuf_to_private(const struct pbuf *p)
{
    const struct rte_mbuf *owner = pbuf_to_mbuf(p);

    return mbuf_to_private(owner);
}

/* Copy only the latency timestamp from src to dst; pc and stack_id are deliberately left untouched. */
static __rte_always_inline void copy_mbuf_private(struct mbuf_private *dst, const struct mbuf_private *src)
{
    const struct latency_timestamp *from = &src->lt;
    struct latency_timestamp *to = &dst->lt;

    rte_memcpy(to, from, sizeof(*to));
}

/* NOTE!!! magic code, even the order.
*  I wrote it carefully, and checked the assembly. For example, there are 24 instructions on A72,
*  and if there is no cache miss, it takes less than 20 cycles (the store pipe is the bottleneck).
*
*  Copies ol_flags, tx_offload, the first 16 bytes starting at rx_descriptor_fields1, data_len bytes
*  of packet data and the latency timestamp from src to dst.
*  Returns silently, without modifying dst, when src->data_len exceeds RTE_MBUF_DEFAULT_BUF_SIZE.
*  NOTE(review): the bound is a constant, not the room dst really has after its data offset --
*  confirm that every dst mbuf provides at least that many bytes.
*/
static __rte_always_inline void copy_mbuf(struct rte_mbuf *dst, const struct rte_mbuf *src)
{
    /* In the tx direction, data is copied from lstack to ltran. It is necessary to check whether
       the length of the data transmitted from lstack has been tampered with, to prevent overflow.
    */
    uint16_t data_len = src->data_len;
    if (data_len > RTE_MBUF_DEFAULT_BUF_SIZE)
        return;

    dst->ol_flags = src->ol_flags;
    dst->tx_offload = src->tx_offload;
    // there is buf_len in rx_descriptor_fields1, copying it is actually dangerous. 16 : mbuf desc size
    rte_memcpy((uint8_t *)dst->rx_descriptor_fields1, (const uint8_t *)src->rx_descriptor_fields1, 16);

    /* payload: data_len bytes from the start of src's packet data to the start of dst's */
    uint8_t *dst_data = rte_pktmbuf_mtod(dst, void*);
    uint8_t *src_data = rte_pktmbuf_mtod(src, void*);
    rte_memcpy(dst_data, src_data, data_len);

    /* only the latency timestamp of the private area is carried over */
    copy_mbuf_private(mbuf_to_private(dst), mbuf_to_private(src));
}

static __rte_always_inline void time_stamp_into_mbuf(uint32_t rx_count, struct rte_mbuf *buf[], uint64_t time_stamp)
{
    struct latency_timestamp *lt;
    for (uint32_t i = 0; i < rx_count; i++) {
        lt = &mbuf_to_private(buf[i])->lt;
        lt->stamp = time_stamp;
        lt->check = ~(time_stamp);
        lt->type = GAZELLE_LATENCY_RD;
    }
}

static __rte_always_inline void time_stamp_into_pbuf(uint32_t tx_count, struct pbuf *buf[], uint64_t time_stamp)
{
    struct latency_timestamp *lt;
    for (uint32_t i = 0; i < tx_count; i++) {
        lt = &pbuf_to_private(buf[i])->lt;
        lt->stamp = time_stamp;
        lt->check = ~(time_stamp);
        lt->type = GAZELLE_LATENCY_WR;
    }
}

/* KNI helpers: declared here only, the definitions live outside this header. */
bool get_kni_started(void);
struct rte_kni* get_gazelle_kni(void);
int32_t dpdk_kni_init(uint16_t port, struct rte_mempool *pool);
int32_t kni_process_tx(struct rte_mbuf **pkts_burst, uint32_t count);
void kni_process_rx(uint16_t port);
void dpdk_kni_release(void);

/* Forward declarations: the prototype below only uses pointers to these types. */
struct rte_eth_conf;
struct rte_eth_dev_info;
void eth_params_checksum(struct rte_eth_conf *conf, struct rte_eth_dev_info *dev_info);

/*
    gazelle custom rte ring interface
    one thread enqueues and dequeues; another thread reads objects and uses them while they are still in the queue,
    so malloc and free happen in the same thread. only supports a single producer and a single consumer.

    cons.tail            prod.tail                prod.head                 cons.head
    gazelle_ring_sp_enqueue: cons.head-->> cons.tail, enqueue object
    gazelle_ring_sc_dequeue: cons.tail-->> prod.tail, dequeue object
    gazelle_ring_read:       prod.head-->> cons.head, read object, prod.head = prod.tail + N
    gazelle_ring_read_over:  prod.tail  =  prod.head, update prod.tail
 */
/*
 * Enqueue up to n pointers from obj_table into the ring, starting at index cons.head.
 * Free space is capacity + cons.tail - cons.head; n is clamped to it.
 * Returns the number of objects enqueued, or 0 when there is no free space.
 */
static __rte_always_inline uint32_t gazelle_ring_sp_enqueue(struct rte_ring *r, void *const *obj_table, uint32_t n)
{
    uint32_t head = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE);
    uint32_t tail = r->cons.tail;

    /* uint32_t arithmetic is modular, so the difference is still right after the indices wrap */
    uint32_t free_entries = r->capacity + tail - head;
    if (unlikely(free_entries == 0))
        return 0;
    if (n > free_entries)
        n = free_entries;

    __rte_ring_enqueue_elems(r, head, obj_table, sizeof(void *), n);

    /* release store: the new cons.head is published after the enqueue call above;
       gazelle_ring_read() loads cons.head with acquire */
    __atomic_store_n(&r->cons.head, head + n, __ATOMIC_RELEASE);

    return n;
}

/*
 * Dequeue up to n pointers into obj_table, starting at index cons.tail.
 * Only the entries between cons.tail and prod.tail are eligible, i.e. those already
 * released through gazelle_ring_read_over().
 * Returns the number of objects dequeued, or 0 when none is eligible.
 */
static __rte_always_inline uint32_t gazelle_ring_sc_dequeue(struct rte_ring *r, void **obj_table, uint32_t n)
{
    /* acquire load: pairs with the release store of prod.tail in gazelle_ring_read_over() */
    uint32_t prod = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE);
    uint32_t cons = r->cons.tail;

    uint32_t entries = prod - cons;
    if (unlikely(entries == 0))
        return 0;
    if (n > entries)
        n = entries;

    __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n);

    /* advancing cons.tail is what gives slots back to gazelle_ring_sp_enqueue() */
    __atomic_store_n(&r->cons.tail, cons + n, __ATOMIC_RELEASE);

    return n;
}

/*
 * Copy up to n pointers into obj_table, starting at index prod.head, without dequeuing them.
 * Only prod.head is advanced here; cons.tail, which bounds the free space seen by
 * gazelle_ring_sp_enqueue(), is not touched, so the objects stay in the ring.
 * Returns the number of objects read, or 0 when n is 0 or nothing is readable.
 */
static __rte_always_inline uint32_t gazelle_ring_read(struct rte_ring *r, void **obj_table, uint32_t n)
{
    /* acquire load: pairs with the release store of cons.head in gazelle_ring_sp_enqueue() */
    uint32_t cons = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE);
    uint32_t prod = r->prod.head;

    /* readable entries: enqueued (cons.head) but not yet read (prod.head) */
    const uint32_t entries = cons - prod;
    if (n > entries) {
        n = entries;
    }
    if (unlikely(n == 0)) {
        return 0;
    }

    __rte_ring_dequeue_elems(r, prod, obj_table, sizeof(void *), n);

    /* plain store; prod.tail is only published later by gazelle_ring_read_over() */
    r->prod.head = prod + n;

    return n;
}

/* Mark everything read so far as finished: publish prod.head as the new prod.tail (release store). */
static __rte_always_inline void gazelle_ring_read_over(struct rte_ring *r)
{
    uint32_t read_pos = r->prod.head;

    __atomic_store_n(&r->prod.tail, read_pos, __ATOMIC_RELEASE);
}

/* Number of entries between cons.tail and prod.tail, i.e. read-over entries not yet dequeued. */
static __rte_always_inline uint32_t gazelle_ring_readover_count(struct rte_ring *r)
{
    rte_smp_rmb();

    uint32_t released = r->prod.tail;
    uint32_t dequeued = r->cons.tail;

    return released - dequeued;
}
/* Number of entries between prod.head and cons.head, i.e. enqueued entries not yet read. */
static __rte_always_inline uint32_t gazelle_ring_readable_count(const struct rte_ring *r)
{
    rte_smp_rmb();

    uint32_t enqueued = r->cons.head;
    uint32_t already_read = r->prod.head;

    return enqueued - already_read;
}

/* Number of entries currently held by the ring: cons.head - cons.tail. */
static __rte_always_inline uint32_t gazelle_ring_count(const struct rte_ring *r)
{
    rte_smp_rmb();

    uint32_t enqueued = r->cons.head;
    uint32_t dequeued = r->cons.tail;

    return enqueued - dequeued;
}
/* Number of free slots: the ring capacity minus the entries reported by gazelle_ring_count(). */
static __rte_always_inline uint32_t gazelle_ring_free_count(const struct rte_ring *r)
{
    uint32_t in_use = gazelle_ring_count(r);

    return r->capacity - in_use;
}
#endif