Redis源码剖析（1）：SkipList

1 前言

SkipList由William Pugh在他的论文《Skip Lists: A Probabilistic Alternative to Balanced Trees》中提出，是一种可以用来代替平衡树的数据结构。它采取了概率平衡而不是严格强制的平衡，因此插入和删除的算法比平衡树的等效算法简单得多，速度也快得多。

1.1 空间开销

SkipList的底层是单向链表，区别在于每个链表节点有多层指向后继节点的指针；每个节点的层数由随机层数算法生成，而该算法由常数p决定，因此，SkipList的额外空间开销与p相关。

根据随机层数算法，节点层数不小于k的概率为p^(k-1)，因此每个节点的平均层数为1+p+p^2+…=1/(1-p)，n个节点的SkipList平均共需约n/(1-p)个forward指针，这就是SkipList的总体空间开销。

1.2 时间复杂度

SkipList的查找、插入和删除的平均（期望）时间复杂度均为O(log(n))；由于层数是随机生成的，最坏情况下会退化为O(n)，但概率极低。

2 Redis中的实现

在Redis中，SkipList主要作为Sorted Set的底层数据结构，它的实现几乎是William Pugh在论文中描述的原始算法的C语言翻译，只在以下三个方面做了修改：

该实现允许重复的分数（score）。
比较节点时不仅比较键（即score），score相等时还会比较对应的元素（ele）。
每个节点有一个后退（backward）指针，所以第一层是双向链表，以此允许从尾到头的遍历。

2.1 数据结构

/* A skiplist node. The trailing flexible array level[] holds one
 * forward/span pair for every level this node takes part in. */
typedef struct zskiplistNode {
    sds ele; // element string; compared with sdscmp() to order nodes that share a score
    double score; // primary sort key
    struct zskiplistNode *backward; // previous node on the lowest level (level[0]) only
    struct zskiplistLevel {
        struct zskiplistNode *forward; // next node on this level
        unsigned long span; // distance to the next node on this level
    } level[];
} zskiplistNode;

/* The skiplist itself. */
typedef struct zskiplist {
    struct zskiplistNode *header; // dummy head node, created with ZSKIPLIST_MAXLEVEL levels
    struct zskiplistNode *tail; // last node; NULL while the list is empty
    unsigned long length; // number of nodes, not counting the header
    int level; // highest level currently in use (1 for a fresh list)
} zskiplist;

2.2 SkipList核心API

2.2.1 初始化

/* Create a new, empty skiplist.
 *
 * The list starts with level 1, length 0 and no tail. Its header is a dummy
 * node allocated with ZSKIPLIST_MAXLEVEL levels, every one of which has a
 * NULL forward pointer and a span of 0. */
zskiplist *zslCreate(void) {
    zskiplist *list = zmalloc(sizeof(*list));
    int lvl;

    list->level = 1;
    list->length = 0;
    list->tail = NULL;

    /* The header carries no element; it only anchors every level. */
    list->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
    list->header->backward = NULL;
    for (lvl = ZSKIPLIST_MAXLEVEL-1; lvl >= 0; lvl--) {
        list->header->level[lvl].span = 0;
        list->header->level[lvl].forward = NULL;
    }
    return list;
}

2.2.2 添加

/* Random level generator for a new node.
 *
 * Starts at level 1 and keeps adding a level for as long as the low 16 bits
 * of random() are below ZSKIPLIST_P * 0xFFFF. The result is clamped to
 * ZSKIPLIST_MAXLEVEL, so the return value is in [1, ZSKIPLIST_MAXLEVEL]. */
int zslRandomLevel(void) {
    int height;

    for (height = 1; (random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF); height++)
        ;
    if (height < ZSKIPLIST_MAXLEVEL)
        return height;
    return ZSKIPLIST_MAXLEVEL;
}

/* Insert a new node with the given score and element and return it.
 *
 * The element is assumed not to be in the list already: duplicated scores
 * are allowed, but re-inserting the same element is not detected here, so
 * the caller has to check for that beforehand.
 *
 * 'score' must not be NaN (asserted). */
zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; /* predecessor of the new node on each level */
    zskiplistNode *x;

    /* rank[i] is the number of nodes crossed to reach update[i]. It is a sum
     * of 'span' values, which are unsigned long, so it uses the same type
     * to avoid truncating the sum on very large lists. */
    unsigned long rank[ZSKIPLIST_MAXLEVEL];
    int i, level;

    serverAssert(!isnan(score));
    x = zsl->header;
    /* Walk from the top level down; moving to the next iteration is the
     * equivalent of following a "down" pointer. */
    for (i = zsl->level-1; i >= 0; i--) {
        /* store rank that is crossed to reach the insert position */
        rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];

        /* Move forward while the next node exists and sorts before the new
         * one: a smaller score, or an equal score with a smaller element. */
        while (x->level[i].forward &&
                (x->level[i].forward->score < score ||
                    (x->level[i].forward->score == score &&
                    sdscmp(x->level[i].forward->ele,ele) < 0)))
        {
            rank[i] += x->level[i].span;
            x = x->level[i].forward;
        }

        /* When the walk stops, x is the predecessor of the new node on
         * this level. */
        update[i] = x;
    }
    /* we assume the element is not already inside, since we allow duplicated
     * scores, reinserting the same element should never happen since the
     * caller of zslInsert() should test in the hash table if the element is
     * already inside or not. */

    level = zslRandomLevel();

    /* If the new node is taller than the list, the header becomes its
     * predecessor on the newly used levels. Those header links currently
     * span the whole list. */
    if (level > zsl->level) {
        for (i = zsl->level; i < level; i++) {
            rank[i] = 0;
            update[i] = zsl->header;
            update[i]->level[i].span = zsl->length;
        }
        zsl->level = level;
    }

    x = zslCreateNode(level,score,ele);
    for (i = 0; i < level; i++) {
        /* Splice the node in after its predecessor on this level. */
        x->level[i].forward = update[i]->level[i].forward;
        update[i]->level[i].forward = x;

        /* update span covered by update[i] as x is inserted here:
         * (rank[0] - rank[i]) is the number of nodes between update[i] and
         * the insert position. */
        x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]);
        update[i]->level[i].span = (rank[0] - rank[i]) + 1;
    }

    /* increment span for untouched levels: the links above the new node's
     * height now jump over one more node. */
    for (i = level; i < zsl->level; i++) {
        update[i]->level[i].span++;
    }

    /* Maintain the level-0 backward links and the tail pointer. The header
     * is never exposed as a backward target. */
    x->backward = (update[0] == zsl->header) ? NULL : update[0];
    if (x->level[0].forward)
        x->level[0].forward->backward = x;
    else
        zsl->tail = x;

    zsl->length++;
    return x;
}

2.2.3 删除

/* Unlink node 'x' from the skiplist without freeing it.
 * Internal function used by zslDelete, zslDeleteRangeByScore and
 * zslDeleteRangeByRank.
 *
 * 'update' must hold, for every level below zsl->level, the last node that
 * precedes 'x' on that level. */
void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
    int lvl;

    for (lvl = 0; lvl < zsl->level; lvl++) {
        zskiplistNode *prev = update[lvl];

        if (prev->level[lvl].forward != x) {
            /* x is not linked on this level (it is shorter than the list),
             * so this link merely jumps over it: it now covers one node
             * fewer. */
            prev->level[lvl].span -= 1;
            continue;
        }
        /* x is linked here: the predecessor takes over x's link and the
         * distance x used to cover, minus x itself. */
        prev->level[lvl].span += x->level[lvl].span - 1;
        prev->level[lvl].forward = x->level[lvl].forward;
    }

    /* Repair the level-0 backward chain, or move the tail when x was last. */
    if (x->level[0].forward == NULL)
        zsl->tail = x->backward;
    else
        x->level[0].forward->backward = x->backward;

    /* Drop top levels that became empty, always keeping at least one. */
    for (; zsl->level > 1; zsl->level--) {
        if (zsl->header->level[zsl->level-1].forward != NULL) break;
    }
    zsl->length--;
}

/* Remove the node matching both 'score' and 'ele' from the skiplist.
 * Returns 1 when such a node was found and unlinked, 0 otherwise.
 *
 * When 'node' is NULL the unlinked node is released with zslFreeNode().
 * Otherwise the node is only unlinked and handed back through *node, so the
 * caller can reuse it (including the SDS string referenced by node->ele). */
int zslDelete(zskiplist *zsl, double score, sds ele, zskiplistNode **node) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL];
    zskiplistNode *cur = zsl->header;
    zskiplistNode *next, *hit;
    int lvl = zsl->level;

    /* Top-down search that records the predecessor on every level. */
    while (lvl-- > 0) {
        for (;;) {
            next = cur->level[lvl].forward;
            if (next == NULL) break;
            if (!(next->score < score ||
                  (next->score == score && sdscmp(next->ele,ele) < 0)))
                break;
            cur = next;
        }
        update[lvl] = cur;
    }

    /* Several nodes may share the score, so the candidate must match on
     * both the score and the element. */
    hit = cur->level[0].forward;
    if (hit == NULL || score != hit->score || sdscmp(hit->ele,ele) != 0)
        return 0; /* not found */

    zslDeleteNode(zsl, hit, update);
    if (node)
        *node = hit;
    else
        zslFreeNode(hit);
    return 1;
}

2.2.4 范围判断

/* Return 1 if some part of the skiplist may fall inside 'range', else 0.
 *
 * Only the two ends of the list are inspected: the tail is checked against
 * the lower bound and the first node against the upper bound, using
 * zslValueGteMin() / zslValueLteMax() (defined elsewhere). */
int zslIsInRange(zskiplist *zsl, zrangespec *range) {
    zskiplistNode *last, *first;

    /* Ranges that can never contain anything. */
    if (range->min > range->max) return 0;
    if (range->min == range->max && (range->minex || range->maxex)) return 0;

    /* Empty list, or even the last node fails the lower-bound test. */
    last = zsl->tail;
    if (last == NULL) return 0;
    if (!zslValueGteMin(last->score,range)) return 0;

    /* Even the first node fails the upper-bound test. */
    first = zsl->header->level[0].forward;
    if (first == NULL) return 0;
    if (!zslValueLteMax(first->score,range)) return 0;

    return 1;
}

/* Return the first node whose score is inside 'range', or NULL when no node
 * is contained in the range. */
zskiplistNode *zslFirstInRange(zskiplist *zsl, zrangespec *range) {
    zskiplistNode *x, *next;
    int lvl;

    /* Nothing to search when the whole list is out of range. */
    if (!zslIsInRange(zsl,range)) return NULL;

    /* Skip every node that still fails the lower-bound test. */
    x = zsl->header;
    lvl = zsl->level;
    while (lvl-- > 0) {
        for (next = x->level[lvl].forward;
             next != NULL && !zslValueGteMin(next->score,range);
             next = x->level[lvl].forward)
            x = next;
    }

    /* zslIsInRange() succeeded, so a following node must exist. */
    x = x->level[0].forward;
    serverAssert(x != NULL);

    /* The candidate passed the lower bound; it must also pass the upper. */
    if (zslValueLteMax(x->score,range)) return x;
    return NULL;
}

/* Return the last node whose score is inside 'range', or NULL when no node
 * is contained in the range. */
zskiplistNode *zslLastInRange(zskiplist *zsl, zrangespec *range) {
    zskiplistNode *x, *next;
    int lvl;

    /* Nothing to search when the whole list is out of range. */
    if (!zslIsInRange(zsl,range)) return NULL;

    /* Advance for as long as the next node still passes the upper-bound
     * test; x ends on the last such node. */
    x = zsl->header;
    for (lvl = zsl->level-1; lvl >= 0; lvl--) {
        for (;;) {
            next = x->level[lvl].forward;
            if (next == NULL || !zslValueLteMax(next->score,range)) break;
            x = next;
        }
    }

    /* This is an inner range, so this node cannot be NULL. */
    serverAssert(x != NULL);

    /* The candidate passed the upper bound; it must also pass the lower. */
    return zslValueGteMin(x->score,range) ? x : NULL;
}

2.2.5 范围删除

/* Delete every node whose score lies between range->min and range->max and
 * return how many were removed.
 *
 * A node is deleted when it satisfies both bounds (score >= min AND
 * score <= max); a bound becomes strict when range->minex / range->maxex
 * is set. Each deleted element is also removed from 'dict', the hash table
 * view of the sorted set. */
unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, dict *dict) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL];
    zskiplistNode *x, *next;
    unsigned long removed = 0;
    int lvl;

    /* Skip the nodes below the lower bound, recording the predecessor on
     * each level so the deletions can relink through it. */
    x = zsl->header;
    for (lvl = zsl->level-1; lvl >= 0; lvl--) {
        for (;;) {
            int below;

            next = x->level[lvl].forward;
            if (next == NULL) break;
            if (range->minex)
                below = next->score <= range->min;
            else
                below = next->score < range->min;
            if (!below) break;
            x = next;
        }
        update[lvl] = x;
    }

    /* x is the last node below the lower bound; start right after it and
     * delete for as long as the upper bound holds. */
    for (x = x->level[0].forward; x != NULL; x = next) {
        int inside;

        if (range->maxex)
            inside = x->score < range->max;
        else
            inside = x->score <= range->max;
        if (!inside) break;

        /* Save the successor before x is unlinked and freed. */
        next = x->level[0].forward;
        zslDeleteNode(zsl,x,update);
        dictDelete(dict,x->ele);
        zslFreeNode(x); /* Here is where x->ele is actually released. */
        removed++;
    }
    return removed;
}

/* Delete every node whose rank lies between 'start' and 'end' (both
 * inclusive, both 1-based) and return how many were removed. Each deleted
 * element is also removed from 'dict'. */
unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
    zskiplistNode *update[ZSKIPLIST_MAXLEVEL];
    zskiplistNode *x = zsl->header;
    unsigned long traversed = 0, removed = 0;
    int lvl = zsl->level;

    /* Stop just before rank 'start', recording the predecessor on each
     * level so the deletions can relink through it. */
    while (lvl-- > 0) {
        while (x->level[lvl].forward != NULL &&
               traversed + x->level[lvl].span < start) {
            traversed += x->level[lvl].span;
            x = x->level[lvl].forward;
        }
        update[lvl] = x;
    }

    /* 'traversed' now tracks the rank of x. The recorded predecessors stay
     * valid across iterations because only the nodes after them are
     * removed. */
    x = x->level[0].forward;
    for (traversed++; x != NULL && traversed <= end; traversed++) {
        zskiplistNode *next = x->level[0].forward;

        zslDeleteNode(zsl,x,update);
        dictDelete(dict,x->ele);
        zslFreeNode(x);
        removed++;
        x = next;
    }
    return removed;
}

2.2.6 排名

/* Find the rank for an element by both score and key.
 * Returns 0 when the element cannot be found, rank otherwise.
 * Note that the rank is 1-based due to the span of zsl->header to the
 * first element. */
unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) {
    zskiplistNode *x;
    unsigned long rank = 0;
    int i;

    x = zsl->header;
    /* Walk top-down, adding the span of every link that is followed. */
    for (i = zsl->level-1; i >= 0; i--) {
        /* "<= 0" (not "< 0") so the walk lands ON the matching node
         * instead of stopping just before it. */
        while (x->level[i].forward &&
            (x->level[i].forward->score < score ||
                (x->level[i].forward->score == score &&
                sdscmp(x->level[i].forward->ele,ele) <= 0))) {
            rank += x->level[i].span;
            x = x->level[i].forward;
        }

        /* x might be equal to zsl->header, so test if ele is non-NULL.
         * The score is compared as well: when the caller passes a score
         * larger than the one stored for 'ele', the walk above can still
         * stop on that node, and it must not be reported as a match. */
        if (x->ele && x->score == score && sdscmp(x->ele,ele) == 0) {
            return rank;
        }
    }
    return 0;
}

/* Return the node at the given 1-based rank, or NULL when the list has no
 * node at that rank. A rank of 0 yields the header node, because the walk
 * starts there with zero nodes traversed. */
zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
    zskiplistNode *cur = zsl->header;
    unsigned long seen = 0;
    int lvl = zsl->level;

    /* Walk top-down, following a link only when it does not overshoot. */
    while (lvl-- > 0) {
        while (cur->level[lvl].forward != NULL &&
               seen + cur->level[lvl].span <= rank) {
            seen += cur->level[lvl].span;
            cur = cur->level[lvl].forward;
        }
        if (seen == rank) return cur;
    }
    return NULL;
}

3 参考文献

[1] William Pugh, Skip Lists: A Probabilistic Alternative to Balanced Trees
https://15721.courses.cs.cmu....

[2] Redis-6.0.6 source codes
https://github.com/redis/redi...

Redis源码剖析（1）：SkipList

1 前言

1.1 空间开销

1.2 时间复杂度

2 Redis中的实现

2.1 数据结构

2.2 SkipList核心API

2.2.1 初始化

2.2.2 添加

2.2.3 删除

2.2.4 范围判断

2.2.5 范围删除

2.2.6 排名

3 参考文献

Hezijian Xiao

引用和评论

Redis 持久化原理分析和使用建议

分布式数据库解析

嘎嘎好用！推荐三款开源的 Redis 桌面客户端！

自制审批流框架记录

Redis 又双叒叕改开源协议了，微软提前推出高性能替代方案 Garnet

使用 redis 做分布式锁时，遇到客户端挂掉，如何正确的释放锁？

Redis与MySQL数据一致性问题解决方案