Ruby 2.x 源代码学习：线程

前言

在 linux/unix 平台上，Ruby 使用 pthread 线程库来实现线程（其它平台有各自对应的 native 实现）

本文涉及到的源代码：

vm.c
thread.c
thread_pthread.c（linux/unix 平台）

线程类 Thread

参考 Ruby 2.x 源代码学习：bootstrap 这篇文章，Ruby 解释器在 bootstrap 阶段会调用一系列 Init_XXX 函数，和线程相关的两个函数是位于 vm.c 文件中的 Init_VM 和位于 thread.c 文件中的 Init_Thread

Init_VM 与 rb_cThread

参考 Ruby 2.x 源代码学习：对象模型，每个对象在 Ruby 虚拟机内部对应一个 RObject 结构体，每个对象都有一个类结构 RClass，每个类也是一个对象，下面这句代码在虚拟机中创建了 Thread 类在虚拟机内部对应的 RClass rb_cThread，它以 rb_cObject 类为父类，名字为 Thread

// vm.c

// VM bootstrap initializer (excerpt; elided parts are shown as "...").
void Init_VM(void) {
    ...
    // Define the Ruby class named "Thread" with rb_cObject as its superclass
    // and keep the resulting class object in rb_cThread.
    rb_cThread = rb_define_class("Thread", rb_cObject);
    ...
}

提示：Ruby 的很多内置类都以 rb_c 为前缀

Init_Thread

介绍完 rb_cThread，现在可以来看看 Init_Thread 函数了

// thread.c

// Registers the methods of the Thread class on rb_cThread and performs the
// main-thread / timer-thread setup (excerpt; elided parts are shown as "...").
void Init_Thread(void) {
    ...
    VALUE cThGroup;
    rb_thread_t *th = GET_THREAD();

    ...
    // Arity -1: the C function is declared as (argc, argv, receiver), like thread_s_new.
    rb_define_singleton_method(rb_cThread, "new", thread_s_new, -1);
    rb_define_singleton_method(rb_cThread, "start", thread_start, -2);
    ...
    // Arity -2: the C function is declared as (receiver, args), like thread_initialize.
    rb_define_method(rb_cThread, "initialize", thread_initialize, -2);
    rb_define_method(rb_cThread, "join", thread_join_m, -1);
    ...

    /* init thread core */
    /* main thread setting */
    /* acquire global vm lock */

    rb_thread_create_timer_thread();

    /* suppress warnings on cygwin, mingw and mswin.*/
    (void)native_mutex_trylock;

    Init_thread_sync();
}

通过调用 rb_define_xxx api 向 rb_cThread 添加 Thread 类支持的方法，这些方法我们在后面选一些分析
底部 /* init thread core */ 注释的代码块涉及到线程调度，下文会详细分析

启动线程

Ruby 启动线程有很多方式，我们先来看看使用 Thread::new 方法的启动流程，根据上文的分析，Thread::new 方法对应的 C 函数为 thread_s_new

// thread.c

// Implements Thread.new: allocates a thread object of class klass, runs its
// initialize method with (argc, argv), and returns the thread object.
// Raises ThreadError if the main thread is already killed or if the thread
// is still uninitialized after initialize returned.
static VALUE thread_s_new(int argc, VALUE *argv, VALUE klass)
{
    rb_thread_t *th;
    // Allocate the thread object.
    VALUE thread = rb_thread_alloc(klass);

    // Refuse to create a thread once the main thread has been killed.
    if (GET_VM()->main_thread->status == THREAD_KILLED)
        rb_raise(rb_eThreadError, "can't alloc thread");

    // Call the object's initialize method (for Thread itself this is the
    // native C function thread_initialize).
    rb_obj_call_init(thread, argc, argv);
    GetThreadPtr(thread, th);
    // Still uninitialized here, presumably because a subclass overrode
    // initialize without calling the parent implementation: raise, naming
    // klass#initialize in the message.
    if (!threadptr_initialized(th)) {
           rb_raise(rb_eThreadError, "uninitialized thread - check `%"PRIsVALUE"#initialize'", klass);
    }
    return thread;
}

Thread 类的 initialize 方法定义在 thread.c 中

// thread.c

/* :nodoc: */
// Implements Thread#initialize: validates the call and hands over to
// thread_create_core with no C start function (fn == 0).
// Raises ThreadError when no block is given or when the thread was already
// initialized. Returns whatever thread_create_core returns.
static VALUE thread_initialize(VALUE thread, VALUE args)
{
    rb_thread_t *th;
    // A block is mandatory; without one an exception is raised.
    if (!rb_block_given_p()) {
        rb_raise(rb_eThreadError, "must be called with a block");
    }
    GetThreadPtr(thread, th);
    // first_args is assigned by thread_create_core, so a non-zero value means
    // this thread has been through initialization before.
    if (th->first_args) {
        VALUE proc = th->first_proc, loc;
        // No proc, or no location available for it: raise the short message.
        if (!proc || !RTEST(loc = rb_proc_location(proc))) {
            rb_raise(rb_eThreadError, "already initialized thread");
        }
        // Otherwise append the first two entries of the proc's location
        // (presumably file and line -- TODO confirm) to the message.
        rb_raise(rb_eThreadError,
         "already initialized thread - %"PRIsVALUE":%"PRIsVALUE,
                 RARRAY_AREF(loc, 0), RARRAY_AREF(loc, 1));
    }
    return thread_create_core(thread, args, 0);
}

我们再来看 thread_create_core 函数，这个函数中会创建 pthread 线程并启动执行

// thread.c

// Records what the new thread should run, creates the native thread and
// registers the thread with its VM (excerpt; elided parts are shown as "...").
// fn is an optional C entry point; when it is 0 the current block is used.
// Returns thval.
static VALUE thread_create_core(VALUE thval, VALUE args, VALUE (*fn)(ANYARGS)) {
    ...
    // These three assignments describe what the thread will execute:
    // either the C function fn, or the block captured as a proc.
    th->first_func = fn;
    th->first_proc = fn ? Qfalse : rb_block_proc();
    th->first_args = args; /* GC: shouldn't put before above line */
    ...

    ...
    // Create the native thread.
    err = native_thread_create(th);
    if (err) {
        // Raise an exception.
        ...
    }
    // Add th to the VM's list of living threads, which thread scheduling uses.
    rb_vm_living_threads_insert(th->vm, th);
    return thval;
}

根据不同的平台，native_thread_create 有不同的实现，对于 linux 操作系统，具体实现在 thread_pthread.c 文件中，经过一些辗转之后又会回到 thread.c 中的 thread_start_func_2 函数：

// Entry point executed on the newly created native thread (excerpt; elided
// parts are shown as "..."). Acquires the GVL, makes th the current thread and
// then runs either the thread's block or its C start function, storing the
// result in th->value. Returns 0.
static int
thread_start_func_2(rb_thread_t *th, VALUE *stack_start, VALUE *register_stack_start)
{
    ...
    // Acquire the global VM lock (GVL) before running anything on this thread.
    gvl_acquire(th->vm, th);
    {
        rb_thread_set_current(th);

        TH_PUSH_TAG(th);
        if ((state = EXEC_TAG()) == 0) {
            if (!th->first_func) {
                // Thread was created from a block (thread_create_core stored it
                // in th->first_proc): invoke that proc with the saved arguments.
                ...
                th->value = rb_vm_invoke_proc(th, proc, (int)RARRAY_LEN(args), RARRAY_CONST_PTR(args), VM_BLOCK_HANDLER_NONE);
                ...
            }
            else {
                // Thread was created with a C entry point: call it directly.
                th->value = (*th->first_func)((void *)args);
            }
        }
        else {
            // EXEC_TAG() returned non-zero: exception handling.
        }
        ...
    }
    ...
    return 0;
}

终于看到传说中的 GVL 了！从网上看 Ruby 多线程一直有争议，将 Ruby 多线程戏称为 "伪多线程"，由于 GVL 的存在，Ruby 多线程并不能真正在多核上并行执行，对 CPU 密集型的应用来说这可能是硬伤，但多线程对 IO 密集型还是有改进的，一个线程在做耗时的 IO 操作时可以通过 gvl_release 让出 GVL 供其它线程使用！
ruby 自带的 C 语言扩展已经考虑到这种情况，但是对于历史遗留的数量庞大的第三方扩展可能就悲剧了

停止线程

线程调度

GVL

Ruby 线程调度和 GVL 密切相关，所以我们先来看看 GVL 相关的数据结构和方法

rb_global_vm_lock_t

thread_pthread.h 文件定义了 rb_global_vm_lock_t 结构体

// thread_pthread.h

// State of the global VM lock (GVL). All fields are read and written by the
// gvl_* functions while `lock` is held.
typedef struct rb_global_vm_lock_struct {
    /* fast path */
    // Non-zero while some thread holds the GVL: set to 1 by gvl_acquire_common,
    // reset to 0 by gvl_release_common.
    unsigned long acquired;
    // Mutex locked around every gvl_* operation; also the mutex paired with
    // the condition variables below.
    rb_nativethread_lock_t lock;

    /* slow path */
    // Number of threads currently blocked in gvl_acquire_common.
    volatile unsigned long waiting;
    // Waited on by threads that want the GVL; signalled by gvl_release_common
    // when waiting > 0.
    rb_nativethread_cond_t cond;

    /* yield */
    // Signalled by the thread that just took the GVL when need_yield was set;
    // the yielding thread waits on it in gvl_yield.
    rb_nativethread_cond_t switch_cond;
    // Broadcast at the end of gvl_yield; threads that found wait_yield set
    // wait on it.
    rb_nativethread_cond_t switch_wait_cond;
    // Set to 1 by a yielding thread, cleared by the next thread to acquire.
    int need_yield;
    // Set to 1 while a thread is waiting inside gvl_yield for the hand-over.
    int wait_yield;
} rb_global_vm_lock_t;

acquired

当前是否有线程获取到全局锁，初始值为 0，每次获取 GVL 时设置为 1，释放时设置为 0

lock

全局锁，用于保护 acquired 等字段的读写以及和各种 condition（条件）配合使用

waiting

当前正在等待获取 GVL（阻塞在 gvl_acquire_common 中）的线程个数，waiting > 0 表明当前有线程正在等待 GVL

cond

条件变量，等待 GVL 的线程在该条件变量上阻塞（见下文 gvl_acquire_common），gvl_release_common 释放 GVL 时通过它唤醒等待的线程

switch_cond

条件变量，用于告知释放全局锁的线程已经有线程获取到全局锁

switch_wait_cond

条件变量，由于存在竞态条件，两个线程有可能同时 yield，这时只有一个线程可以执行 yield，其它线程必须等待，当获取 yield 执行权限的线程执行完操作之后使用该变量通知其它线程

gvl_init

解释器在初始化 Thread 类时会同时初始化 GVL，并获取 GVL。这也容易理解，因为解释器启动时只有一个（主）线程，该线程肯定要获取 GVL

// thread.c

// Part of Init_Thread that deals with the GVL (excerpt; elided parts are
// shown as "...").
void Init_Thread(void) {
    ...
    /* main thread setting */
    {
        // Initialize the VM's GVL, then have th (obtained from GET_THREAD()
        // earlier in this function) acquire it.
        gvl_init(th->vm);
        gvl_acquire(th->vm, th);
        ...
    }
}

gvl_init 具体实现和平台相关，linux/unix 平台下在 thread_pthread.c 文件里可以找到相关代码（下同）

// thread_pthread.c

// Initializes vm->gvl: its mutex, its three condition variables and its
// counters/flags. The lock is left in the "not acquired" state.
static void
gvl_init(rb_vm_t *vm)
{
    native_mutex_initialize(&vm->gvl.lock);
    // All three condition variables are initialized with
    // RB_CONDATTR_CLOCK_MONOTONIC.
    native_cond_initialize(&vm->gvl.cond, RB_CONDATTR_CLOCK_MONOTONIC);
    native_cond_initialize(&vm->gvl.switch_cond, RB_CONDATTR_CLOCK_MONOTONIC);
    native_cond_initialize(&vm->gvl.switch_wait_cond, RB_CONDATTR_CLOCK_MONOTONIC);
    // Nobody holds the GVL, nobody is waiting, no yield is in progress.
    vm->gvl.acquired = 0;
    vm->gvl.waiting = 0;
    vm->gvl.need_yield = 0;
    vm->gvl.wait_yield = 0;
}

gvl_acquire

gvl_acquire 用于获取 GVL，在调用 gvl_acquire_common 进行实际获取动作之前需要先锁定 gvl.lock

// Acquires the GVL of vm, blocking until it is available. The real work is
// done by gvl_acquire_common, which must run with vm->gvl.lock held.
// th is not used in this function body.
static void
gvl_acquire(rb_vm_t *vm, rb_thread_t *th)
{
    native_mutex_lock(&vm->gvl.lock);
    gvl_acquire_common(vm);
    native_mutex_unlock(&vm->gvl.lock);
}

gvl_acquire_common

获取 GVL 的核心代码

// Core of GVL acquisition. Callers (gvl_acquire, gvl_yield) hold vm->gvl.lock
// around this call. On return vm->gvl.acquired is 1.
static void gvl_acquire_common(rb_vm_t *vm)
{
    // Waiting is only needed when the GVL is currently held by someone;
    // otherwise fall through and simply mark it as acquired.
    if (vm->gvl.acquired) {
        // waiting counts the threads currently waiting for the GVL.
        vm->gvl.waiting++;
        if (vm->gvl.waiting == 1) {
            // First waiter: notify the timer thread (timer thread details are
            // not covered here).
            rb_thread_wakeup_timer_thread_low();
        }
        // Classic "while (condition) wait" loop: block until the GVL is
        // released. gvl_release_common signals gvl.cond.
        while (vm->gvl.acquired) {
            native_cond_wait(&vm->gvl.cond, &vm->gvl.lock);
        }
        // This thread is no longer waiting.
        vm->gvl.waiting--;
        // A thread in gvl_yield may be waiting for somebody else to take the
        // GVL; clear its flag and signal switch_cond to tell it that happened.
        if (vm->gvl.need_yield) {
            vm->gvl.need_yield = 0;
            native_cond_signal(&vm->gvl.switch_cond);
        }
    }

    vm->gvl.acquired = 1;
}

gvl_release

gvl_release 相对简单一些：

// thread_pthread.c

// Core of GVL release. Callers (gvl_release, gvl_yield) hold vm->gvl.lock
// around this call. Marks the GVL as free and, if any thread is waiting,
// signals gvl.cond.
static void gvl_release_common(rb_vm_t *vm)
{
    vm->gvl.acquired = 0;
    if (vm->gvl.waiting > 0)
        // Wake up a thread blocked in gvl_acquire_common.
        native_cond_signal(&vm->gvl.cond);
}

// Releases the GVL of vm: runs gvl_release_common with vm->gvl.lock held.
static void gvl_release(rb_vm_t *vm)
{
    native_mutex_lock(&vm->gvl.lock);
    gvl_release_common(vm);
    native_mutex_unlock(&vm->gvl.lock);
}

gvl_destroy

gvl_atfork

gvl_yield

gvl_yield 函数用于释放 GVL并重新 acquire，有点类似操作系统里面进程在内核态重新请求调度器进行进程调度
如果去掉 yield 核心代码，gvl_yield 和 gvl_release 几乎没啥区别，只是在最后重新尝试获取 GVL

// Releases the GVL and then acquires it again, all while holding
// vm->gvl.lock. The yield-specific logic between release and re-acquire is
// elided here; it is shown as a separate fragment in this document.
// th is not used in the part of the body shown here.
static void gvl_yield(rb_vm_t *vm, rb_thread_t *th)
{   
    // Like the other gvl_* functions, the body is wrapped in
    // native_mutex_lock / native_mutex_unlock.
    native_mutex_lock(&vm->gvl.lock);
    gvl_release_common(vm);

    // Yield core (elided).

    // Jump target of the "goto acquire" in the elided yield core.
    acquire:
    // Try to take the GVL again.
    gvl_acquire_common(vm);
    native_mutex_unlock(&vm->gvl.lock);
}

我们来看看 yield 核心代码：

// thread_pthread.c gvl_yield

    // Core of gvl_yield. Runs with vm->gvl.lock held, right after
    // gvl_release_common(vm) and before the "acquire:" label.
    /* An another thread is processing GVL yield. */
    // Another thread is already yielding: wait until it is done, then go
    // straight to re-acquiring the GVL.
    if (UNLIKELY(vm->gvl.wait_yield)) {
        while (vm->gvl.wait_yield)
            native_cond_wait(&vm->gvl.switch_wait_cond, &vm->gvl.lock);
        goto acquire;
    }
    // Some thread is blocked in gvl_acquire_common: wait until it has
    // actually taken the GVL.
    if (vm->gvl.waiting > 0) {
        /* Wait until another thread task take GVL. */
        vm->gvl.need_yield = 1;
        vm->gvl.wait_yield = 1;
        // need_yield is cleared, and switch_cond signalled, by
        // gvl_acquire_common in the thread that takes over.
        while (vm->gvl.need_yield)
            native_cond_wait(&vm->gvl.switch_cond, &vm->gvl.lock);
        vm->gvl.wait_yield = 0;
    } else {
        // Nobody is waiting for the GVL: drop the mutex and call
        // sched_yield() before taking the mutex again.
        native_mutex_unlock(&vm->gvl.lock);
        sched_yield();
        native_mutex_lock(&vm->gvl.lock);
    }

    // Wake every thread that is waiting on switch_wait_cond in the first
    // branch above.
    native_cond_broadcast(&vm->gvl.switch_wait_cond);

Thread 线程调度相关方法

有了上文关于 GVL 的基础知识，我们来看一些 Thread 线程调度的方法，它们基本上就是对 gvl_xxx 函数简单封装

Thread::pass

doc

Give the thread scheduler a hint to pass execution to another thread. A running thread may or may not switch, it depends on OS and processor.

Init_Thread 函数中定义了 Thread::pass 的入口，顺着入口最终找到 rb_thread_schedule_limits 函数

// thread.c

// Part of Init_Thread that registers Thread.pass (excerpt; elided parts are
// shown as "...").
void Init_Thread(void) {
    ...
    // Thread.pass takes no arguments and is implemented by thread_s_pass.
    rb_define_singleton_method(rb_cThread, "pass", thread_s_pass, 0);
    ...
}

// Implements Thread.pass: asks the scheduler to give other threads a chance
// to run by calling rb_thread_schedule(). klass (the receiver) is unused.
// Always returns Qnil.
static VALUE thread_s_pass(VALUE klass) {
    rb_thread_schedule();
    return Qnil;
}

// Requests a thread switch with a running-time limit of 0, then checks
// pending interrupts for the current thread.
static void rb_thread_schedule(void) {
    rb_thread_t *cur_th = GET_THREAD();
    rb_thread_schedule_limits(0);
    RUBY_VM_CHECK_INTS(cur_th);
}

rb_thread_schedule_limits 调用 gvl_yield 释放 GVL 请求调度

// Yields the GVL via gvl_yield when other threads exist and the current
// thread's running_time_us has reached limits_us. rb_thread_schedule calls
// this with limits_us == 0.
static void rb_thread_schedule_limits(unsigned long limits_us)
{
    thread_debug("rb_thread_schedule\n");
    // With only one thread there is nothing to switch to.
    if (!rb_thread_alone()) {
        rb_thread_t *th = GET_THREAD();

        if (th->running_time_us >= limits_us) {
            thread_debug("rb_thread_schedule/switch start\n");
            RB_GC_SAVE_MACHINE_CONTEXT(th);
            // Release and re-acquire the GVL.
            gvl_yield(th->vm, th);
            // Back in possession of the GVL: make th the current thread again.
            rb_thread_set_current(th);
            thread_debug("rb_thread_schedule/switch done\n");
        }
    }
}

Thread::stop

doc

Stops execution of the current thread, putting it into a 'sleep' state, and schedules execution of another thread.

doc 提供了一个有意思的例子：

 1 a = Thread.new { print "a"; Thread.stop; print "c" }
 2 sleep 0.1 while a.status!='sleep'
 3 print "b"
 4 a.run
 5 a.join
 
#=> "abc"

最终输出结果为 abc：

语句 1 new 了一个 Thread 对象，传入一个 block
语句 2 让主线程循环 sleep（每次 0.1 秒），直到线程 a 的状态变为 'sleep'；这期间线程 a 获得执行机会，输出 'a'，随后执行 Thread.stop 进入休眠并释放 GVL
语句 3 主线程从 sleep 中返回输出 'b'
语句 4 重新激活线程 a
语句 5 主线程等待线程 a 执行完毕

我们可能会有一些疑问：

Thread.stop 是如何使线程进入 'sleep' 状态的？
全局函数 sleep 是如何实现的？
Thread::run 是如何重新激活线程 a？
Thread::join 是如何实现的？

rb_thread_stop

Thread::stop 对应的 C 函数为 rb_thread_stop，它首先检查当前线程是否是虚拟机内唯一的线程，如果是则禁止 stop，接着调用 rb_thread_sleep_deadly

// Implements Thread.stop: puts the current thread to sleep via
// rb_thread_sleep_deadly. Raises ThreadError if the current thread is the
// only thread in the VM. Returns Qnil once the sleep ends.
VALUE rb_thread_stop(void) {
    // Stopping is not allowed when this is the only thread in the Ruby VM.
    if (rb_thread_alone()) {
        rb_raise(rb_eThreadError,
             "stopping only thread\n\tnote: use sleep to stop forever");
    }
    rb_thread_sleep_deadly();
    return Qnil;
}


// Puts the current thread to sleep through sleep_forever with both
// deadlockable and spurious_check enabled.
void rb_thread_sleep_deadly(void)
{
    thread_debug("rb_thread_sleep_deadly\n");
    sleep_forever(GET_THREAD(), 1, 1);
}

sleep_forever 函数的定义如下，从函数命名来看该函数实现"永久休眠"

// Sleeps th without a timeout until its status is changed by someone else.
// deadlockable:   selects THREAD_STOPPED_FOREVER instead of THREAD_STOPPED and
//                 enables the sleeper accounting / deadlock check per iteration.
// spurious_check: when zero, the loop runs at most once; when non-zero, the
//                 thread goes back to sleep as long as its status is unchanged.
// The previous status is restored before returning.
static void sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check)
{
    enum rb_thread_status prev_status = th->status;
    enum rb_thread_status status = deadlockable ? THREAD_STOPPED_FOREVER : THREAD_STOPPED;

    th->status = status;
    RUBY_VM_CHECK_INTS_BLOCKING(th);
    // The loop is only left once the thread's status has changed.
    while (th->status == status) {
        if (deadlockable) {
            // Count this thread as a sleeper and let the VM check for deadlock.
            th->vm->sleeper++;
            rb_check_deadlock(th->vm);
        }
        // Platform-specific sleep. The timeout argument is 0 (a null
        // struct timeval pointer), i.e. sleep with no timeout.
        native_sleep(th, 0);

        if (deadlockable) {
            th->vm->sleeper--;
        }
        RUBY_VM_CHECK_INTS_BLOCKING(th);
        if (!spurious_check)
            break;
    }
    th->status = prev_status;
}

我们来看看 thread_pthread.c 中 native_sleep 是如何实现的：

// thread_pthread.c

// pthread implementation of the thread sleep primitive (excerpt; elided parts
// are shown as "..."). timeout_tv is the sleep timeout; sleep_forever passes 0.
// The sleeping itself happens between GVL_UNLOCK_BEGIN() and GVL_UNLOCK_END().
static void
native_sleep(rb_thread_t *th, struct timeval *timeout_tv)
{
    struct timespec timeout;
    // Per-thread interrupt lock and sleep condition variable.
    rb_nativethread_lock_t *lock = &th->interrupt_lock;
    rb_nativethread_cond_t *cond = &th->native_thread_data.sleep_cond;

    // Compute the sleep timeout.
    ...

    GVL_UNLOCK_BEGIN();
    {
        // Core code (elided).
    }
    GVL_UNLOCK_END();

    thread_debug("native_sleep done\n");
}