golang调度学习-调度流程 (二) newproc

前序

上文讲到rt0_go的runtime·schedinit(SB)

// Excerpt of runtime·rt0_go: queues a new goroutine via runtime·newproc,
// then starts the current M via runtime·mstart.
TEXT runtime·rt0_go<ABIInternal>(SB),NOSPLIT,$0
    // (earlier part omitted; see part 1 of this series, "golang调度学习-调度流程 (一)")
    
    // create a new goroutine to start program
    MOVQ    $runtime·mainPC(SB), AX                        // entry, i.e. $runtime·main
    PUSHQ    AX                                          // second argument to newproc (fn)
    PUSHQ    $0                                            // first argument to newproc (siz, the argument size)
    CALL    runtime·newproc(SB)                         // calls runtime·newproc($0, $runtime·mainPC(SB))
    POPQ    AX                                          // the two POPQs discard the two values pushed above
    POPQ    AX
    
    // start this M
    CALL    runtime·mstart(SB)

    CALL    runtime·abort(SB)    // mstart should never return
    RET

    // Prevent dead-code elimination of debugCallV1, which is
    // intended to be called by debuggers.
    MOVQ    $runtime·debugCallV1<ABIInternal>(SB), AX
    RET

goroutine

新建golang文件main.go

package main

// testGoFun starts a new goroutine that runs testGoFun again. It is a
// minimal go statement whose compiler output can be inspected.
func testGoFun() {
    go testGoFun()
}

执行

go tool compile -N -l -S ./main.go > ./main.s

查看main.s文件

"".testGoFun STEXT size=71 args=0x0 locals=0x18 funcid=0x0
    0x0000 00000 (./test_go.go:7)    TEXT    "".testGoFun(SB), ABIInternal, $24-0
    0x0000 00000 (./test_go.go:7)    MOVQ    (TLS), CX
    0x0009 00009 (./test_go.go:7)    CMPQ    SP, 16(CX)
    0x000d 00013 (./test_go.go:7)    PCDATA    $0, $-2
    0x000d 00013 (./test_go.go:7)    JLS    64
    0x000f 00015 (./test_go.go:7)    PCDATA    $0, $-1
    0x000f 00015 (./test_go.go:7)    SUBQ    $24, SP
    0x0013 00019 (./test_go.go:7)    MOVQ    BP, 16(SP)
    0x0018 00024 (./test_go.go:7)    LEAQ    16(SP), BP
    0x001d 00029 (./test_go.go:7)    FUNCDATA    $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
    0x001d 00029 (./test_go.go:7)    FUNCDATA    $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
    0x001d 00029 (./test_go.go:8)    MOVL    $0, (SP)
    0x0024 00036 (./test_go.go:8)    LEAQ    "".testGoFun·f(SB), AX
    0x002b 00043 (./test_go.go:8)    MOVQ    AX, 8(SP)
    0x0030 00048 (./test_go.go:8)    PCDATA    $1, $0
    0x0030 00048 (./test_go.go:8)    CALL    runtime.newproc(SB)
    0x0035 00053 (./test_go.go:9)    MOVQ    16(SP), BP
    0x003a 00058 (./test_go.go:9)    ADDQ    $24, SP
    0x003e 00062 (./test_go.go:9)    RET
    0x003f 00063 (./test_go.go:9)    NOP
    0x003f 00063 (./test_go.go:7)    PCDATA    $1, $-1
    0x003f 00063 (./test_go.go:7)    PCDATA    $0, $-2
    0x003f 00063 (./test_go.go:7)    NOP
    0x0040 00064 (./test_go.go:7)    CALL    runtime.morestack_noctxt(SB)
    0x0045 00069 (./test_go.go:7)    PCDATA    $0, $-1
    0x0045 00069 (./test_go.go:7)    JMP    0
    0x0000 65 48 8b 0c 25 00 00 00 00 48 3b 61 10 76 31 48  eH..%....H;a.v1H
    0x0010 83 ec 18 48 89 6c 24 10 48 8d 6c 24 10 c7 04 24  ...H.l$.H.l$...$
    0x0020 00 00 00 00 48 8d 05 00 00 00 00 48 89 44 24 08  ....H......H.D$.
    0x0030 e8 00 00 00 00 48 8b 6c 24 10 48 83 c4 18 c3 90  .....H.l$.H.....
    0x0040 e8 00 00 00 00 eb b9                             .......
    rel 5+4 t=17 TLS+0
    rel 39+4 t=16 "".testGoFun·f+0
    rel 49+4 t=8 runtime.newproc+0
    rel 65+4 t=8 runtime.morestack_noctxt+0

可以看到，go 语句同样是通过 CALL runtime.newproc(SB) 来创建新协程的

newproc

创建一个新的g运行带siz字节参数的fn，并且把它放到g.m.p的待运行队列
在编写程序时，使用 go func() {}() 来创建一个goroutine(g)，这条语句会被编译器翻译成对函数 newproc() 的调用。

// Create a new g running fn with siz bytes of arguments.
// Put it on the queue of g's waiting to run.
// The compiler turns a go statement into a call to this.
//
// The stack layout of this call is unusual: it assumes that the
// arguments to pass to fn are on the stack sequentially immediately
// after &fn. Hence, they are logically part of newproc's argument
// frame, even though they don't appear in its signature (and can't
// because their types differ between call sites).
//
// This must be nosplit because this stack layout means there are
// untyped arguments in newproc's argument frame. Stack copies won't
// be able to adjust them and stack splits won't be able to copy them.
//
//go:nosplit
func newproc(siz int32, fn *funcval) {
   argp := add(unsafe.Pointer(&fn), sys.PtrSize)                        // address just past &fn: where fn's first argument lives (a=1 in the example that follows)
   gp := getg()
   pc := getcallerpc()                                                  // PC of the caller (funCaller in the example that follows)
   systemstack(func() {                                                 // run the closure on the g0 (system) stack
      newg := newproc1(fn, argp, siz, gp, pc)                           // build the new g
      _p_ := getg().m.p.ptr()
      // Put newg into _p_'s runnext slot.
      // Per the original note (runqput is not shown here): when the third
      // argument is true, the g goes into runnext and the previous runnext
      // is moved to runq; otherwise the g goes into runq. If runq is full,
      // the g is put on sched.runq instead (which requires the lock).
      // Reference: https://blog.csdn.net/diaosssss/article/details/93066804
      runqput(_p_, newg, true)
      if mainStarted {                                                  // false when called from rt0_go
         wakep()                                                        // covered in part 3 of this series
      }
   })
}

假设是调用

func funCaller() {
    go funcA(a=1, b=2)
}

newproc的调用栈, caller SP表示funCaller的SP

位置	值
(24)caller SP	b=2
(16)caller SP	a=1
(8)caller SP	*funcA
(0)caller SP	siz=16，即 sizeof(a)+sizeof(b)（以 8 字节的 int 为例；siz 只统计参数，不包含 *funcA，上文无参数的例子中 siz=0）
(-8)caller SP	caller PC

newproc1

// Create a new g in state _Grunnable, starting at fn, with narg bytes
// of arguments starting at argp. callerpc is the address of the go
// statement that created this. The caller is responsible for adding
// the new g to the scheduler.
//
// This must run on the system stack because it's the continuation of
// newproc, which cannot split the stack.
//
//go:systemstack
func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerpc uintptr) *g {
    _g_ := getg()

    if fn == nil {
        _g_.m.throwing = -1 // do not dump full stacks
        throw("go of nil func value")
    }
    acquirem()         // disable preemption because it can be holding p in a local var; paired with releasem at the end
    siz := narg
    siz = (siz + 7) &^ 7 // round the argument size up to a multiple of 8

    // We could allocate a larger initial stack if necessary.
    // Not worth it: this is almost always an error.
    // 4*sizeof(uintreg): extra space added below
    // sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
    // The arguments must fit within the initial stack size.
    if siz >= _StackMin-4*sys.RegSize-sys.RegSize {
        throw("newproc: function arguments too large for new goroutine")
    }

    _p_ := _g_.m.p.ptr()
    newg := gfget(_p_)                              // try to reuse a g from the free-g cache
    if newg == nil {                        
        newg = malg(_StackMin)                      // nothing cached: allocate a new g with a _StackMin-sized stack (2 KB per the original note)
        casgstatus(newg, _Gidle, _Gdead)            // move the new g from _Gidle to _Gdead
        // Register the g in allg (per the original note, so the GC does not reclaim it).
        allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
    }
    if newg.stack.hi == 0 {
        throw("newproc1: newg missing stack")
    }

    if readgstatus(newg) != _Gdead {
        throw("newproc1: new g is not Gdead")
    }

    totalSize := 4*sys.RegSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
    totalSize += -totalSize & (sys.SpAlign - 1)                  // align to spAlign
    // Compute the new goroutine's initial SP: the top of its stack minus
    // the space reserved for the arguments.
    sp := newg.stack.hi - totalSize
    spArg := sp
    if usesLR {
        // caller's LR
        *(*uintptr)(unsafe.Pointer(sp)) = 0
        prepGoExitFrame(sp)
        spArg += sys.MinFrameSize
    }
    if narg > 0 {           // there are arguments to pass
        // Copy the arguments onto the new stack.
        memmove(unsafe.Pointer(spArg), argp, uintptr(narg))
        // This is a stack-to-stack copy. If write barriers
        // are enabled and the source stack is grey (the
        // destination is always black), then perform a
        // barrier copy. We do this *after* the memmove
        // because the destination stack may have garbage on
        // it.
        if writeBarrier.needed && !_g_.m.curg.gcscandone {
            f := findfunc(fn.fn)
            stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
            // (article author's note: some GC-related work is omitted here)
            if stkmap.nbit > 0 {
                // We're in the prologue, so it's always stack map index 0.
                bv := stackmapdata(stkmap, 0)
                bulkBarrierBitmap(spArg, spArg, uintptr(bv.n)*sys.PtrSize, 0, bv.bytedata)
            }
        }
    }
    // Initialize newg.sched (the g's saved scheduling context): clear it,
    // then record sp, pc, the g itself and the entry function.
    memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
    newg.sched.sp = sp
    newg.stktopsp = sp                              // SP at the top of the stack; used for traceback per the original note
    // Store goexit's address in sched.pc. Per the original note, the
    // gostartcallfn call below turns it into the address fn returns to,
    // so the goroutine ends up in goexit when fn finishes.
    newg.sched.pc = funcPC(goexit) + sys.PCQuantum  // +PCQuantum so that previous instruction is in same function
    // sched.g points back at the new g.
    newg.sched.g = guintptr(unsafe.Pointer(newg))
    // Per the original note (gostartcallfn is not shown here): pushes the
    // current sched.pc onto the new stack and makes fn the pc to start at.
    gostartcallfn(&newg.sched, fn)
    newg.gopc = callerpc                                        // caller's pc; used for traceback per the original note
    newg.ancestors = saveAncestors(callergp)                    // ancestor goroutines; used for traceback per the original note
    newg.startpc = fn.fn
    if _g_.m.curg != nil {
        newg.labels = _g_.m.curg.labels                         // profiler labels
    }
    // Whether this g must be omitted from stack dumps and the deadlock
    // detector; if so, count it in sched.ngsys.
    if isSystemGoroutine(newg, false) {
        atomic.Xadd(&sched.ngsys, +1)
    }
    casgstatus(newg, _Gdead, _Grunnable)                        // mark the g runnable
    
    // Take a goid from [_p_.goidcache, _p_.goidcacheend). When that range
    // is exhausted, reserve a fresh batch of _GoidCacheBatch ids from
    // sched.goidgen (16 per the original note).
    if _p_.goidcache == _p_.goidcacheend {
        // Sched.goidgen is the last allocated id,
        // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
        // At startup sched.goidgen=0, so main goroutine receives goid=1.
        _p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
        _p_.goidcache -= _GoidCacheBatch - 1
        _p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
    }
    newg.goid = int64(_p_.goidcache)
    _p_.goidcache++
    
    if raceenabled {
        newg.racectx = racegostart(callerpc)
    }
    if trace.enabled {
        traceGoCreate(newg, newg.startpc)
    }
    releasem(_g_.m)
    return newg
}

newproc1的主要工作：

acquirem (m.locks++)
从缓存获取newg,缓存没有就新建一个
如果从缓存取出的newg没有栈（栈已在gfput中被释放），gfget会为它重新分配一个_FixedStack大小的栈
构建newg的栈，先把调用参数拷贝到栈上，接着push goexit到栈上（假装是goexit调用了fn）
初始化newg的sched，traceback, goid等信息
newg状态从_Gdead转变为_Grunnable
releasem (m.locks--)

gfget

从缓存中获取g

// Get from gfree list.
// If local list is empty, grab a batch from global list.
//
// Returns nil when no free g is available. A returned g always has a
// stack: if its stack was released earlier, a new _FixedStack-sized one
// is allocated before returning.
func gfget(_p_ *p) *g {
retry:
    // If the local free list is empty but the global one (with or without
    // stacks) is not, move Gs over until the local list holds 32 or the
    // global lists run dry, then retry.
    if _p_.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) {
        lock(&sched.gFree.lock)
        // Move a batch of free Gs to the P.
        for _p_.gFree.n < 32 {
            // Prefer Gs with stacks.
            gp := sched.gFree.stack.pop()
            if gp == nil {
                gp = sched.gFree.noStack.pop()
                if gp == nil {
                    break
                }
            }
            sched.gFree.n--
            _p_.gFree.push(gp)
            _p_.gFree.n++
        }
        unlock(&sched.gFree.lock)
        goto retry
    }
    
    gp := _p_.gFree.pop()
    if gp == nil {
        return nil
    }
    _p_.gFree.n--
    if gp.stack.lo == 0 {
        // Stack was deallocated in gfput. Allocate a new one.
        systemstack(func() {
            gp.stack = stackalloc(_FixedStack)
        })
        gp.stackguard0 = gp.stack.lo + _StackGuard
    } else {
        // The g kept its stack: notify the race detector / msan about the
        // reused stack range when those tools are enabled.
        if raceenabled {
            racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
        }
        if msanenabled {
            msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
        }
    }
    return gp
}

malg()

malg()函数创建一个新的g，包括为该g申请栈空间（支持程序分配栈的系统）。系统中的每个g都是由该函数创建而来的

// Allocate a new g, with a stack big enough for stacksize bytes.
// A negative stacksize yields a g with no stack allocated.
func malg(stacksize int32) *g {
    newg := new(g)
    if stacksize >= 0 {
        // Add the _StackSystem reserve and pass the sum through round2
        // (presumably rounds up to a power of two; round2 is not shown here).
        stacksize = round2(_StackSystem + stacksize)
        // The stack allocation itself is performed on the system stack.
        systemstack(func() {
            newg.stack = stackalloc(uint32(stacksize))
        })
        newg.stackguard0 = newg.stack.lo + _StackGuard
        newg.stackguard1 = ^uintptr(0) // all bits set
        // Clear the bottom word of the stack. We record g
        // there on gsignal stack during VDSO on ARM and ARM64.
        *(*uintptr)(unsafe.Pointer(newg.stack.lo)) = 0
    }
    return newg
}

gfput

// Put on gfree list.
// If local list is too long, transfer a batch to the global list.
//
// gp must be in state _Gdead; otherwise gfput throws.
func gfput(_p_ *p, gp *g) {
    if readgstatus(gp) != _Gdead {
        throw("gfput: bad status (not Gdead)")
    }

    stksize := gp.stack.hi - gp.stack.lo

    if stksize != _FixedStack {
        // non-standard stack size - free it.
        // The g is then cached without a stack (stack.lo == 0).
        stackfree(gp.stack)
        gp.stack.lo = 0
        gp.stack.hi = 0
        gp.stackguard0 = 0
    }

    _p_.gFree.push(gp)
    _p_.gFree.n++
    // Once the local list reaches 64 entries, move Gs to the global list
    // until fewer than 32 remain locally.
    if _p_.gFree.n >= 64 {
        lock(&sched.gFree.lock)
        for _p_.gFree.n >= 32 {
            _p_.gFree.n--
            gp = _p_.gFree.pop()
            // Sort each g into the global list matching whether it still has a stack.
            if gp.stack.lo == 0 {
                sched.gFree.noStack.push(gp)
            } else {
                sched.gFree.stack.push(gp)
            }
            sched.gFree.n++
        }
        unlock(&sched.gFree.lock)
    }
}

流程如下图:

引用文章

[1] Go语言内幕（6）：启动和内存分配初始化 https://studygolang.com/artic...

golang调度学习-调度流程 (二) newproc

前序

goroutine

newproc

newproc1

gfget

malg()

gfput

引用文章

xxx小M

引用和评论

Go Modules

腾讯 tRPC-Go 教学——（5）filter、context 和日志组件

大模型时代，后端程序员如何避免被AI卷死？

Go slice切片使用教程，一次通关！

腾讯 tRPC-Go 教学——（1）搭建服务

一文弄懂用Go实现MCP服务

gozero限流、熔断、降级如何实现？面试的时候怎么回答？