Add Goroutine stack inspector to admin/monitor (#19207)

Continues on from #19202.

Following the addition of pprof labels we can now more easily understand the relationship between a goroutine and the requests that spawn them. 

This PR takes advantage of the labels and adds a few others, then provides a mechanism for the monitoring page to query the pprof goroutine profile.

The binary profile that results from this profile is immediately piped in to the google library for parsing this and then stack traces are formed for the goroutines.

If the goroutine is within a context or has been created from a goroutine within a process context it will acquire the process description labels for that process. 

The goroutines are mapped with there associate pids and any that do not have an associated pid are placed in a group at the bottom as unbound.

In this way we should be able to more easily examine goroutines that have been stuck.

A manager command `gitea manager processes` is also provided that can export the processes (with or without stacktraces) to the command line.

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
zeripath 2022-03-31 18:01:43 +01:00 committed by GitHub
parent 9c349a4277
commit c88547ce71
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
48 changed files with 1479 additions and 595 deletions

View file

@ -6,13 +6,8 @@
package process
import (
"bytes"
"context"
"fmt"
"io"
"os/exec"
"runtime/pprof"
"sort"
"strconv"
"sync"
"time"
@ -30,6 +25,18 @@ var (
DefaultContext = context.Background()
)
// DescriptionPProfLabel is a label set on goroutines that have a process attached
const DescriptionPProfLabel = "process-description"
// PIDPProfLabel is a label set on goroutines that have a process attached
const PIDPProfLabel = "pid"
// PPIDPProfLabel is a label set on goroutines that have a process attached
const PPIDPProfLabel = "ppid"
// ProcessTypePProfLabel is a label set on goroutines that have a process attached
const ProcessTypePProfLabel = "process-type"
// IDType is a pid type
type IDType string
@ -44,15 +51,15 @@ type Manager struct {
next int64
lastTime int64
processes map[IDType]*Process
processMap map[IDType]*process
}
// GetManager returns a Manager and initializes one as singleton if there's none yet
func GetManager() *Manager {
managerInit.Do(func() {
manager = &Manager{
processes: make(map[IDType]*Process),
next: 1,
processMap: make(map[IDType]*process),
next: 1,
}
})
return manager
@ -69,12 +76,25 @@ func GetManager() *Manager {
func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
ctx, cancel = context.WithCancel(parent)
ctx, pid, finished := pm.Add(ctx, description, cancel)
ctx, _, finished = pm.Add(ctx, description, cancel, NormalProcessType, true)
return &Context{
Context: ctx,
pid: pid,
}, cancel, finished
return ctx, cancel, finished
}
// AddTypedContext creates a new context and adds it as a process. Once the process is finished, finished must be called
// to remove the process from the process table. It should not be called until the process is finished but must always be called.
//
// cancel should be used to cancel the returned context, however it will not remove the process from the process table.
// finished will cancel the returned context and remove it from the process table.
//
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
// process table.
func (pm *Manager) AddTypedContext(parent context.Context, description, processType string, currentlyRunning bool) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
ctx, cancel = context.WithCancel(parent)
ctx, _, finished = pm.Add(ctx, description, cancel, processType, currentlyRunning)
return ctx, cancel, finished
}
// AddContextTimeout creates a new context and add it as a process. Once the process is finished, finished must be called
@ -90,52 +110,61 @@ func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Durati
// it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct
panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately")
}
ctx, cancel = context.WithTimeout(parent, timeout)
ctx, pid, finshed := pm.Add(ctx, description, cancel)
ctx, _, finshed = pm.Add(ctx, description, cancel, NormalProcessType, true)
return &Context{
Context: ctx,
pid: pid,
}, cancel, finshed
return ctx, cancel, finshed
}
// Add create a new process
func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc) (context.Context, IDType, FinishedFunc) {
func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc, processType string, currentlyRunning bool) (context.Context, IDType, FinishedFunc) {
parentPID := GetParentPID(ctx)
pm.mutex.Lock()
start, pid := pm.nextPID()
parent := pm.processes[parentPID]
parent := pm.processMap[parentPID]
if parent == nil {
parentPID = ""
}
process := &Process{
process := &process{
PID: pid,
ParentPID: parentPID,
Description: description,
Start: start,
Cancel: cancel,
Type: processType,
}
finished := func() {
cancel()
pm.remove(process)
pprof.SetGoroutineLabels(ctx)
var finished FinishedFunc
if currentlyRunning {
finished = func() {
cancel()
pm.remove(process)
pprof.SetGoroutineLabels(ctx)
}
} else {
finished = func() {
cancel()
pm.remove(process)
}
}
if parent != nil {
parent.AddChild(process)
}
pm.processes[pid] = process
pm.processMap[pid] = process
pm.mutex.Unlock()
pprofCtx := pprof.WithLabels(ctx, pprof.Labels("process-description", description, "ppid", string(parentPID), "pid", string(pid)))
pprof.SetGoroutineLabels(pprofCtx)
pprofCtx := pprof.WithLabels(ctx, pprof.Labels(DescriptionPProfLabel, description, PPIDPProfLabel, string(parentPID), PIDPProfLabel, string(pid), ProcessTypePProfLabel, processType))
if currentlyRunning {
pprof.SetGoroutineLabels(pprofCtx)
}
return pprofCtx, pid, finished
return &Context{
Context: pprofCtx,
pid: pid,
}, pid, finished
}
// nextPID will return the next available PID. pm.mutex should already be locked.
@ -160,142 +189,24 @@ func (pm *Manager) nextPID() (start time.Time, pid IDType) {
// Remove a process from the ProcessManager.
func (pm *Manager) Remove(pid IDType) {
pm.mutex.Lock()
delete(pm.processes, pid)
delete(pm.processMap, pid)
pm.mutex.Unlock()
}
func (pm *Manager) remove(process *Process) {
func (pm *Manager) remove(process *process) {
pm.mutex.Lock()
if p := pm.processes[process.PID]; p == process {
delete(pm.processes, process.PID)
defer pm.mutex.Unlock()
if p := pm.processMap[process.PID]; p == process {
delete(pm.processMap, process.PID)
}
parent := pm.processes[process.ParentPID]
pm.mutex.Unlock()
if parent == nil {
return
}
parent.RemoveChild(process)
}
// Cancel a process in the ProcessManager.
func (pm *Manager) Cancel(pid IDType) {
pm.mutex.Lock()
process, ok := pm.processes[pid]
process, ok := pm.processMap[pid]
pm.mutex.Unlock()
if ok {
if ok && process.Type != SystemProcessType {
process.Cancel()
}
}
// Processes gets the processes in a thread safe manner
func (pm *Manager) Processes(onlyRoots bool) []*Process {
pm.mutex.Lock()
processes := make([]*Process, 0, len(pm.processes))
if onlyRoots {
for _, process := range pm.processes {
if _, has := pm.processes[process.ParentPID]; !has {
processes = append(processes, process)
}
}
} else {
for _, process := range pm.processes {
processes = append(processes, process)
}
}
pm.mutex.Unlock()
sort.Slice(processes, func(i, j int) bool {
left, right := processes[i], processes[j]
return left.Start.Before(right.Start)
})
return processes
}
// Exec a command and use the default timeout.
func (pm *Manager) Exec(desc, cmdName string, args ...string) (string, string, error) {
return pm.ExecDir(DefaultContext, -1, "", desc, cmdName, args...)
}
// ExecTimeout a command and use a specific timeout duration.
func (pm *Manager) ExecTimeout(timeout time.Duration, desc, cmdName string, args ...string) (string, string, error) {
return pm.ExecDir(DefaultContext, timeout, "", desc, cmdName, args...)
}
// ExecDir a command and use the default timeout.
func (pm *Manager) ExecDir(ctx context.Context, timeout time.Duration, dir, desc, cmdName string, args ...string) (string, string, error) {
return pm.ExecDirEnv(ctx, timeout, dir, desc, nil, cmdName, args...)
}
// ExecDirEnv runs a command in given path and environment variables, and waits for its completion
// up to the given timeout (or DefaultTimeout if -1 is given).
// Returns its complete stdout and stderr
// outputs and an error, if any (including timeout)
func (pm *Manager) ExecDirEnv(ctx context.Context, timeout time.Duration, dir, desc string, env []string, cmdName string, args ...string) (string, string, error) {
return pm.ExecDirEnvStdIn(ctx, timeout, dir, desc, env, nil, cmdName, args...)
}
// ExecDirEnvStdIn runs a command in given path and environment variables with provided stdIN, and waits for its completion
// up to the given timeout (or DefaultTimeout if -1 is given).
// Returns its complete stdout and stderr
// outputs and an error, if any (including timeout)
func (pm *Manager) ExecDirEnvStdIn(ctx context.Context, timeout time.Duration, dir, desc string, env []string, stdIn io.Reader, cmdName string, args ...string) (string, string, error) {
if timeout <= 0 {
timeout = 60 * time.Second
}
stdOut := new(bytes.Buffer)
stdErr := new(bytes.Buffer)
ctx, _, finished := pm.AddContextTimeout(ctx, timeout, desc)
defer finished()
cmd := exec.CommandContext(ctx, cmdName, args...)
cmd.Dir = dir
cmd.Env = env
cmd.Stdout = stdOut
cmd.Stderr = stdErr
if stdIn != nil {
cmd.Stdin = stdIn
}
if err := cmd.Start(); err != nil {
return "", "", err
}
err := cmd.Wait()
if err != nil {
err = &Error{
PID: GetPID(ctx),
Description: desc,
Err: err,
CtxErr: ctx.Err(),
Stdout: stdOut.String(),
Stderr: stdErr.String(),
}
}
return stdOut.String(), stdErr.String(), err
}
// Error is a wrapped error describing the error results of Process Execution
type Error struct {
PID IDType
Description string
Err error
CtxErr error
Stdout string
Stderr string
}
func (err *Error) Error() string {
return fmt.Sprintf("exec(%s:%s) failed: %v(%v) stdout: %s stderr: %s", err.PID, err.Description, err.Err, err.CtxErr, err.Stdout, err.Stderr)
}
// Unwrap implements the unwrappable implicit interface for go1.13 Unwrap()
func (err *Error) Unwrap() error {
return err.Err
}