Add Goroutine stack inspector to admin/monitor (#19207)
Continues on from #19202. Following the addition of pprof labels we can now more easily understand the relationship between a goroutine and the request that spawned it. This PR takes advantage of the labels and adds a few others, then provides a mechanism for the monitoring page to query the pprof goroutine profile. The binary profile that results is immediately piped into the Google library for parsing it, and stack traces are then formed for the goroutines. If a goroutine is within a process context, or has been created from a goroutine within a process context, it will acquire the process description labels for that process. The goroutines are mapped to their associated pids, and any that do not have an associated pid are placed in a group at the bottom as unbound. In this way we should be able to more easily examine goroutines that have been stuck. A manager command `gitea manager processes` is also provided that can export the processes (with or without stacktraces) to the command line. Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
parent
9c349a4277
commit
c88547ce71
48 changed files with 1479 additions and 595 deletions
|
@ -6,13 +6,8 @@
|
|||
package process
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
"runtime/pprof"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -30,6 +25,18 @@ var (
|
|||
DefaultContext = context.Background()
|
||||
)
|
||||
|
||||
// DescriptionPProfLabel is the pprof label key under which a process's description is set on goroutines that have a process attached
|
||||
const DescriptionPProfLabel = "process-description"
|
||||
|
||||
// PIDPProfLabel is the pprof label key under which a process's PID is set on goroutines that have a process attached
|
||||
const PIDPProfLabel = "pid"
|
||||
|
||||
// PPIDPProfLabel is the pprof label key under which the parent process's PID is set on goroutines that have a process attached
|
||||
const PPIDPProfLabel = "ppid"
|
||||
|
||||
// ProcessTypePProfLabel is the pprof label key under which a process's type is set on goroutines that have a process attached
|
||||
const ProcessTypePProfLabel = "process-type"
|
||||
|
||||
// IDType is a pid type
|
||||
type IDType string
|
||||
|
||||
|
@ -44,15 +51,15 @@ type Manager struct {
|
|||
next int64
|
||||
lastTime int64
|
||||
|
||||
processes map[IDType]*Process
|
||||
processMap map[IDType]*process
|
||||
}
|
||||
|
||||
// GetManager returns a Manager and initializes one as singleton if there's none yet
|
||||
func GetManager() *Manager {
|
||||
managerInit.Do(func() {
|
||||
manager = &Manager{
|
||||
processes: make(map[IDType]*Process),
|
||||
next: 1,
|
||||
processMap: make(map[IDType]*process),
|
||||
next: 1,
|
||||
}
|
||||
})
|
||||
return manager
|
||||
|
@ -69,12 +76,25 @@ func GetManager() *Manager {
|
|||
func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
|
||||
ctx, cancel = context.WithCancel(parent)
|
||||
|
||||
ctx, pid, finished := pm.Add(ctx, description, cancel)
|
||||
ctx, _, finished = pm.Add(ctx, description, cancel, NormalProcessType, true)
|
||||
|
||||
return &Context{
|
||||
Context: ctx,
|
||||
pid: pid,
|
||||
}, cancel, finished
|
||||
return ctx, cancel, finished
|
||||
}
|
||||
|
||||
// AddTypedContext creates a new context and adds it as a process. Once the process is finished, finished must be called
|
||||
// to remove the process from the process table. It should not be called until the process is finished but must always be called.
|
||||
//
|
||||
// cancel should be used to cancel the returned context, however it will not remove the process from the process table.
|
||||
// finished will cancel the returned context and remove it from the process table.
|
||||
//
|
||||
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
|
||||
// process table.
|
||||
func (pm *Manager) AddTypedContext(parent context.Context, description, processType string, currentlyRunning bool) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
|
||||
ctx, cancel = context.WithCancel(parent)
|
||||
|
||||
ctx, _, finished = pm.Add(ctx, description, cancel, processType, currentlyRunning)
|
||||
|
||||
return ctx, cancel, finished
|
||||
}
|
||||
|
||||
// AddContextTimeout creates a new context and adds it as a process. Once the process is finished, finished must be called
|
||||
|
@ -90,52 +110,61 @@ func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Durati
|
|||
// it's meaningless to use timeout <= 0, and it must be a bug! so we must panic here to tell developers to make the timeout correct
|
||||
panic("the timeout must be greater than zero, otherwise the context will be cancelled immediately")
|
||||
}
|
||||
|
||||
ctx, cancel = context.WithTimeout(parent, timeout)
|
||||
|
||||
ctx, pid, finshed := pm.Add(ctx, description, cancel)
|
||||
ctx, _, finshed = pm.Add(ctx, description, cancel, NormalProcessType, true)
|
||||
|
||||
return &Context{
|
||||
Context: ctx,
|
||||
pid: pid,
|
||||
}, cancel, finshed
|
||||
return ctx, cancel, finshed
|
||||
}
|
||||
|
||||
// Add creates a new process
|
||||
func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc) (context.Context, IDType, FinishedFunc) {
|
||||
func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc, processType string, currentlyRunning bool) (context.Context, IDType, FinishedFunc) {
|
||||
parentPID := GetParentPID(ctx)
|
||||
|
||||
pm.mutex.Lock()
|
||||
start, pid := pm.nextPID()
|
||||
|
||||
parent := pm.processes[parentPID]
|
||||
parent := pm.processMap[parentPID]
|
||||
if parent == nil {
|
||||
parentPID = ""
|
||||
}
|
||||
|
||||
process := &Process{
|
||||
process := &process{
|
||||
PID: pid,
|
||||
ParentPID: parentPID,
|
||||
Description: description,
|
||||
Start: start,
|
||||
Cancel: cancel,
|
||||
Type: processType,
|
||||
}
|
||||
|
||||
finished := func() {
|
||||
cancel()
|
||||
pm.remove(process)
|
||||
pprof.SetGoroutineLabels(ctx)
|
||||
var finished FinishedFunc
|
||||
if currentlyRunning {
|
||||
finished = func() {
|
||||
cancel()
|
||||
pm.remove(process)
|
||||
pprof.SetGoroutineLabels(ctx)
|
||||
}
|
||||
} else {
|
||||
finished = func() {
|
||||
cancel()
|
||||
pm.remove(process)
|
||||
}
|
||||
}
|
||||
|
||||
if parent != nil {
|
||||
parent.AddChild(process)
|
||||
}
|
||||
pm.processes[pid] = process
|
||||
pm.processMap[pid] = process
|
||||
pm.mutex.Unlock()
|
||||
|
||||
pprofCtx := pprof.WithLabels(ctx, pprof.Labels("process-description", description, "ppid", string(parentPID), "pid", string(pid)))
|
||||
pprof.SetGoroutineLabels(pprofCtx)
|
||||
pprofCtx := pprof.WithLabels(ctx, pprof.Labels(DescriptionPProfLabel, description, PPIDPProfLabel, string(parentPID), PIDPProfLabel, string(pid), ProcessTypePProfLabel, processType))
|
||||
if currentlyRunning {
|
||||
pprof.SetGoroutineLabels(pprofCtx)
|
||||
}
|
||||
|
||||
return pprofCtx, pid, finished
|
||||
return &Context{
|
||||
Context: pprofCtx,
|
||||
pid: pid,
|
||||
}, pid, finished
|
||||
}
|
||||
|
||||
// nextPID will return the next available PID. pm.mutex should already be locked.
|
||||
|
@ -160,142 +189,24 @@ func (pm *Manager) nextPID() (start time.Time, pid IDType) {
|
|||
// Remove a process from the ProcessManager.
|
||||
func (pm *Manager) Remove(pid IDType) {
|
||||
pm.mutex.Lock()
|
||||
delete(pm.processes, pid)
|
||||
delete(pm.processMap, pid)
|
||||
pm.mutex.Unlock()
|
||||
}
|
||||
|
||||
func (pm *Manager) remove(process *Process) {
|
||||
func (pm *Manager) remove(process *process) {
|
||||
pm.mutex.Lock()
|
||||
if p := pm.processes[process.PID]; p == process {
|
||||
delete(pm.processes, process.PID)
|
||||
defer pm.mutex.Unlock()
|
||||
if p := pm.processMap[process.PID]; p == process {
|
||||
delete(pm.processMap, process.PID)
|
||||
}
|
||||
parent := pm.processes[process.ParentPID]
|
||||
pm.mutex.Unlock()
|
||||
|
||||
if parent == nil {
|
||||
return
|
||||
}
|
||||
|
||||
parent.RemoveChild(process)
|
||||
}
|
||||
|
||||
// Cancel a process in the ProcessManager.
|
||||
func (pm *Manager) Cancel(pid IDType) {
|
||||
pm.mutex.Lock()
|
||||
process, ok := pm.processes[pid]
|
||||
process, ok := pm.processMap[pid]
|
||||
pm.mutex.Unlock()
|
||||
if ok {
|
||||
if ok && process.Type != SystemProcessType {
|
||||
process.Cancel()
|
||||
}
|
||||
}
|
||||
|
||||
// Processes gets the processes in a thread safe manner
|
||||
func (pm *Manager) Processes(onlyRoots bool) []*Process {
|
||||
pm.mutex.Lock()
|
||||
processes := make([]*Process, 0, len(pm.processes))
|
||||
if onlyRoots {
|
||||
for _, process := range pm.processes {
|
||||
if _, has := pm.processes[process.ParentPID]; !has {
|
||||
processes = append(processes, process)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for _, process := range pm.processes {
|
||||
processes = append(processes, process)
|
||||
}
|
||||
}
|
||||
pm.mutex.Unlock()
|
||||
|
||||
sort.Slice(processes, func(i, j int) bool {
|
||||
left, right := processes[i], processes[j]
|
||||
|
||||
return left.Start.Before(right.Start)
|
||||
})
|
||||
|
||||
return processes
|
||||
}
|
||||
|
||||
// Exec a command and use the default timeout.
|
||||
func (pm *Manager) Exec(desc, cmdName string, args ...string) (string, string, error) {
|
||||
return pm.ExecDir(DefaultContext, -1, "", desc, cmdName, args...)
|
||||
}
|
||||
|
||||
// ExecTimeout a command and use a specific timeout duration.
|
||||
func (pm *Manager) ExecTimeout(timeout time.Duration, desc, cmdName string, args ...string) (string, string, error) {
|
||||
return pm.ExecDir(DefaultContext, timeout, "", desc, cmdName, args...)
|
||||
}
|
||||
|
||||
// ExecDir runs a command in the given directory, waiting up to the given timeout (a default is used when timeout <= 0).
|
||||
func (pm *Manager) ExecDir(ctx context.Context, timeout time.Duration, dir, desc, cmdName string, args ...string) (string, string, error) {
|
||||
return pm.ExecDirEnv(ctx, timeout, dir, desc, nil, cmdName, args...)
|
||||
}
|
||||
|
||||
// ExecDirEnv runs a command in given path and environment variables, and waits for its completion
|
||||
// up to the given timeout (or DefaultTimeout if -1 is given).
|
||||
// Returns its complete stdout and stderr
|
||||
// outputs and an error, if any (including timeout)
|
||||
func (pm *Manager) ExecDirEnv(ctx context.Context, timeout time.Duration, dir, desc string, env []string, cmdName string, args ...string) (string, string, error) {
|
||||
return pm.ExecDirEnvStdIn(ctx, timeout, dir, desc, env, nil, cmdName, args...)
|
||||
}
|
||||
|
||||
// ExecDirEnvStdIn runs a command in given path and environment variables with provided stdIN, and waits for its completion
|
||||
// up to the given timeout (or DefaultTimeout if -1 is given).
|
||||
// Returns its complete stdout and stderr
|
||||
// outputs and an error, if any (including timeout)
|
||||
func (pm *Manager) ExecDirEnvStdIn(ctx context.Context, timeout time.Duration, dir, desc string, env []string, stdIn io.Reader, cmdName string, args ...string) (string, string, error) {
|
||||
if timeout <= 0 {
|
||||
timeout = 60 * time.Second
|
||||
}
|
||||
|
||||
stdOut := new(bytes.Buffer)
|
||||
stdErr := new(bytes.Buffer)
|
||||
|
||||
ctx, _, finished := pm.AddContextTimeout(ctx, timeout, desc)
|
||||
defer finished()
|
||||
|
||||
cmd := exec.CommandContext(ctx, cmdName, args...)
|
||||
cmd.Dir = dir
|
||||
cmd.Env = env
|
||||
cmd.Stdout = stdOut
|
||||
cmd.Stderr = stdErr
|
||||
if stdIn != nil {
|
||||
cmd.Stdin = stdIn
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
|
||||
err := cmd.Wait()
|
||||
if err != nil {
|
||||
err = &Error{
|
||||
PID: GetPID(ctx),
|
||||
Description: desc,
|
||||
Err: err,
|
||||
CtxErr: ctx.Err(),
|
||||
Stdout: stdOut.String(),
|
||||
Stderr: stdErr.String(),
|
||||
}
|
||||
}
|
||||
|
||||
return stdOut.String(), stdErr.String(), err
|
||||
}
|
||||
|
||||
// Error is a wrapped error describing the error results of Process Execution
|
||||
type Error struct {
|
||||
PID IDType
|
||||
Description string
|
||||
Err error
|
||||
CtxErr error
|
||||
Stdout string
|
||||
Stderr string
|
||||
}
|
||||
|
||||
func (err *Error) Error() string {
|
||||
return fmt.Sprintf("exec(%s:%s) failed: %v(%v) stdout: %s stderr: %s", err.PID, err.Description, err.Err, err.CtxErr, err.Stdout, err.Stderr)
|
||||
}
|
||||
|
||||
// Unwrap implements the unwrappable implicit interface for go1.13 Unwrap()
|
||||
func (err *Error) Unwrap() error {
|
||||
return err.Err
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue