Commit 444e70bc authored by pfandzelter

Merge branch 'tp/profiling' into 'main'

add profiling support

See merge request !138
parents d7460e92 ff8d092d
Pipeline #15823 passed with stages in 14 minutes and 41 seconds
......@@ -10,6 +10,10 @@ test.db/
# etcd local database
.default.etcd/
# profiling files
*.pprof
*.pdf
### tubCloud ###
*_conflic*
......
......@@ -5,6 +5,7 @@
<excludeFolder url="file://$MODULE_DIR$/coverage" />
<excludeFolder url="file://$MODULE_DIR$/pkg/badgerdb/test.db" />
<excludeFolder url="file://$MODULE_DIR$/.default.etcd" />
<excludeFolder url="file://$MODULE_DIR$/pkg/fred/.default.etcd" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
......
......@@ -15,8 +15,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/frednode cmd/frednode
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install ./cmd/frednode/
......
......@@ -563,3 +563,22 @@ It starts the same deployment as in the 3 node test and runs a number of queries
It uses the Docker API to destroy and start the corresponding containers.
The code can be found in `./tests/FailingNodeTest`, but the test is started with `make failtest` in `./tests/3NodeTest/` after a deployment has been created with `make fred`.
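A minimal sketch of that sequence, assuming both commands are run from `./tests/3NodeTest/`:

```sh
# bring up the 3 node deployment first
$ make fred

# then run the failing node test against it
$ make failtest
```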
#### Profiling
FReD supports CPU and memory profiling for the main `frednode` binary.
Use the `--cpuprofile` and `--memprofile` flags in addition to your other flags to enable profiling.
Keep in mind that this may have an impact on performance in some cases.
```sh
# start fred with profiling
$ ./frednode --cpuprofile fredcpu.pprof --memprofile fredmem.pprof [ALL_YOUR_OTHER_FLAGS...]
# run tests, benchmarks, your application, etc.
# then quit fred with SIGINT or SIGTERM and your files will be written
# open pprof files and convert to pdf (note that you need graphviz installed)
# you also need to provide the path to your frednode binary
$ go tool pprof --pdf ./frednode fredcpu.pprof > cpu.pdf
$ go tool pprof --pdf ./frednode fredmem.pprof > mem.pdf
```
\ No newline at end of file
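The same paths can also be configured through environment variables instead of flags; based on the configuration added in this merge request, the corresponding variables are `PROFILING_CPU_PATH` and `PROFILING_MEM_PATH` (a minimal sketch, not a verified invocation):

```sh
# equivalent configuration via environment variables
$ PROFILING_CPU_PATH=fredcpu.pprof PROFILING_MEM_PATH=fredmem.pprof ./frednode [ALL_YOUR_OTHER_FLAGS...]
```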
......@@ -15,8 +15,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/alexandra cmd/alexandra
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install ./cmd/alexandra/
......
......@@ -4,22 +4,24 @@ import (
"flag"
"os"
"os/signal"
"runtime"
"runtime/pprof"
"syscall"
"git.tu-berlin.de/mcc-fred/fred/pkg/api"
"git.tu-berlin.de/mcc-fred/fred/pkg/dynamo"
"git.tu-berlin.de/mcc-fred/fred/pkg/etcdnase"
"git.tu-berlin.de/mcc-fred/fred/pkg/nasecache"
"git.tu-berlin.de/mcc-fred/fred/pkg/peering"
"git.tu-berlin.de/mcc-fred/fred/pkg/storageclient"
"github.com/caarlos0/env/v6"
"github.com/go-errors/errors"
"github.com/mmcloughlin/geohash"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"git.tu-berlin.de/mcc-fred/fred/pkg/api"
"git.tu-berlin.de/mcc-fred/fred/pkg/badgerdb"
"git.tu-berlin.de/mcc-fred/fred/pkg/dynamo"
"git.tu-berlin.de/mcc-fred/fred/pkg/etcdnase"
"git.tu-berlin.de/mcc-fred/fred/pkg/fred"
"git.tu-berlin.de/mcc-fred/fred/pkg/nasecache"
"git.tu-berlin.de/mcc-fred/fred/pkg/peering"
"git.tu-berlin.de/mcc-fred/fred/pkg/storageclient"
)
type fredConfig struct {
......@@ -73,6 +75,10 @@ type fredConfig struct {
Cert string `env:"TRIGGER_CERT"`
Key string `env:"TRIGGER_KEY"`
}
Profiling struct {
CPUProfPath string `env:"PROFILING_CPU_PATH"`
MemProfPath string `env:"PROFILING_MEM_PATH"`
}
}
func parseArgs() (fc fredConfig) {
......@@ -126,6 +132,10 @@ func parseArgs() (fc fredConfig) {
// trigger node tls configuration
flag.StringVar(&(fc.Trigger.Cert), "trigger-cert", "", "Certificate for trigger node connection. (Env: TRIGGER_CERT)")
flag.StringVar(&(fc.Trigger.Key), "trigger-key", "", "Key file for trigger node connection. (Env: TRIGGER_KEY)")
flag.StringVar(&(fc.Profiling.CPUProfPath), "cpuprofile", "", "Enable CPU profiling and specify path for pprof output")
flag.StringVar(&(fc.Profiling.MemProfPath), "memprofile", "", "Enable memory profiling and specify path for pprof output")
flag.Parse()
// override with ENV variables
......@@ -185,6 +195,36 @@ func main() {
log.Fatal().Msg("Log ExtHandler has to be either dev or prod")
}
// https://github.com/influxdata/influxdb/blob/master/cmd/influxd/internal/profile/profile.go
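// prof holds the open output files for the CPU and heap profiles so they can be flushed and closed on shutdown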
var prof struct {
cpu *os.File
mem *os.File
}
if fc.Profiling.CPUProfPath != "" {
prof.cpu, err = os.Create(fc.Profiling.CPUProfPath)
if err != nil {
log.Fatal().Msgf("cpuprofile: %v", err)
}
err = pprof.StartCPUProfile(prof.cpu)
if err != nil {
log.Fatal().Msgf("cpuprofile: %v", err)
}
}
if fc.Profiling.MemProfPath != "" {
prof.mem, err = os.Create(fc.Profiling.MemProfPath)
if err != nil {
log.Fatal().Msgf("memprofile: %v", err)
}
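// sample roughly one allocation per 4096 bytes so the heap profile is more detailed than the default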
runtime.MemProfileRate = 4096
}
// Uncomment to print json config
// log.Debug().Msgf("Configuration: %is", (func() string {
// is, _ := json.MarshalIndent(fc, "", " ")
......@@ -271,42 +311,30 @@ func main() {
isProxied := fc.Server.Proxy != "" && fc.Server.Host != fc.Server.Proxy
es := api.NewServer(fc.Server.Host, f.E, fc.Server.Cert, fc.Server.Key, fc.Server.CA, isProxied, fc.Server.Proxy)
// TODO this code should live somewhere where it is called every n seconds, but for testing purposes the easiest way
// TODO to simulate an internet shutdown is via killing a node, so testing once at startup should be enough
missedItems := n.RequestNodeStatus(n.GetNodeID())
if missedItems != nil {
log.Warn().Msg("NodeStatus: This node was offline has missed some updates, getting them from other nodes")
for _, item := range missedItems {
nodeID, addr := n.GetNodeWithBiggerExpiry(item.Keygroup)
if addr == "" {
log.Error().Msgf("NodeStatus: Was not able to find node that can provide item %s, skipping it...", item.Keygroup)
continue
}
log.Info().Msgf("Getting item of KG %s ID %s from Node %s @ %s", string(item.Keygroup), item.ID, string(nodeID), addr)
item, err := c.SendGetItem(addr, item.Keygroup, item.ID)
if err != nil {
log.Err(err).Msg("Was not able to get Items from node")
}
expiry, _ := n.GetExpiry(item.Keygroup)
err = store.Update(string(item.Keygroup), item.ID, item.Val, expiry)
if err != nil {
log.Error().Msgf("Could not update missed item %s", item.ID)
}
}
} else {
log.Debug().Msg("NodeStatus: No updates were missed by this node.")
}
quit := make(chan os.Signal, 1)
signal.Notify(quit,
syscall.SIGINT,
syscall.SIGTERM,
syscall.SIGQUIT)
func() {
<-quit
c.Destroy()
log.Err(is.Close()).Msg("error closing peering server")
log.Err(es.Close()).Msg("error closing api server")
log.Err(store.Close()).Msg("error closing database")
}()
os.Interrupt,
syscall.SIGTERM)
<-quit
log.Info().Msg("FReD Node Closing Now!")
c.Destroy()
log.Err(is.Close()).Msg("closing peering server")
log.Err(es.Close()).Msg("closing api server")
log.Err(store.Close()).Msg("closing database")
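// flush profiling data before exiting: stop the CPU profile and write a final heap snapshot to the configured files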
if prof.cpu != nil {
pprof.StopCPUProfile()
err = prof.cpu.Close()
log.Err(err).Msg("stopping cpu profile")
prof.cpu = nil
}
if prof.mem != nil {
err = pprof.Lookup("heap").WriteTo(prof.mem, 0)
log.Err(err).Msg("stopping mem profile")
err = prof.mem.Close()
log.Err(err).Msg("stopping mem profile")
prof.mem = nil
}
}
......@@ -98,8 +98,11 @@ func startServer(cert string, key string, ca string, host string, wsHost string)
log.Debug().Msgf("Interconnection Server is listening on %s", host)
go func() {
err := s.Server.Serve(lis)
defer s.GracefulStop()
log.Fatal().Err(s.Server.Serve(lis)).Msg("Interconnection Server")
if err != nil {
log.Fatal().Msgf("Interconnection Server exited: %s", err.Error())
}
}()
// start an HTTP server that lets us see what the trigger node has received
......
......@@ -18,8 +18,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/frednode cmd/frednode
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install -gcflags="all=-N -l" ./cmd/frednode/
......
......@@ -158,7 +158,12 @@ func NewServer(host string, handler fred.ExtHandler, cert string, key string, ca
log.Debug().Msgf("Externalconnection Server is listening on %s", host)
go func() {
log.Fatal().Err(s.Server.Serve(lis)).Msg("Externalconnection Server")
err := s.Server.Serve(lis)
// if Serve returns without an error, we probably intentionally closed it
if err != nil {
log.Fatal().Msgf("Externalconnection Server exited: %s", err.Error())
}
}()
return s
......
......@@ -86,6 +86,32 @@ func New(config *Config) (f Fred) {
a := newAuthService(config.NaSe)
// TODO this code should live somewhere where it is called every n seconds, but for testing purposes the easiest way
// TODO to simulate an internet shutdown is via killing a node, so testing once at startup should be enough
missedItems := config.NaSe.RequestNodeStatus(config.NaSe.GetNodeID())
if missedItems != nil {
log.Warn().Msg("NodeStatus: This node was offline has missed some updates, getting them from other nodes")
for _, item := range missedItems {
nodeID, addr := config.NaSe.GetNodeWithBiggerExpiry(item.Keygroup)
if addr == "" {
log.Error().Msgf("NodeStatus: Was not able to find node that can provide item %s, skipping it...", item.Keygroup)
continue
}
log.Info().Msgf("Getting item of KG %s ID %s from Node %s @ %s", string(item.Keygroup), item.ID, string(nodeID), addr)
item, err := config.Client.SendGetItem(addr, item.Keygroup, item.ID)
if err != nil {
log.Err(err).Msg("Was not able to get Items from node")
}
expiry, _ := config.NaSe.GetExpiry(item.Keygroup)
err = s.update(item, expiry)
if err != nil {
log.Error().Msgf("Could not update missed item %s", item.ID)
}
}
} else {
log.Debug().Msg("NodeStatus: No updates were missed by this node.")
}
return Fred{
E: newExthandler(s, r, t, a, config.NaSe),
I: newInthandler(s, r, t, config.NaSe),
......
......@@ -34,7 +34,7 @@ func TestMain(m *testing.M) {
},
)
zerolog.SetGlobalLevel(zerolog.DebugLevel)
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
fInfo, err := os.Stat(etcdDir)
......@@ -63,6 +63,8 @@ func TestMain(m *testing.M) {
ClientCertAuth: true,
}
cfg.LogLevel = "error"
e, err := embed.StartEtcd(cfg)
if err != nil {
......@@ -232,3 +234,131 @@ func TestMisformedKeygroupInput(t *testing.T) {
testMisformedKeygroupInput(t, "user", "misf%?ormed", "id", "value")
testMisformedKeygroupInput(t, "use|r", "misf|ormed", "id|", "val|ue")
}
func BenchmarkPut(b *testing.B) {
user := "user"
kg := "benchmarkPut"
id := "benchmarkItem"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: true,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
err = f.E.HandleUpdate(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
func BenchmarkGet(b *testing.B) {
user := "user"
kg := "benchmarkGet"
id := "benchmarkItem"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: true,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
err = f.E.HandleUpdate(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
_, err := f.E.HandleRead(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
func BenchmarkAppend(b *testing.B) {
user := "user"
kg := "benchmarkAppend"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: false,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
_, err = f.E.HandleAppend(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
......@@ -33,7 +33,12 @@ func NewServer(host string, handler fred.IntHandler) *Server {
log.Debug().Msgf("Interconnection Server is listening on %s", host)
go func() {
log.Fatal().Err(s.Server.Serve(lis)).Msg("Interconnection Server")
err := s.Server.Serve(lis)
// if Serve returns without an error, we probably intentionally closed it
if err != nil {
log.Fatal().Msgf("Interconnection Server exited: %s", err.Error())
}
}()
return s
......
......@@ -16,8 +16,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/storageserver cmd/storageserver
COPY pkg pkg
COPY cmd cmd
COPY proto proto
# Static build required so that we can safely copy the binary over.
......
......@@ -63,7 +63,7 @@ func concurrentUpdates(nodes []*grpcclient.Node, concurrent int, updates int, ru
// let's check if everything worked
for i := range expected {
for i := 0; i < concurrent; i++ {
for key, val := range expected[i] {
v := nodes[0].GetItem(keygroup, key, false)
......@@ -147,7 +147,7 @@ func concurrentUpdatesImmutable(nodes []*grpcclient.Node, concurrent int, update
v := nodes[0].GetItem(keygroup, key, false)
found := 0
for i := range expected {
for i := 0; i < concurrent; i++ {
val, ok := expected[i][key]
if !ok {
......@@ -197,12 +197,12 @@ func (t *ConcurrencySuite) RunTests() {
// Test 5: create immutable keygroup, have 100 goroutines append data
// expected behavior: all updates arrive
run++
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA}, 100, 100, run)
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA}, 100, 50, run)
// Test 6: create immutable keygroup on two nodes, have one goroutine each append data
// expected behavior: all updates arrive
run++
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA, t.c.nodeB}, 2, 500, run)
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA, t.c.nodeB}, 2, 100, run)
}
......
......@@ -20,13 +20,13 @@ clean: ## clean up all resources
3nodetest: clean ## start all containers with docker compose and run the test dockerized
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml build
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --remove-orphans
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --exit-code-from tester --renew-anon-volumes --remove-orphans
3nodetest-debug-nodeB: export TEST_RANGE = -
3nodetest-debug-nodeB: ## same as "test" but with nodeB in debugger mode
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml build
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --remove-orphans
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --exit-code-from tester --renew-anon-volumes --remove-orphans
fred: clean ## Start all containers except the tester so that other clients can access them
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
......@@ -35,7 +35,7 @@ fred: clean ## Start all containers except the tester so that other clients can
failtest: ## Start the failtest
@docker-compose -f failtest.yml build
@docker-compose -f failtest.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes
@docker-compose -f failtest.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --exit-code-from tester
alexandra: clean ## start fred with alexandra
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
......
......@@ -38,6 +38,8 @@ services:
--key /cert/nodeA.key \
--ca-file /cert/ca.crt \
--adaptor remote \
--cpuprofile /profiles/cpuprofile.pprof \
--memprofile /profiles/memprofile.pprof \
--nase-host https://172.26.6.1:2379 \
--nase-cert /cert/nodeA.crt \
--nase-key /cert/nodeA.key \
......@@ -50,10 +52,12 @@ services:
--trigger-key /cert/nodeA.key"
environment:
- LOG_LEVEL
stop_signal: SIGINT
volumes:
- ../../nase/tls/nodeA.crt:/cert/nodeA.crt
- ../../nase/tls/nodeA.key:/cert/nodeA.key
- ../../nase/tls/ca.crt:/cert/ca.crt
- .:/profiles
networks:
fredwork:
ipv4_address: 172.26.1.101
......
......@@ -16,8 +16,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/simpletrigger cmd/simpletrigger
COPY pkg pkg
COPY cmd cmd
COPY proto proto
# Static build required so that we can safely copy the binary over.
......