Commit ff8d092d authored by pfandzelter's avatar pfandzelter

add profiling support

add profiling support to 3nodetest
parent d7460e92
Pipeline #15821 passed with stages
in 17 minutes and 20 seconds
......@@ -10,6 +10,10 @@ test.db/
# etcd local database
.default.etcd/
# profiling files
*.pprof
*.pdf
### tubCloud ###
*_conflic*
......
......@@ -5,6 +5,7 @@
<excludeFolder url="file://$MODULE_DIR$/coverage" />
<excludeFolder url="file://$MODULE_DIR$/pkg/badgerdb/test.db" />
<excludeFolder url="file://$MODULE_DIR$/.default.etcd" />
<excludeFolder url="file://$MODULE_DIR$/pkg/fred/.default.etcd" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
......
......@@ -15,8 +15,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/frednode cmd/frednode
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install ./cmd/frednode/
......
......@@ -563,3 +563,22 @@ It starts the same deployment as in the 3 node test and runs a number of queries
It uses the Docker API to destroy and start the corresponding containers.
The code can be found in `./tests/FailingNodeTest` but can be started with `make failtest` in `./tests/3NodeTest/` after a deployment has been created with `make fred`.
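A minimal sketch of that workflow, run from `./tests/3NodeTest/` (the `fred` and `failtest` targets both live in that Makefile):
```sh
# create the 3 node deployment without the tester
$ make fred

# in a second terminal, run the failing node test against the running deployment
$ make failtest
```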
#### Profiling
FReD supports CPU and memory profiling for the main `frednode` binary.
Use the `--cpuprofile` and `--memprofile` flags in addition to your other flags to enable profiling.
Keep in mind that this may have an impact on performance in some cases.
```sh
# start fred with profiling
$ ./frednode --cpuprofile fredcpu.pprof --memprofile fredmem.pprof [ALL_YOUR_OTHER_FLAGS...]
# run tests, benchmarks, your application, etc.
# then quit fred with SIGINT or SIGTERM and your files will be written
# open pprof files and convert to pdf (note that you need graphviz installed)
# you also need to provide the path to your frednode binary
$ go tool pprof --pdf ./frednode fredcpu.pprof > cpu.pdf
$ go tool pprof --pdf ./frednode fredmem.pprof > mem.pdf
```
\ No newline at end of file
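The profile paths can also be set through the `PROFILING_CPU_PATH` and `PROFILING_MEM_PATH` environment variables instead of the flags. A minimal sketch, assuming the same file names as in the example above:
```sh
# equivalent setup via environment variables instead of flags
$ PROFILING_CPU_PATH=fredcpu.pprof PROFILING_MEM_PATH=fredmem.pprof ./frednode [ALL_YOUR_OTHER_FLAGS...]
```
In the dockerized 3NodeTest, node A is started with `--cpuprofile /profiles/cpuprofile.pprof` and `--memprofile /profiles/memprofile.pprof` and the test directory is mounted at `/profiles`, so the profile files show up next to the compose files once the containers stop.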
......@@ -15,8 +15,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/alexandra cmd/alexandra
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install ./cmd/alexandra/
......
......@@ -4,22 +4,24 @@ import (
"flag"
"os"
"os/signal"
"runtime"
"runtime/pprof"
"syscall"
"git.tu-berlin.de/mcc-fred/fred/pkg/api"
"git.tu-berlin.de/mcc-fred/fred/pkg/dynamo"
"git.tu-berlin.de/mcc-fred/fred/pkg/etcdnase"
"git.tu-berlin.de/mcc-fred/fred/pkg/nasecache"
"git.tu-berlin.de/mcc-fred/fred/pkg/peering"
"git.tu-berlin.de/mcc-fred/fred/pkg/storageclient"
"github.com/caarlos0/env/v6"
"github.com/go-errors/errors"
"github.com/mmcloughlin/geohash"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"git.tu-berlin.de/mcc-fred/fred/pkg/api"
"git.tu-berlin.de/mcc-fred/fred/pkg/badgerdb"
"git.tu-berlin.de/mcc-fred/fred/pkg/dynamo"
"git.tu-berlin.de/mcc-fred/fred/pkg/etcdnase"
"git.tu-berlin.de/mcc-fred/fred/pkg/fred"
"git.tu-berlin.de/mcc-fred/fred/pkg/nasecache"
"git.tu-berlin.de/mcc-fred/fred/pkg/peering"
"git.tu-berlin.de/mcc-fred/fred/pkg/storageclient"
)
type fredConfig struct {
......@@ -73,6 +75,10 @@ type fredConfig struct {
Cert string `env:"TRIGGER_CERT"`
Key string `env:"TRIGGER_KEY"`
}
Profiling struct {
CPUProfPath string `env:"PROFILING_CPU_PATH"`
MemProfPath string `env:"PROFILING_MEM_PATH"`
}
}
func parseArgs() (fc fredConfig) {
......@@ -126,6 +132,10 @@ func parseArgs() (fc fredConfig) {
// trigger node tls configuration
flag.StringVar(&(fc.Trigger.Cert), "trigger-cert", "", "Certificate for trigger node connection. (Env: TRIGGER_CERT)")
flag.StringVar(&(fc.Trigger.Key), "trigger-key", "", "Key file for trigger node connection. (Env: TRIGGER_KEY)")
flag.StringVar(&(fc.Profiling.CPUProfPath), "cpuprofile", "", "Enable CPU profiling and specify path for pprof output. (Env: PROFILING_CPU_PATH)")
flag.StringVar(&(fc.Profiling.MemProfPath), "memprofile", "", "Enable memory profiling and specify path for pprof output. (Env: PROFILING_MEM_PATH)")
flag.Parse()
// override with ENV variables
......@@ -185,6 +195,36 @@ func main() {
log.Fatal().Msg("Log ExtHandler has to be either dev or prod")
}
// https://github.com/influxdata/influxdb/blob/master/cmd/influxd/internal/profile/profile.go
var prof struct {
cpu *os.File
mem *os.File
}
if fc.Profiling.CPUProfPath != "" {
prof.cpu, err = os.Create(fc.Profiling.CPUProfPath)
if err != nil {
log.Fatal().Msgf("cpuprofile: %v", err)
}
err = pprof.StartCPUProfile(prof.cpu)
if err != nil {
log.Fatal().Msgf("cpuprofile: %v", err)
}
}
if fc.Profiling.MemProfPath != "" {
prof.mem, err = os.Create(fc.Profiling.MemProfPath)
if err != nil {
log.Fatal().Msgf("memprofile: %v", err)
}
runtime.MemProfileRate = 4096
}
// Uncomment to print json config
// log.Debug().Msgf("Configuration: %is", (func() string {
// is, _ := json.MarshalIndent(fc, "", " ")
......@@ -271,42 +311,30 @@ func main() {
isProxied := fc.Server.Proxy != "" && fc.Server.Host != fc.Server.Proxy
es := api.NewServer(fc.Server.Host, f.E, fc.Server.Cert, fc.Server.Key, fc.Server.CA, isProxied, fc.Server.Proxy)
// TODO this code should live somewhere where it is called every n seconds, but for testing purposes the easiest way
// TODO to simulate an internet shutdown is via killing a node, so testing once at startup should be enough
missedItems := n.RequestNodeStatus(n.GetNodeID())
if missedItems != nil {
log.Warn().Msg("NodeStatus: This node was offline has missed some updates, getting them from other nodes")
for _, item := range missedItems {
nodeID, addr := n.GetNodeWithBiggerExpiry(item.Keygroup)
if addr == "" {
log.Error().Msgf("NodeStatus: Was not able to find node that can provide item %s, skipping it...", item.Keygroup)
continue
}
log.Info().Msgf("Getting item of KG %s ID %s from Node %s @ %s", string(item.Keygroup), item.ID, string(nodeID), addr)
item, err := c.SendGetItem(addr, item.Keygroup, item.ID)
if err != nil {
log.Err(err).Msg("Was not able to get Items from node")
}
expiry, _ := n.GetExpiry(item.Keygroup)
err = store.Update(string(item.Keygroup), item.ID, item.Val, expiry)
if err != nil {
log.Error().Msgf("Could not update missed item %s", item.ID)
}
}
} else {
log.Debug().Msg("NodeStatus: No updates were missed by this node.")
}
quit := make(chan os.Signal, 1)
signal.Notify(quit,
syscall.SIGINT,
syscall.SIGTERM,
syscall.SIGQUIT)
func() {
<-quit
c.Destroy()
log.Err(is.Close()).Msg("error closing peering server")
log.Err(es.Close()).Msg("error closing api server")
log.Err(store.Close()).Msg("error closing database")
}()
os.Interrupt,
syscall.SIGTERM)
<-quit
log.Info().Msg("FReD Node Closing Now!")
c.Destroy()
log.Err(is.Close()).Msg("closing peering server")
log.Err(es.Close()).Msg("closing api server")
log.Err(store.Close()).Msg("closing database")
if prof.cpu != nil {
pprof.StopCPUProfile()
err = prof.cpu.Close()
log.Err(err).Msg("stopping cpu profile")
prof.cpu = nil
}
if prof.mem != nil {
err = pprof.Lookup("heap").WriteTo(prof.mem, 0)
log.Err(err).Msg("writing mem profile")
err = prof.mem.Close()
log.Err(err).Msg("closing mem profile")
prof.mem = nil
}
}
......@@ -98,8 +98,11 @@ func startServer(cert string, key string, ca string, host string, wsHost string)
log.Debug().Msgf("Interconnection Server is listening on %s", host)
go func() {
err := s.Server.Serve(lis)
defer s.GracefulStop()
log.Fatal().Err(s.Server.Serve(lis)).Msg("Interconnection Server")
if err != nil {
log.Fatal().Msgf("Interconnection Server exited: %s", err.Error())
}
}()
// start an HTTP server that lets us see what the trigger node has received
......
......@@ -18,8 +18,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/frednode cmd/frednode
COPY pkg pkg
COPY cmd cmd
COPY proto proto
RUN CGO_ENABLED=0 go install -gcflags="all=-N -l" ./cmd/frednode/
......
......@@ -158,7 +158,12 @@ func NewServer(host string, handler fred.ExtHandler, cert string, key string, ca
log.Debug().Msgf("Externalconnection Server is listening on %s", host)
go func() {
log.Fatal().Err(s.Server.Serve(lis)).Msg("Externalconnection Server")
err := s.Server.Serve(lis)
// if Serve returns without an error, we probably intentionally closed it
if err != nil {
log.Fatal().Msgf("Externalconnection Server exited: %s", err.Error())
}
}()
return s
......
......@@ -86,6 +86,32 @@ func New(config *Config) (f Fred) {
a := newAuthService(config.NaSe)
// TODO this code should live somewhere where it is called every n seconds, but for testing purposes the easiest way
// TODO to simulate an internet shutdown is via killing a node, so testing once at startup should be enough
missedItems := config.NaSe.RequestNodeStatus(config.NaSe.GetNodeID())
if missedItems != nil {
log.Warn().Msg("NodeStatus: This node was offline and has missed some updates, getting them from other nodes")
for _, item := range missedItems {
nodeID, addr := config.NaSe.GetNodeWithBiggerExpiry(item.Keygroup)
if addr == "" {
log.Error().Msgf("NodeStatus: Was not able to find node that can provide item %s, skipping it...", item.Keygroup)
continue
}
log.Info().Msgf("Getting item of KG %s ID %s from Node %s @ %s", string(item.Keygroup), item.ID, string(nodeID), addr)
item, err := config.Client.SendGetItem(addr, item.Keygroup, item.ID)
if err != nil {
log.Err(err).Msg("Was not able to get Items from node")
}
expiry, _ := config.NaSe.GetExpiry(item.Keygroup)
err = s.update(item, expiry)
if err != nil {
log.Error().Msgf("Could not update missed item %s", item.ID)
}
}
} else {
log.Debug().Msg("NodeStatus: No updates were missed by this node.")
}
return Fred{
E: newExthandler(s, r, t, a, config.NaSe),
I: newInthandler(s, r, t, config.NaSe),
......
......@@ -34,7 +34,7 @@ func TestMain(m *testing.M) {
},
)
zerolog.SetGlobalLevel(zerolog.DebugLevel)
zerolog.SetGlobalLevel(zerolog.ErrorLevel)
fInfo, err := os.Stat(etcdDir)
......@@ -63,6 +63,8 @@ func TestMain(m *testing.M) {
ClientCertAuth: true,
}
cfg.LogLevel = "error"
e, err := embed.StartEtcd(cfg)
if err != nil {
......@@ -232,3 +234,131 @@ func TestMisformedKeygroupInput(t *testing.T) {
testMisformedKeygroupInput(t, "user", "misf%?ormed", "id", "value")
testMisformedKeygroupInput(t, "use|r", "misf|ormed", "id|", "val|ue")
}
func BenchmarkPut(b *testing.B) {
user := "user"
kg := "benchmarkPut"
id := "benchmarkItem"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: true,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
err = f.E.HandleUpdate(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
func BenchmarkGet(b *testing.B) {
user := "user"
kg := "benchmarkGet"
id := "benchmarkItem"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: true,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
err = f.E.HandleUpdate(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
_, err := f.E.HandleRead(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
ID: id,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
func BenchmarkAppend(b *testing.B) {
user := "user"
kg := "benchmarkAppend"
value := "benchmarkVal"
err := f.E.HandleCreateKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
Mutable: false,
Expiry: 0,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
for i := 0; i < b.N; i++ {
_, err = f.E.HandleAppend(user, fred.Item{
Keygroup: fred.KeygroupName(kg),
Val: value,
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
err = f.E.HandleDeleteKeygroup(user, fred.Keygroup{
Name: fred.KeygroupName(kg),
})
if err != nil {
log.Err(err).Msg(err.(*errors.Error).ErrorStack())
b.Error(err)
}
}
......@@ -33,7 +33,12 @@ func NewServer(host string, handler fred.IntHandler) *Server {
log.Debug().Msgf("Interconnection Server is listening on %s", host)
go func() {
log.Fatal().Err(s.Server.Serve(lis)).Msg("Interconnection Server")
err := s.Server.Serve(lis)
// if Serve returns without an error, we probably intentionally closed it
if err != nil {
log.Fatal().Msgf("Interconnection Server exited: %s", err.Error())
}
}()
return s
......
......@@ -16,8 +16,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/storageserver cmd/storageserver
COPY pkg pkg
COPY cmd cmd
COPY proto proto
# Static build required so that we can safely copy the binary over.
......
......@@ -63,7 +63,7 @@ func concurrentUpdates(nodes []*grpcclient.Node, concurrent int, updates int, ru
// let's check if everything worked
for i := range expected {
for i := 0; i < concurrent; i++ {
for key, val := range expected[i] {
v := nodes[0].GetItem(keygroup, key, false)
......@@ -147,7 +147,7 @@ func concurrentUpdatesImmutable(nodes []*grpcclient.Node, concurrent int, update
v := nodes[0].GetItem(keygroup, key, false)
found := 0
for i := range expected {
for i := 0; i < concurrent; i++ {
val, ok := expected[i][key]
if !ok {
......@@ -197,12 +197,12 @@ func (t *ConcurrencySuite) RunTests() {
// Test 5: create immutable keygroup, have 100 goroutines append data
// expected behavior: all updates arrive
run++
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA}, 100, 100, run)
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA}, 100, 50, run)
// Test 6: create immutable keygroup on two nodes, have one goroutine each append data
// expected behavior: all updates arrive
run++
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA, t.c.nodeB}, 2, 500, run)
concurrentUpdatesImmutable([]*grpcclient.Node{t.c.nodeA, t.c.nodeB}, 2, 100, run)
}
......
......@@ -20,13 +20,13 @@ clean: ## clean up all resources
3nodetest: clean ## start all containers with docker compose and run the test dockerized
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml build
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --remove-orphans
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --exit-code-from tester --renew-anon-volumes --remove-orphans
3nodetest-debug-nodeB: export TEST_RANGE = -
3nodetest-debug-nodeB: ## same as "test" but with nodeB in debugger mode
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml build
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --remove-orphans
@docker-compose -f etcd.yml -f nodeA.yml -f nodeB-debug.yml -f nodeC.yml -f 3NodeTester.yml -f trigger.yml up --force-recreate --abort-on-container-exit --exit-code-from tester --renew-anon-volumes --remove-orphans
fred: clean ## Start all containers except the tester so that other clients can access them
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
......@@ -35,7 +35,7 @@ fred: clean ## Start all containers except the tester so that other clients can
failtest: ## Start the failtest
@docker-compose -f failtest.yml build
@docker-compose -f failtest.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes
@docker-compose -f failtest.yml up --force-recreate --abort-on-container-exit --renew-anon-volumes --exit-code-from tester
alexandra: clean ## start fred with alexandra
@docker network create fredwork --gateway 172.26.0.1 --subnet 172.26.0.0/16 || true
......
......@@ -38,6 +38,8 @@ services:
--key /cert/nodeA.key \
--ca-file /cert/ca.crt \
--adaptor remote \
--cpuprofile /profiles/cpuprofile.pprof \
--memprofile /profiles/memprofile.pprof \
--nase-host https://172.26.6.1:2379 \
--nase-cert /cert/nodeA.crt \
--nase-key /cert/nodeA.key \
......@@ -50,10 +52,12 @@ services:
--trigger-key /cert/nodeA.key"
environment:
- LOG_LEVEL
stop_signal: SIGINT
volumes:
- ../../nase/tls/nodeA.crt:/cert/nodeA.crt
- ../../nase/tls/nodeA.key:/cert/nodeA.key
- ../../nase/tls/ca.crt:/cert/ca.crt
- .:/profiles
networks:
fredwork:
ipv4_address: 172.26.1.101
......
......@@ -16,8 +16,8 @@ COPY go.sum .
RUN go mod download
COPY cmd/simpletrigger cmd/simpletrigger
COPY pkg pkg
COPY cmd cmd
COPY proto proto
# Static build required so that we can safely copy the binary over.
......