From baed60e19ba5e58271cbe30aecb09404716e4a99 Mon Sep 17 00:00:00 2001 From: Quentin Machu Date: Sat, 23 Jan 2016 22:02:34 -0500 Subject: [PATCH] prometheus: add initial Prometheus support --- database/pgsql/feature.go | 39 +- database/pgsql/keyvalue.go | 5 + database/pgsql/layer.go | 26 + database/pgsql/lock.go | 8 + database/pgsql/namespace.go | 5 + database/pgsql/notification.go | 11 + database/pgsql/pgsql.go | 25 +- database/pgsql/vulnerability.go | 24 +- grafana.json | 851 ++++++++++++++++++++++++++++++++ notifier/notifier.go | 17 +- updater/updater.go | 52 +- utils/prometheus.go | 13 + 12 files changed, 1048 insertions(+), 28 deletions(-) create mode 100644 grafana.json create mode 100644 utils/prometheus.go diff --git a/database/pgsql/feature.go b/database/pgsql/feature.go index 84c2cc10..c87d1ce2 100644 --- a/database/pgsql/feature.go +++ b/database/pgsql/feature.go @@ -16,6 +16,7 @@ package pgsql import ( "database/sql" + "time" "github.com/coreos/clair/database" cerrors "github.com/coreos/clair/utils/errors" @@ -27,15 +28,19 @@ func (pgSQL *pgSQL) insertFeature(feature database.Feature) (int, error) { return 0, cerrors.NewBadRequestError("could not find/insert invalid Feature") } + // Do cache lookup. if pgSQL.cache != nil { - promCacheQueriesTotal.WithLabelValues("feature").Inc() + promCacheQueriesTotal.WithLabelValues("feature").Inc() id, found := pgSQL.cache.Get("feature:" + feature.Namespace.Name + ":" + feature.Name) if found { - promCacheHitsTotal.WithLabelValues("feature").Inc() + promCacheHitsTotal.WithLabelValues("feature").Inc() return id.(int), nil } } + // We do `defer observeQueryTime` here because we don't want to observe cached features. + defer observeQueryTime("insertFeature", "all", time.Now()) + // Find or create Namespace. namespaceID, err := pgSQL.insertNamespace(feature.Namespace) if err != nil { @@ -61,21 +66,29 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion) return 0, cerrors.NewBadRequestError("could not find/insert invalid FeatureVersion") } + // Do cache lookup. if pgSQL.cache != nil { - promCacheQueriesTotal.WithLabelValues("featureversion").Inc() + promCacheQueriesTotal.WithLabelValues("featureversion").Inc() id, found := pgSQL.cache.Get("featureversion:" + featureVersion.Feature.Namespace.Name + ":" + featureVersion.Feature.Name + ":" + featureVersion.Version.String()) if found { - promCacheHitsTotal.WithLabelValues("featureversion").Inc() + promCacheHitsTotal.WithLabelValues("featureversion").Inc() return id.(int), nil } } + // We do `defer observeQueryTime` here because we don't want to observe cached featureversions. + defer observeQueryTime("insertFeatureVersion", "all", time.Now()) + // Find or create Feature first. + t := time.Now() featureID, err := pgSQL.insertFeature(featureVersion.Feature) + observeQueryTime("insertFeatureVersion", "insertFeature", t) + if err != nil { return 0, err } + featureVersion.Feature.ID = featureID // Begin transaction. @@ -85,10 +98,14 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion) return 0, handleError("insertFeatureVersion.Begin()", err) } - // Set transaction as SERIALIZABLE. - // This is how we ensure that the data in Vulnerability_Affects_FeatureVersion is always - // consistent. + // Lock Vulnerability_Affects_FeatureVersion exclusively. + // We want to prevent InsertVulnerability to modify it. + promConcurrentLockVAFV.Inc() + defer promConcurrentLockVAFV.Dec() + t = time.Now() _, err = tx.Exec(getQuery("l_vulnerability_affects_featureversion")) + observeQueryTime("insertFeatureVersion", "lock", t) + if err != nil { tx.Rollback() return 0, handleError("insertFeatureVersion.l_vulnerability_affects_featureversion", err) @@ -96,12 +113,17 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion) // Find or create FeatureVersion. var newOrExisting string + + t = time.Now() err = tx.QueryRow(getQuery("soi_featureversion"), featureID, &featureVersion.Version). Scan(&newOrExisting, &featureVersion.ID) + observeQueryTime("insertFeatureVersion", "soi_featureversion", t) + if err != nil { tx.Rollback() return 0, handleError("soi_featureversion", err) } + if newOrExisting == "exi" { // That featureVersion already exists, return its id. tx.Commit() @@ -110,7 +132,10 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion) // Link the new FeatureVersion with every vulnerabilities that affect it, by inserting in // Vulnerability_Affects_FeatureVersion. + t = time.Now() err = linkFeatureVersionToVulnerabilities(tx, featureVersion) + observeQueryTime("insertFeatureVersion", "linkFeatureVersionToVulnerabilities", t) + if err != nil { tx.Rollback() return 0, err diff --git a/database/pgsql/keyvalue.go b/database/pgsql/keyvalue.go index ce16e894..68995ed1 100644 --- a/database/pgsql/keyvalue.go +++ b/database/pgsql/keyvalue.go @@ -16,6 +16,7 @@ package pgsql import ( "database/sql" + "time" cerrors "github.com/coreos/clair/utils/errors" ) @@ -27,6 +28,8 @@ func (pgSQL *pgSQL) InsertKeyValue(key, value string) (err error) { return cerrors.NewBadRequestError("could not insert a flag which has an empty name or value") } + defer observeQueryTime("InsertKeyValue", "all", time.Now()) + // Upsert. // // Note: UPSERT works only on >= PostgreSQL 9.5 which is not yet supported by AWS RDS. @@ -64,6 +67,8 @@ func (pgSQL *pgSQL) InsertKeyValue(key, value string) (err error) { // GetValue reads a single key / value tuple and returns an empty string if the key doesn't exist. func (pgSQL *pgSQL) GetKeyValue(key string) (string, error) { + defer observeQueryTime("GetKeyValue", "all", time.Now()) + var value string err := pgSQL.QueryRow(getQuery("s_keyvalue"), key).Scan(&value) diff --git a/database/pgsql/layer.go b/database/pgsql/layer.go index 3f0e28d7..a0d46773 100644 --- a/database/pgsql/layer.go +++ b/database/pgsql/layer.go @@ -16,6 +16,7 @@ package pgsql import ( "database/sql" + "time" "github.com/coreos/clair/database" "github.com/coreos/clair/utils" @@ -24,6 +25,14 @@ import ( ) func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities bool) (database.Layer, error) { + subquery := "all" + if withFeatures { + subquery += "/features" + } else if withVulnerabilities { + subquery += "/features+vulnerabilities" + } + defer observeQueryTime("FindLayer", subquery, time.Now()) + // Find the layer var layer database.Layer var parentID zero.Int @@ -31,9 +40,12 @@ func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities boo var namespaceID zero.Int var namespaceName sql.NullString + t := time.Now() err := pgSQL.QueryRow(getQuery("s_layer"), name). Scan(&layer.ID, &layer.Name, &layer.EngineVersion, &parentID, &parentName, &namespaceID, &namespaceName) + observeQueryTime("FindLayer", "s_layer", t) + if err != nil { return layer, handleError("s_layer", err) } @@ -53,15 +65,22 @@ func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities boo // Find its features if withFeatures || withVulnerabilities { + t = time.Now() featureVersions, err := pgSQL.getLayerFeatureVersions(layer.ID) + observeQueryTime("FindLayer", "getLayerFeatureVersions", t) + if err != nil { return layer, err } + layer.Features = featureVersions if withVulnerabilities { // Load the vulnerabilities that affect the FeatureVersions. + t = time.Now() err := pgSQL.loadAffectedBy(layer.Features) + observeQueryTime("FindLayer", "loadAffectedBy", t) + if err != nil { return layer, err } @@ -183,6 +202,8 @@ func (pgSQL *pgSQL) loadAffectedBy(featureVersions []database.FeatureVersion) er // been modified. // TODO(Quentin-M): This behavior should be implemented at the Feature detectors level. func (pgSQL *pgSQL) InsertLayer(layer database.Layer) error { + tf := time.Now() + // Verify parameters if layer.Name == "" { log.Warning("could not insert a layer which has an empty Name") @@ -202,6 +223,9 @@ func (pgSQL *pgSQL) InsertLayer(layer database.Layer) error { layer.ID = existingLayer.ID } + // We do `defer observeQueryTime` here because we don't want to observe existing layers. + defer observeQueryTime("InsertLayer", "all", tf) + // Get parent ID. var parentID zero.Int if layer.Parent != nil { @@ -346,6 +370,8 @@ func createNV(features []database.FeatureVersion) (map[string]*database.FeatureV } func (pgSQL *pgSQL) DeleteLayer(name string) error { + defer observeQueryTime("DeleteLayer", "all", time.Now()) + result, err := pgSQL.Exec(getQuery("r_layer"), name) if err != nil { return handleError("r_layer", err) diff --git a/database/pgsql/lock.go b/database/pgsql/lock.go index c9cb2508..82d1c75f 100644 --- a/database/pgsql/lock.go +++ b/database/pgsql/lock.go @@ -30,6 +30,8 @@ func (pgSQL *pgSQL) Lock(name string, owner string, duration time.Duration, rene return false, time.Time{} } + defer observeQueryTime("Lock", "all", time.Now()) + // Prune locks. pgSQL.pruneLocks() @@ -68,6 +70,8 @@ func (pgSQL *pgSQL) Unlock(name, owner string) { return } + defer observeQueryTime("Unlock", "all", time.Now()) + pgSQL.Exec(getQuery("r_lock"), name, owner) } @@ -79,6 +83,8 @@ func (pgSQL *pgSQL) FindLock(name string) (string, time.Time, error) { return "", time.Time{}, cerrors.NewBadRequestError("could not find an invalid lock") } + defer observeQueryTime("FindLock", "all", time.Now()) + var owner string var until time.Time err := pgSQL.QueryRow(getQuery("f_lock"), name).Scan(&owner, &until) @@ -91,6 +97,8 @@ func (pgSQL *pgSQL) FindLock(name string) (string, time.Time, error) { // pruneLocks removes every expired locks from the database func (pgSQL *pgSQL) pruneLocks() { + defer observeQueryTime("pruneLocks", "all", time.Now()) + if _, err := pgSQL.Exec(getQuery("r_lock_expired")); err != nil { handleError("r_lock_expired", err) } diff --git a/database/pgsql/namespace.go b/database/pgsql/namespace.go index 5d7b406e..48538fca 100644 --- a/database/pgsql/namespace.go +++ b/database/pgsql/namespace.go @@ -15,6 +15,8 @@ package pgsql import ( + "time" + "github.com/coreos/clair/database" cerrors "github.com/coreos/clair/utils/errors" ) @@ -32,6 +34,9 @@ func (pgSQL *pgSQL) insertNamespace(namespace database.Namespace) (int, error) { } } + // We do `defer observeQueryTime` here because we don't want to observe cached namespaces. + defer observeQueryTime("insertNamespace", "all", time.Now()) + var id int err := pgSQL.QueryRow(getQuery("soi_namespace"), namespace.Name).Scan(&id) if err != nil { diff --git a/database/pgsql/notification.go b/database/pgsql/notification.go index ed98c17a..6e631708 100644 --- a/database/pgsql/notification.go +++ b/database/pgsql/notification.go @@ -13,6 +13,8 @@ import ( // do it in tx so we won't insert/update a vuln without notification and vice-versa. // name and created doesn't matter. func (pgSQL *pgSQL) insertNotification(tx *sql.Tx, notification database.VulnerabilityNotification) error { + defer observeQueryTime("insertNotification", "all", time.Now()) + // Marshal old and new Vulnerabilities. oldVulnerability, err := json.Marshal(notification.OldVulnerability) if err != nil { @@ -38,6 +40,8 @@ func (pgSQL *pgSQL) insertNotification(tx *sql.Tx, notification database.Vulnera // Get one available notification name (!locked && !deleted && (!notified || notified_but_timed-out)). // Does not fill new/old vuln. func (pgSQL *pgSQL) GetAvailableNotification(renotifyInterval time.Duration) (database.VulnerabilityNotification, error) { + defer observeQueryTime("GetAvailableNotification", "all", time.Now()) + before := time.Now().Add(-renotifyInterval) var notification database.VulnerabilityNotification @@ -51,6 +55,8 @@ func (pgSQL *pgSQL) GetAvailableNotification(renotifyInterval time.Duration) (da } func (pgSQL *pgSQL) GetNotification(name string, limit, page int) (database.VulnerabilityNotification, error) { + defer observeQueryTime("GetNotification", "all", time.Now()) + // Get Notification. var notification database.VulnerabilityNotification var oldVulnerability []byte @@ -74,11 +80,14 @@ func (pgSQL *pgSQL) GetNotification(name string, limit, page int) (database.Vuln } // TODO(Quentin-M): Fill LayersIntroducingVulnerability. + // And time it. return notification, nil } func (pgSQL *pgSQL) SetNotificationNotified(name string) error { + defer observeQueryTime("SetNotificationNotified", "all", time.Now()) + if _, err := pgSQL.Exec(getQuery("u_notification_notified"), name); err != nil { return handleError("u_notification_notified", err) } @@ -86,6 +95,8 @@ func (pgSQL *pgSQL) SetNotificationNotified(name string) error { } func (pgSQL *pgSQL) DeleteNotification(name string) error { + defer observeQueryTime("DeleteNotification", "all", time.Now()) + result, err := pgSQL.Exec(getQuery("r_notification"), name) if err != nil { return handleError("r_notification", err) diff --git a/database/pgsql/pgsql.go b/database/pgsql/pgsql.go index 22bef59a..77ad428d 100644 --- a/database/pgsql/pgsql.go +++ b/database/pgsql/pgsql.go @@ -22,10 +22,12 @@ import ( "path" "runtime" "strings" + "time" "bitbucket.org/liamstask/goose/lib/goose" "github.com/coreos/clair/config" "github.com/coreos/clair/database" + "github.com/coreos/clair/utils" cerrors "github.com/coreos/clair/utils/errors" "github.com/coreos/pkg/capnslog" "github.com/hashicorp/golang-lru" @@ -39,23 +41,36 @@ var ( promErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "clair_pgsql_errors_total", - Help: "Number of errors that PostgreSQL requests generates.", + Help: "Number of errors that PostgreSQL requests generated.", }, []string{"request"}) promCacheHitsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "clair_pgsql_cache_hits_total", - Help: "Number of cache hits that the PostgreSQL backend does.", + Help: "Number of cache hits that the PostgreSQL backend did.", }, []string{"object"}) promCacheQueriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "clair_pgsql_cache_queries_total", - Help: "Number of cache queries that the PostgreSQL backend does.", + Help: "Number of cache queries that the PostgreSQL backend did.", }, []string{"object"}) + + promQueryDurationMilliseconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "clair_pgsql_query_duration_milliseconds", + Help: "Time it takes to execute the database query.", + }, []string{"query", "subquery"}) + + promConcurrentLockVAFV = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "clair_pgsql_concurrent_lock_vafv_total", + Help: "Number of transactions trying to hold the exclusive Vulnerability_Affects_FeatureVersion lock.", + }) ) func init() { + prometheus.MustRegister(promErrorsTotal) prometheus.MustRegister(promCacheHitsTotal) prometheus.MustRegister(promCacheQueriesTotal) + prometheus.MustRegister(promQueryDurationMilliseconds) + prometheus.MustRegister(promConcurrentLockVAFV) } type pgSQL struct { @@ -240,3 +255,7 @@ func isErrUniqueViolation(err error) bool { pqErr, ok := err.(*pq.Error) return ok && pqErr.Code == "23505" } + +func observeQueryTime(query, subquery string, start time.Time) { + utils.PrometheusObserveTimeMilliseconds(promQueryDurationMilliseconds.WithLabelValues(query, subquery), start) +} diff --git a/database/pgsql/vulnerability.go b/database/pgsql/vulnerability.go index ee3b54d0..b8a71f06 100644 --- a/database/pgsql/vulnerability.go +++ b/database/pgsql/vulnerability.go @@ -17,6 +17,7 @@ package pgsql import ( "database/sql" "fmt" + "time" "github.com/coreos/clair/database" "github.com/coreos/clair/utils" @@ -26,6 +27,8 @@ import ( ) func (pgSQL *pgSQL) FindVulnerability(namespaceName, name string) (database.Vulnerability, error) { + defer observeQueryTime("FindVulnerability", "all", time.Now()) + vulnerability := database.Vulnerability{ Name: name, Namespace: database.Namespace{ @@ -92,6 +95,8 @@ func (pgSQL *pgSQL) InsertVulnerabilities(vulnerabilities []database.Vulnerabili } func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) error { + tf := time.Now() + // Verify parameters if vulnerability.Name == "" || len(vulnerability.FixedIn) == 0 || vulnerability.Namespace.Name == "" || !vulnerability.Severity.IsValid() { @@ -142,6 +147,10 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er } } + // We do `defer observeQueryTime` here because we don't want to observe existing & up-to-date + // vulnerabilities. + defer observeQueryTime("insertVulnerability", "all", tf) + // Insert or find the new Features. // We already have the Feature IDs in updatedFixedInFeatureVersions because diffFixedIn fills them // in using the existing vulnerability's FixedIn FeatureVersions. Note that even if FixedIn @@ -166,10 +175,14 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er return handleError("insertVulnerability.Begin()", err) } - // Set transaction as SERIALIZABLE. - // This is how we ensure that the data in Vulnerability_Affects_FeatureVersion is always - // consistent. + // Lock Vulnerability_Affects_FeatureVersion exclusively. + // We want to prevent InsertFeatureVersion to modify it. + promConcurrentLockVAFV.Inc() + defer promConcurrentLockVAFV.Dec() + t := time.Now() _, err = tx.Exec(getQuery("l_vulnerability_affects_featureversion")) + observeQueryTime("insertVulnerability", "lock", t) + if err != nil { tx.Rollback() return handleError("insertVulnerability.l_vulnerability_affects_featureversion", err) @@ -206,7 +219,10 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er } // Update Vulnerability_FixedIn_Feature and Vulnerability_Affects_FeatureVersion now. + t = time.Now() err = pgSQL.updateVulnerabilityFeatureVersions(tx, &vulnerability, &existingVulnerability, newFixedInFeatureVersions, updatedFixedInFeatureVersions) + observeQueryTime("insertVulnerability", "updateVulnerabilityFeatureVersions", t) + if err != nil { tx.Rollback() return err @@ -371,6 +387,8 @@ func linkVulnerabilityToFeatureVersions(tx *sql.Tx, fixedInID, vulnerabilityID, } func (pgSQL *pgSQL) DeleteVulnerability(namespaceName, name string) error { + defer observeQueryTime("DeleteVulnerability", "all", time.Now()) + result, err := pgSQL.Exec(getQuery("r_vulnerability"), namespaceName, name) if err != nil { return handleError("r_vulnerability", err) diff --git a/grafana.json b/grafana.json new file mode 100644 index 00000000..d20889cd --- /dev/null +++ b/grafana.json @@ -0,0 +1,851 @@ +{ + "id": 1, + "title": "Clair", + "originalTitle": "Clair", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "leftYAxisLabel": "", + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(clair_updater_duration_seconds)", + "hide": false, + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "Updater - duration", + "metric": "", + "refId": "A", + "step": 120 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Updater - Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "s", + "short" + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": null, + "editable": true, + "error": false, + "format": "none", + "id": 3, + "interval": null, + "isNew": true, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "clair_updater_notes_total", + "intervalFactor": 2, + "legendFormat": "", + "metric": "clair_updater_notes_total", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "Updater - Number of Notes", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_updater_errors_total[$rate]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Updater - Error rate", + "metric": "clair_updater_errors_total", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Updater - Error rate", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "short", + "short" + ] + } + ], + "showTitle": true, + "title": "Updater" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_notifier_latency_seconds_sum[$rate]))/sum(rate(clair_notifier_latency_seconds_count[$rate]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Notifier - Latency", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Notifier - Latency", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "ms", + "short" + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 5, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_notifier_backend_errors_total[$rate]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Notifier - Error rate", + "metric": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Notifier - Error rate", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "short", + "short" + ] + } + ], + "showTitle": true, + "title": "Notifier" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": 100, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 7, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_pgsql_cache_hits_total[$rate])) by (object) / sum(rate(clair_pgsql_cache_queries_total[$rate])) by (object) * 100", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{object}}", + "metric": "clair_pgsql_cache_queries_total", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "PostgreSQL - Cache Hit Rate", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "percent", + "short" + ] + }, + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_pgsql_errors_total[$rate]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PostgreSQL - Error rate", + "metric": "clair_updater_errors_total", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "PostgreSQL - Error rate", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "short", + "short" + ] + } + ], + "showTitle": true, + "title": "PostgreSQL" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": null, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 8, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum[$rate])) by (query, subquery) / sum(rate(clair_pgsql_query_duration_milliseconds_count[$rate])) by (query, subquery)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{query}} - {{subquery}}", + "metric": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "PostgreSQL - Query Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "ms", + "short" + ] + } + ], + "title": "PostgreSQL" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "leftLogBase": 1, + "leftMax": null, + "leftMin": 0, + "rightLogBase": 1, + "rightMax": null, + "rightMin": 0, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 9, + "isNew": true, + "leftYAxisLabel": "", + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "rightYAxisLabel": "", + "seriesOverrides": [ + { + "alias": "Concurrent transactions", + "yaxis": 2 + } + ], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum{query=\"insertFeatureVersion\", subquery=\"lock\"}[$rate])) / sum(rate(clair_pgsql_query_duration_milliseconds_count{query=\"insertFeatureVersion\", subquery=\"lock\"}[$rate]))", + "intervalFactor": 2, + "legendFormat": "insertFeatureVersion wait duration", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum{query=\"insertVulnerability\", subquery=\"lock\"}[$rate])) / sum(rate(clair_pgsql_query_duration_milliseconds_count{query=\"insertVulnerability\", subquery=\"lock\"}[$rate]))", + "intervalFactor": 2, + "legendFormat": "insertVulnerability wait duration", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(clair_pgsql_concurrent_lock_vafv_total)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Concurrent transactions", + "refId": "C", + "step": 2 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Lock VAFV", + "tooltip": { + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "x-axis": true, + "y-axis": true, + "y_formats": [ + "ms", + "short" + ] + } + ], + "title": "PostgreSQL" + } + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allFormat": "glob", + "auto": true, + "auto_count": 5, + "current": { + "tags": [], + "text": "15s", + "value": "15s" + }, + "datasource": null, + "hideLabel": false, + "includeAll": false, + "label": "Rate", + "multi": false, + "multiFormat": "glob", + "name": "rate", + "options": [ + { + "selected": false, + "text": "auto", + "value": "$__auto_interval" + }, + { + "selected": false, + "text": "5s", + "value": "5s" + }, + { + "selected": true, + "text": "15s", + "value": "15s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "3h", + "value": "3h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5s, 15s, 30s, 1m,5m,10m,30m,1h,3h,6h,12h,1d,7d,30d", + "refresh": false, + "type": "interval" + }, + { + "allFormat": "regex wildcard", + "current": { + "tags": [], + "text": "All", + "value": ".*" + }, + "datasource": null, + "includeAll": true, + "label": "Instance", + "multi": true, + "multiFormat": "glob", + "name": "instance", + "options": [ + { + "selected": true, + "text": "All", + "value": ".*" + }, + { + "selected": false, + "text": "192.168.99.1:6061", + "value": "192\\.168\\.99\\.1\\:6061" + } + ], + "query": "label_values(instance)", + "refresh": false, + "regex": "", + "type": "query", + "useTags": false + } + ] + }, + "annotations": { + "list": [] + }, + "refresh": "5s", + "schemaVersion": 8, + "version": 17, + "links": [] +} diff --git a/notifier/notifier.go b/notifier/notifier.go index 0590198d..081c76e2 100644 --- a/notifier/notifier.go +++ b/notifier/notifier.go @@ -42,10 +42,15 @@ var ( notifiers = make(map[string]Notifier) - promNotifierLatencySeconds = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "clair_notifier_latency_seconds", + promNotifierLatencyMilliseconds = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "clair_notifier_latency_milliseconds", Help: "Time it takes to send a notification after it's been created.", }) + + promNotifierBackendErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "clair_notifier_backend_errors_total", + Help: "Number of errors that notifier backends generated.", + }, []string{"backend"}) ) // Notifier represents anything that can transmit notifications. @@ -57,6 +62,11 @@ type Notifier interface { Send(notification database.VulnerabilityNotification) error } +func init() { + prometheus.MustRegister(promNotifierLatencyMilliseconds) + prometheus.MustRegister(promNotifierBackendErrorsTotal) +} + // RegisterNotifier makes a Fetcher available by the provided name. // If Register is called twice with the same name or if driver is nil, // it panics. @@ -114,7 +124,7 @@ func Run(config *config.NotifierConfig, datastore database.Datastore, stopper *u go func() { success, interrupted := handleTask(*notification, stopper, config.Attempts) if success { - promNotifierLatencySeconds.Set(float64(time.Now().Sub(notification.Created))) + utils.PrometheusObserveTimeMilliseconds(promNotifierLatencyMilliseconds, notification.Created) datastore.SetNotificationNotified(notification.Name) } if interrupted { @@ -188,6 +198,7 @@ func handleTask(notification database.VulnerabilityNotification, st *utils.Stopp // Send using the current notifier. if err := notifier.Send(notification); err != nil { // Send failed; increase attempts/backoff and retry. + promNotifierBackendErrorsTotal.WithLabelValues(notifierName).Inc() log.Errorf("could not send notification '%s' to notifier '%s': %s", notification.Name, notifierName, err) backOff = timeutil.ExpBackoff(backOff, maxBackOff) attempts++ diff --git a/updater/updater.go b/updater/updater.go index f1d5d2ef..06c24405 100644 --- a/updater/updater.go +++ b/updater/updater.go @@ -17,7 +17,6 @@ package updater import ( - "encoding/json" "math/rand" "strconv" "time" @@ -27,6 +26,7 @@ import ( "github.com/coreos/clair/utils" "github.com/coreos/pkg/capnslog" "github.com/pborman/uuid" + "github.com/prometheus/client_golang/prometheus" ) const ( @@ -38,7 +38,30 @@ const ( refreshLockDuration = time.Minute * 8 ) -var log = capnslog.NewPackageLogger("github.com/coreos/clair", "updater") +var ( + log = capnslog.NewPackageLogger("github.com/coreos/clair", "updater") + + promUpdaterErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "clair_updater_errors_total", + Help: "Numbers of errors that the updater generated.", + }) + + promUpdaterDurationSeconds = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "clair_updater_duration_seconds", + Help: "Time it takes to update the vulnerability database.", + }) + + promUpdaterNotesTotal = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "clair_updater_notes_total", + Help: "Number of notes that the vulnerability fetchers generated.", + }) +) + +func init() { + prometheus.MustRegister(promUpdaterErrorsTotal) + prometheus.MustRegister(promUpdaterDurationSeconds) + prometheus.MustRegister(promUpdaterNotesTotal) +} // Run updates the vulnerability database at regular intervals. func Run(config *config.UpdaterConfig, datastore database.Datastore, st *utils.Stopper) { @@ -126,6 +149,8 @@ func Run(config *config.UpdaterConfig, datastore database.Datastore, st *utils.S // Update fetches all the vulnerabilities from the registered fetchers, upserts // them into the database and then sends notifications. func Update(datastore database.Datastore) { + defer setUpdaterDuration(time.Now()) + log.Info("updating vulnerabilities") // Fetch updates. @@ -137,26 +162,35 @@ func Update(datastore database.Datastore) { log.Tracef("beginning insertion of %d vulnerabilities for update", len(vulnerabilities)) err := datastore.InsertVulnerabilities(vulnerabilities) if err != nil { + promUpdaterErrorsTotal.Inc() log.Errorf("an error occured when inserting vulnerabilities for update: %s", err) return } vulnerabilities = nil - // Update flags and notes. + // Update flags. for flagName, flagValue := range flags { datastore.InsertKeyValue(flagName, flagValue) } - bnotes, _ := json.Marshal(notes) - datastore.InsertKeyValue(notesFlagName, string(bnotes)) + // Log notes. + for _, note := range notes { + log.Warningf("fetcher note: %s", note) + } + promUpdaterNotesTotal.Set(float64(len(notes))) // Update last successful update if every fetchers worked properly. if status { datastore.InsertKeyValue(flagName, strconv.FormatInt(time.Now().UTC().Unix(), 10)) } + log.Info("update finished") } +func setUpdaterDuration(start time.Time) { + promUpdaterDurationSeconds.Set(time.Since(start).Seconds()) +} + // fetch get data from the registered fetchers, in parallel. func fetch(datastore database.Datastore) (bool, []database.Vulnerability, map[string]string, []string) { var vulnerabilities []database.Vulnerability @@ -170,6 +204,7 @@ func fetch(datastore database.Datastore) (bool, []database.Vulnerability, map[st go func(name string, fetcher Fetcher) { response, err := fetcher.FetchUpdate(datastore) if err != nil { + promUpdaterErrorsTotal.Inc() log.Errorf("an error occured when fetching update '%s': %s.", name, err) status = false responseC <- nil @@ -204,10 +239,3 @@ func getLastUpdate(datastore database.Datastore) time.Time { } return time.Time{} } - -func getNotes(datastore database.Datastore) (notes []string) { - if jsonNotes, err := datastore.GetKeyValue(notesFlagName); err == nil && jsonNotes != "" { - json.Unmarshal([]byte(jsonNotes), notes) - } - return -} diff --git a/utils/prometheus.go b/utils/prometheus.go new file mode 100644 index 00000000..decd1a9e --- /dev/null +++ b/utils/prometheus.go @@ -0,0 +1,13 @@ +package utils + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// PrometheusObserveTimeMilliseconds observes the elapsed time since start, in milliseconds, +// on the specified Prometheus Histogram. +func PrometheusObserveTimeMilliseconds(h prometheus.Histogram, start time.Time) { + h.Observe(float64(time.Since(start).Nanoseconds()) / float64(time.Millisecond)) +}