prometheus: add initial Prometheus support

This commit is contained in:
Quentin Machu 2016-01-23 22:02:34 -05:00 committed by Jimmy Zelinskie
parent ad0531acc7
commit baed60e19b
12 changed files with 1048 additions and 28 deletions

View File

@ -16,6 +16,7 @@ package pgsql
import (
"database/sql"
"time"
"github.com/coreos/clair/database"
cerrors "github.com/coreos/clair/utils/errors"
@ -27,15 +28,19 @@ func (pgSQL *pgSQL) insertFeature(feature database.Feature) (int, error) {
return 0, cerrors.NewBadRequestError("could not find/insert invalid Feature")
}
// Do cache lookup.
if pgSQL.cache != nil {
promCacheQueriesTotal.WithLabelValues("feature").Inc()
promCacheQueriesTotal.WithLabelValues("feature").Inc()
id, found := pgSQL.cache.Get("feature:" + feature.Namespace.Name + ":" + feature.Name)
if found {
promCacheHitsTotal.WithLabelValues("feature").Inc()
promCacheHitsTotal.WithLabelValues("feature").Inc()
return id.(int), nil
}
}
// We do `defer observeQueryTime` here because we don't want to observe cached features.
defer observeQueryTime("insertFeature", "all", time.Now())
// Find or create Namespace.
namespaceID, err := pgSQL.insertNamespace(feature.Namespace)
if err != nil {
@ -61,21 +66,29 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion)
return 0, cerrors.NewBadRequestError("could not find/insert invalid FeatureVersion")
}
// Do cache lookup.
if pgSQL.cache != nil {
promCacheQueriesTotal.WithLabelValues("featureversion").Inc()
promCacheQueriesTotal.WithLabelValues("featureversion").Inc()
id, found := pgSQL.cache.Get("featureversion:" + featureVersion.Feature.Namespace.Name + ":" +
featureVersion.Feature.Name + ":" + featureVersion.Version.String())
if found {
promCacheHitsTotal.WithLabelValues("featureversion").Inc()
promCacheHitsTotal.WithLabelValues("featureversion").Inc()
return id.(int), nil
}
}
// We do `defer observeQueryTime` here because we don't want to observe cached featureversions.
defer observeQueryTime("insertFeatureVersion", "all", time.Now())
// Find or create Feature first.
t := time.Now()
featureID, err := pgSQL.insertFeature(featureVersion.Feature)
observeQueryTime("insertFeatureVersion", "insertFeature", t)
if err != nil {
return 0, err
}
featureVersion.Feature.ID = featureID
// Begin transaction.
@ -85,10 +98,14 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion)
return 0, handleError("insertFeatureVersion.Begin()", err)
}
// Set transaction as SERIALIZABLE.
// This is how we ensure that the data in Vulnerability_Affects_FeatureVersion is always
// consistent.
// Lock Vulnerability_Affects_FeatureVersion exclusively.
// We want to prevent InsertVulnerability to modify it.
promConcurrentLockVAFV.Inc()
defer promConcurrentLockVAFV.Dec()
t = time.Now()
_, err = tx.Exec(getQuery("l_vulnerability_affects_featureversion"))
observeQueryTime("insertFeatureVersion", "lock", t)
if err != nil {
tx.Rollback()
return 0, handleError("insertFeatureVersion.l_vulnerability_affects_featureversion", err)
@ -96,12 +113,17 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion)
// Find or create FeatureVersion.
var newOrExisting string
t = time.Now()
err = tx.QueryRow(getQuery("soi_featureversion"), featureID, &featureVersion.Version).
Scan(&newOrExisting, &featureVersion.ID)
observeQueryTime("insertFeatureVersion", "soi_featureversion", t)
if err != nil {
tx.Rollback()
return 0, handleError("soi_featureversion", err)
}
if newOrExisting == "exi" {
// That featureVersion already exists, return its id.
tx.Commit()
@ -110,7 +132,10 @@ func (pgSQL *pgSQL) insertFeatureVersion(featureVersion database.FeatureVersion)
// Link the new FeatureVersion with every vulnerabilities that affect it, by inserting in
// Vulnerability_Affects_FeatureVersion.
t = time.Now()
err = linkFeatureVersionToVulnerabilities(tx, featureVersion)
observeQueryTime("insertFeatureVersion", "linkFeatureVersionToVulnerabilities", t)
if err != nil {
tx.Rollback()
return 0, err

View File

@ -16,6 +16,7 @@ package pgsql
import (
"database/sql"
"time"
cerrors "github.com/coreos/clair/utils/errors"
)
@ -27,6 +28,8 @@ func (pgSQL *pgSQL) InsertKeyValue(key, value string) (err error) {
return cerrors.NewBadRequestError("could not insert a flag which has an empty name or value")
}
defer observeQueryTime("InsertKeyValue", "all", time.Now())
// Upsert.
//
// Note: UPSERT works only on >= PostgreSQL 9.5 which is not yet supported by AWS RDS.
@ -64,6 +67,8 @@ func (pgSQL *pgSQL) InsertKeyValue(key, value string) (err error) {
// GetValue reads a single key / value tuple and returns an empty string if the key doesn't exist.
func (pgSQL *pgSQL) GetKeyValue(key string) (string, error) {
defer observeQueryTime("GetKeyValue", "all", time.Now())
var value string
err := pgSQL.QueryRow(getQuery("s_keyvalue"), key).Scan(&value)

View File

@ -16,6 +16,7 @@ package pgsql
import (
"database/sql"
"time"
"github.com/coreos/clair/database"
"github.com/coreos/clair/utils"
@ -24,6 +25,14 @@ import (
)
func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities bool) (database.Layer, error) {
subquery := "all"
if withFeatures {
subquery += "/features"
} else if withVulnerabilities {
subquery += "/features+vulnerabilities"
}
defer observeQueryTime("FindLayer", subquery, time.Now())
// Find the layer
var layer database.Layer
var parentID zero.Int
@ -31,9 +40,12 @@ func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities boo
var namespaceID zero.Int
var namespaceName sql.NullString
t := time.Now()
err := pgSQL.QueryRow(getQuery("s_layer"), name).
Scan(&layer.ID, &layer.Name, &layer.EngineVersion, &parentID, &parentName, &namespaceID,
&namespaceName)
observeQueryTime("FindLayer", "s_layer", t)
if err != nil {
return layer, handleError("s_layer", err)
}
@ -53,15 +65,22 @@ func (pgSQL *pgSQL) FindLayer(name string, withFeatures, withVulnerabilities boo
// Find its features
if withFeatures || withVulnerabilities {
t = time.Now()
featureVersions, err := pgSQL.getLayerFeatureVersions(layer.ID)
observeQueryTime("FindLayer", "getLayerFeatureVersions", t)
if err != nil {
return layer, err
}
layer.Features = featureVersions
if withVulnerabilities {
// Load the vulnerabilities that affect the FeatureVersions.
t = time.Now()
err := pgSQL.loadAffectedBy(layer.Features)
observeQueryTime("FindLayer", "loadAffectedBy", t)
if err != nil {
return layer, err
}
@ -183,6 +202,8 @@ func (pgSQL *pgSQL) loadAffectedBy(featureVersions []database.FeatureVersion) er
// been modified.
// TODO(Quentin-M): This behavior should be implemented at the Feature detectors level.
func (pgSQL *pgSQL) InsertLayer(layer database.Layer) error {
tf := time.Now()
// Verify parameters
if layer.Name == "" {
log.Warning("could not insert a layer which has an empty Name")
@ -202,6 +223,9 @@ func (pgSQL *pgSQL) InsertLayer(layer database.Layer) error {
layer.ID = existingLayer.ID
}
// We do `defer observeQueryTime` here because we don't want to observe existing layers.
defer observeQueryTime("InsertLayer", "all", tf)
// Get parent ID.
var parentID zero.Int
if layer.Parent != nil {
@ -346,6 +370,8 @@ func createNV(features []database.FeatureVersion) (map[string]*database.FeatureV
}
func (pgSQL *pgSQL) DeleteLayer(name string) error {
defer observeQueryTime("DeleteLayer", "all", time.Now())
result, err := pgSQL.Exec(getQuery("r_layer"), name)
if err != nil {
return handleError("r_layer", err)

View File

@ -30,6 +30,8 @@ func (pgSQL *pgSQL) Lock(name string, owner string, duration time.Duration, rene
return false, time.Time{}
}
defer observeQueryTime("Lock", "all", time.Now())
// Prune locks.
pgSQL.pruneLocks()
@ -68,6 +70,8 @@ func (pgSQL *pgSQL) Unlock(name, owner string) {
return
}
defer observeQueryTime("Unlock", "all", time.Now())
pgSQL.Exec(getQuery("r_lock"), name, owner)
}
@ -79,6 +83,8 @@ func (pgSQL *pgSQL) FindLock(name string) (string, time.Time, error) {
return "", time.Time{}, cerrors.NewBadRequestError("could not find an invalid lock")
}
defer observeQueryTime("FindLock", "all", time.Now())
var owner string
var until time.Time
err := pgSQL.QueryRow(getQuery("f_lock"), name).Scan(&owner, &until)
@ -91,6 +97,8 @@ func (pgSQL *pgSQL) FindLock(name string) (string, time.Time, error) {
// pruneLocks removes every expired locks from the database
func (pgSQL *pgSQL) pruneLocks() {
defer observeQueryTime("pruneLocks", "all", time.Now())
if _, err := pgSQL.Exec(getQuery("r_lock_expired")); err != nil {
handleError("r_lock_expired", err)
}

View File

@ -15,6 +15,8 @@
package pgsql
import (
"time"
"github.com/coreos/clair/database"
cerrors "github.com/coreos/clair/utils/errors"
)
@ -32,6 +34,9 @@ func (pgSQL *pgSQL) insertNamespace(namespace database.Namespace) (int, error) {
}
}
// We do `defer observeQueryTime` here because we don't want to observe cached namespaces.
defer observeQueryTime("insertNamespace", "all", time.Now())
var id int
err := pgSQL.QueryRow(getQuery("soi_namespace"), namespace.Name).Scan(&id)
if err != nil {

View File

@ -13,6 +13,8 @@ import (
// do it in tx so we won't insert/update a vuln without notification and vice-versa.
// name and created doesn't matter.
func (pgSQL *pgSQL) insertNotification(tx *sql.Tx, notification database.VulnerabilityNotification) error {
defer observeQueryTime("insertNotification", "all", time.Now())
// Marshal old and new Vulnerabilities.
oldVulnerability, err := json.Marshal(notification.OldVulnerability)
if err != nil {
@ -38,6 +40,8 @@ func (pgSQL *pgSQL) insertNotification(tx *sql.Tx, notification database.Vulnera
// Get one available notification name (!locked && !deleted && (!notified || notified_but_timed-out)).
// Does not fill new/old vuln.
func (pgSQL *pgSQL) GetAvailableNotification(renotifyInterval time.Duration) (database.VulnerabilityNotification, error) {
defer observeQueryTime("GetAvailableNotification", "all", time.Now())
before := time.Now().Add(-renotifyInterval)
var notification database.VulnerabilityNotification
@ -51,6 +55,8 @@ func (pgSQL *pgSQL) GetAvailableNotification(renotifyInterval time.Duration) (da
}
func (pgSQL *pgSQL) GetNotification(name string, limit, page int) (database.VulnerabilityNotification, error) {
defer observeQueryTime("GetNotification", "all", time.Now())
// Get Notification.
var notification database.VulnerabilityNotification
var oldVulnerability []byte
@ -74,11 +80,14 @@ func (pgSQL *pgSQL) GetNotification(name string, limit, page int) (database.Vuln
}
// TODO(Quentin-M): Fill LayersIntroducingVulnerability.
// And time it.
return notification, nil
}
func (pgSQL *pgSQL) SetNotificationNotified(name string) error {
defer observeQueryTime("SetNotificationNotified", "all", time.Now())
if _, err := pgSQL.Exec(getQuery("u_notification_notified"), name); err != nil {
return handleError("u_notification_notified", err)
}
@ -86,6 +95,8 @@ func (pgSQL *pgSQL) SetNotificationNotified(name string) error {
}
func (pgSQL *pgSQL) DeleteNotification(name string) error {
defer observeQueryTime("DeleteNotification", "all", time.Now())
result, err := pgSQL.Exec(getQuery("r_notification"), name)
if err != nil {
return handleError("r_notification", err)

View File

@ -22,10 +22,12 @@ import (
"path"
"runtime"
"strings"
"time"
"bitbucket.org/liamstask/goose/lib/goose"
"github.com/coreos/clair/config"
"github.com/coreos/clair/database"
"github.com/coreos/clair/utils"
cerrors "github.com/coreos/clair/utils/errors"
"github.com/coreos/pkg/capnslog"
"github.com/hashicorp/golang-lru"
@ -39,23 +41,36 @@ var (
promErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "clair_pgsql_errors_total",
Help: "Number of errors that PostgreSQL requests generates.",
Help: "Number of errors that PostgreSQL requests generated.",
}, []string{"request"})
promCacheHitsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "clair_pgsql_cache_hits_total",
Help: "Number of cache hits that the PostgreSQL backend does.",
Help: "Number of cache hits that the PostgreSQL backend did.",
}, []string{"object"})
promCacheQueriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "clair_pgsql_cache_queries_total",
Help: "Number of cache queries that the PostgreSQL backend does.",
Help: "Number of cache queries that the PostgreSQL backend did.",
}, []string{"object"})
promQueryDurationMilliseconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "clair_pgsql_query_duration_milliseconds",
Help: "Time it takes to execute the database query.",
}, []string{"query", "subquery"})
promConcurrentLockVAFV = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "clair_pgsql_concurrent_lock_vafv_total",
Help: "Number of transactions trying to hold the exclusive Vulnerability_Affects_FeatureVersion lock.",
})
)
func init() {
prometheus.MustRegister(promErrorsTotal)
prometheus.MustRegister(promCacheHitsTotal)
prometheus.MustRegister(promCacheQueriesTotal)
prometheus.MustRegister(promQueryDurationMilliseconds)
prometheus.MustRegister(promConcurrentLockVAFV)
}
type pgSQL struct {
@ -240,3 +255,7 @@ func isErrUniqueViolation(err error) bool {
pqErr, ok := err.(*pq.Error)
return ok && pqErr.Code == "23505"
}
func observeQueryTime(query, subquery string, start time.Time) {
utils.PrometheusObserveTimeMilliseconds(promQueryDurationMilliseconds.WithLabelValues(query, subquery), start)
}

View File

@ -17,6 +17,7 @@ package pgsql
import (
"database/sql"
"fmt"
"time"
"github.com/coreos/clair/database"
"github.com/coreos/clair/utils"
@ -26,6 +27,8 @@ import (
)
func (pgSQL *pgSQL) FindVulnerability(namespaceName, name string) (database.Vulnerability, error) {
defer observeQueryTime("FindVulnerability", "all", time.Now())
vulnerability := database.Vulnerability{
Name: name,
Namespace: database.Namespace{
@ -92,6 +95,8 @@ func (pgSQL *pgSQL) InsertVulnerabilities(vulnerabilities []database.Vulnerabili
}
func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) error {
tf := time.Now()
// Verify parameters
if vulnerability.Name == "" || len(vulnerability.FixedIn) == 0 ||
vulnerability.Namespace.Name == "" || !vulnerability.Severity.IsValid() {
@ -142,6 +147,10 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er
}
}
// We do `defer observeQueryTime` here because we don't want to observe existing & up-to-date
// vulnerabilities.
defer observeQueryTime("insertVulnerability", "all", tf)
// Insert or find the new Features.
// We already have the Feature IDs in updatedFixedInFeatureVersions because diffFixedIn fills them
// in using the existing vulnerability's FixedIn FeatureVersions. Note that even if FixedIn
@ -166,10 +175,14 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er
return handleError("insertVulnerability.Begin()", err)
}
// Set transaction as SERIALIZABLE.
// This is how we ensure that the data in Vulnerability_Affects_FeatureVersion is always
// consistent.
// Lock Vulnerability_Affects_FeatureVersion exclusively.
// We want to prevent InsertFeatureVersion to modify it.
promConcurrentLockVAFV.Inc()
defer promConcurrentLockVAFV.Dec()
t := time.Now()
_, err = tx.Exec(getQuery("l_vulnerability_affects_featureversion"))
observeQueryTime("insertVulnerability", "lock", t)
if err != nil {
tx.Rollback()
return handleError("insertVulnerability.l_vulnerability_affects_featureversion", err)
@ -206,7 +219,10 @@ func (pgSQL *pgSQL) insertVulnerability(vulnerability database.Vulnerability) er
}
// Update Vulnerability_FixedIn_Feature and Vulnerability_Affects_FeatureVersion now.
t = time.Now()
err = pgSQL.updateVulnerabilityFeatureVersions(tx, &vulnerability, &existingVulnerability, newFixedInFeatureVersions, updatedFixedInFeatureVersions)
observeQueryTime("insertVulnerability", "updateVulnerabilityFeatureVersions", t)
if err != nil {
tx.Rollback()
return err
@ -371,6 +387,8 @@ func linkVulnerabilityToFeatureVersions(tx *sql.Tx, fixedInID, vulnerabilityID,
}
func (pgSQL *pgSQL) DeleteVulnerability(namespaceName, name string) error {
defer observeQueryTime("DeleteVulnerability", "all", time.Now())
result, err := pgSQL.Exec(getQuery("r_vulnerability"), namespaceName, name)
if err != nil {
return handleError("r_vulnerability", err)

851
grafana.json Normal file
View File

@ -0,0 +1,851 @@
{
"id": 1,
"title": "Clair",
"originalTitle": "Clair",
"tags": [],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": false,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"leftYAxisLabel": "",
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(clair_updater_duration_seconds)",
"hide": false,
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "Updater - duration",
"metric": "",
"refId": "A",
"step": 120
}
],
"timeFrom": null,
"timeShift": null,
"title": "Updater - Duration",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"transparent": false,
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"s",
"short"
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": null,
"editable": true,
"error": false,
"format": "none",
"id": 3,
"interval": null,
"isNew": true,
"links": [],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"targets": [
{
"expr": "clair_updater_notes_total",
"intervalFactor": 2,
"legendFormat": "",
"metric": "clair_updater_notes_total",
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "Updater - Number of Notes",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "avg"
},
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_updater_errors_total[$rate]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Updater - Error rate",
"metric": "clair_updater_errors_total",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Updater - Error rate",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"showTitle": true,
"title": "Updater"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 8,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_notifier_latency_seconds_sum[$rate]))/sum(rate(clair_notifier_latency_seconds_count[$rate]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Notifier - Latency",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Notifier - Latency",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"ms",
"short"
]
},
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 5,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_notifier_backend_errors_total[$rate]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Notifier - Error rate",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Notifier - Error rate",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"showTitle": true,
"title": "Notifier"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": 100,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 7,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 8,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_pgsql_cache_hits_total[$rate])) by (object) / sum(rate(clair_pgsql_cache_queries_total[$rate])) by (object) * 100",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{object}}",
"metric": "clair_pgsql_cache_queries_total",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "PostgreSQL - Cache Hit Rate",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"percent",
"short"
]
},
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 6,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 4,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_pgsql_errors_total[$rate]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "PostgreSQL - Error rate",
"metric": "clair_updater_errors_total",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "PostgreSQL - Error rate",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"showTitle": true,
"title": "PostgreSQL"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 8,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum[$rate])) by (query, subquery) / sum(rate(clair_pgsql_query_duration_milliseconds_count[$rate])) by (query, subquery)",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{query}} - {{subquery}}",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "PostgreSQL - Query Duration",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"ms",
"short"
]
}
],
"title": "PostgreSQL"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"leftLogBase": 1,
"leftMax": null,
"leftMin": 0,
"rightLogBase": 1,
"rightMax": null,
"rightMin": 0,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 9,
"isNew": true,
"leftYAxisLabel": "",
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"rightYAxisLabel": "",
"seriesOverrides": [
{
"alias": "Concurrent transactions",
"yaxis": 2
}
],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum{query=\"insertFeatureVersion\", subquery=\"lock\"}[$rate])) / sum(rate(clair_pgsql_query_duration_milliseconds_count{query=\"insertFeatureVersion\", subquery=\"lock\"}[$rate]))",
"intervalFactor": 2,
"legendFormat": "insertFeatureVersion wait duration",
"refId": "A",
"step": 2
},
{
"expr": "sum(rate(clair_pgsql_query_duration_milliseconds_sum{query=\"insertVulnerability\", subquery=\"lock\"}[$rate])) / sum(rate(clair_pgsql_query_duration_milliseconds_count{query=\"insertVulnerability\", subquery=\"lock\"}[$rate]))",
"intervalFactor": 2,
"legendFormat": "insertVulnerability wait duration",
"refId": "B",
"step": 2
},
{
"expr": "sum(clair_pgsql_concurrent_lock_vafv_total)",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "Concurrent transactions",
"refId": "C",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Lock VAFV",
"tooltip": {
"shared": true,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"ms",
"short"
]
}
],
"title": "PostgreSQL"
}
],
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"now": true,
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": [
{
"allFormat": "glob",
"auto": true,
"auto_count": 5,
"current": {
"tags": [],
"text": "15s",
"value": "15s"
},
"datasource": null,
"hideLabel": false,
"includeAll": false,
"label": "Rate",
"multi": false,
"multiFormat": "glob",
"name": "rate",
"options": [
{
"selected": false,
"text": "auto",
"value": "$__auto_interval"
},
{
"selected": false,
"text": "5s",
"value": "5s"
},
{
"selected": true,
"text": "15s",
"value": "15s"
},
{
"selected": false,
"text": "30s",
"value": "30s"
},
{
"selected": false,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "5m",
"value": "5m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "3h",
"value": "3h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "5s, 15s, 30s, 1m,5m,10m,30m,1h,3h,6h,12h,1d,7d,30d",
"refresh": false,
"type": "interval"
},
{
"allFormat": "regex wildcard",
"current": {
"tags": [],
"text": "All",
"value": ".*"
},
"datasource": null,
"includeAll": true,
"label": "Instance",
"multi": true,
"multiFormat": "glob",
"name": "instance",
"options": [
{
"selected": true,
"text": "All",
"value": ".*"
},
{
"selected": false,
"text": "192.168.99.1:6061",
"value": "192\\.168\\.99\\.1\\:6061"
}
],
"query": "label_values(instance)",
"refresh": false,
"regex": "",
"type": "query",
"useTags": false
}
]
},
"annotations": {
"list": []
},
"refresh": "5s",
"schemaVersion": 8,
"version": 17,
"links": []
}

View File

@ -42,10 +42,15 @@ var (
notifiers = make(map[string]Notifier)
promNotifierLatencySeconds = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "clair_notifier_latency_seconds",
promNotifierLatencyMilliseconds = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "clair_notifier_latency_milliseconds",
Help: "Time it takes to send a notification after it's been created.",
})
promNotifierBackendErrorsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "clair_notifier_backend_errors_total",
Help: "Number of errors that notifier backends generated.",
}, []string{"backend"})
)
// Notifier represents anything that can transmit notifications.
@ -57,6 +62,11 @@ type Notifier interface {
Send(notification database.VulnerabilityNotification) error
}
func init() {
prometheus.MustRegister(promNotifierLatencyMilliseconds)
prometheus.MustRegister(promNotifierBackendErrorsTotal)
}
// RegisterNotifier makes a Fetcher available by the provided name.
// If Register is called twice with the same name or if driver is nil,
// it panics.
@ -114,7 +124,7 @@ func Run(config *config.NotifierConfig, datastore database.Datastore, stopper *u
go func() {
success, interrupted := handleTask(*notification, stopper, config.Attempts)
if success {
promNotifierLatencySeconds.Set(float64(time.Now().Sub(notification.Created)))
utils.PrometheusObserveTimeMilliseconds(promNotifierLatencyMilliseconds, notification.Created)
datastore.SetNotificationNotified(notification.Name)
}
if interrupted {
@ -188,6 +198,7 @@ func handleTask(notification database.VulnerabilityNotification, st *utils.Stopp
// Send using the current notifier.
if err := notifier.Send(notification); err != nil {
// Send failed; increase attempts/backoff and retry.
promNotifierBackendErrorsTotal.WithLabelValues(notifierName).Inc()
log.Errorf("could not send notification '%s' to notifier '%s': %s", notification.Name, notifierName, err)
backOff = timeutil.ExpBackoff(backOff, maxBackOff)
attempts++

View File

@ -17,7 +17,6 @@
package updater
import (
"encoding/json"
"math/rand"
"strconv"
"time"
@ -27,6 +26,7 @@ import (
"github.com/coreos/clair/utils"
"github.com/coreos/pkg/capnslog"
"github.com/pborman/uuid"
"github.com/prometheus/client_golang/prometheus"
)
const (
@ -38,7 +38,30 @@ const (
refreshLockDuration = time.Minute * 8
)
var log = capnslog.NewPackageLogger("github.com/coreos/clair", "updater")
var (
log = capnslog.NewPackageLogger("github.com/coreos/clair", "updater")
promUpdaterErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "clair_updater_errors_total",
Help: "Numbers of errors that the updater generated.",
})
promUpdaterDurationSeconds = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "clair_updater_duration_seconds",
Help: "Time it takes to update the vulnerability database.",
})
promUpdaterNotesTotal = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "clair_updater_notes_total",
Help: "Number of notes that the vulnerability fetchers generated.",
})
)
func init() {
prometheus.MustRegister(promUpdaterErrorsTotal)
prometheus.MustRegister(promUpdaterDurationSeconds)
prometheus.MustRegister(promUpdaterNotesTotal)
}
// Run updates the vulnerability database at regular intervals.
func Run(config *config.UpdaterConfig, datastore database.Datastore, st *utils.Stopper) {
@ -126,6 +149,8 @@ func Run(config *config.UpdaterConfig, datastore database.Datastore, st *utils.S
// Update fetches all the vulnerabilities from the registered fetchers, upserts
// them into the database and then sends notifications.
func Update(datastore database.Datastore) {
defer setUpdaterDuration(time.Now())
log.Info("updating vulnerabilities")
// Fetch updates.
@ -137,26 +162,35 @@ func Update(datastore database.Datastore) {
log.Tracef("beginning insertion of %d vulnerabilities for update", len(vulnerabilities))
err := datastore.InsertVulnerabilities(vulnerabilities)
if err != nil {
promUpdaterErrorsTotal.Inc()
log.Errorf("an error occured when inserting vulnerabilities for update: %s", err)
return
}
vulnerabilities = nil
// Update flags and notes.
// Update flags.
for flagName, flagValue := range flags {
datastore.InsertKeyValue(flagName, flagValue)
}
bnotes, _ := json.Marshal(notes)
datastore.InsertKeyValue(notesFlagName, string(bnotes))
// Log notes.
for _, note := range notes {
log.Warningf("fetcher note: %s", note)
}
promUpdaterNotesTotal.Set(float64(len(notes)))
// Update last successful update if every fetchers worked properly.
if status {
datastore.InsertKeyValue(flagName, strconv.FormatInt(time.Now().UTC().Unix(), 10))
}
log.Info("update finished")
}
func setUpdaterDuration(start time.Time) {
promUpdaterDurationSeconds.Set(time.Since(start).Seconds())
}
// fetch get data from the registered fetchers, in parallel.
func fetch(datastore database.Datastore) (bool, []database.Vulnerability, map[string]string, []string) {
var vulnerabilities []database.Vulnerability
@ -170,6 +204,7 @@ func fetch(datastore database.Datastore) (bool, []database.Vulnerability, map[st
go func(name string, fetcher Fetcher) {
response, err := fetcher.FetchUpdate(datastore)
if err != nil {
promUpdaterErrorsTotal.Inc()
log.Errorf("an error occured when fetching update '%s': %s.", name, err)
status = false
responseC <- nil
@ -204,10 +239,3 @@ func getLastUpdate(datastore database.Datastore) time.Time {
}
return time.Time{}
}
func getNotes(datastore database.Datastore) (notes []string) {
if jsonNotes, err := datastore.GetKeyValue(notesFlagName); err == nil && jsonNotes != "" {
json.Unmarshal([]byte(jsonNotes), notes)
}
return
}

13
utils/prometheus.go Normal file
View File

@ -0,0 +1,13 @@
package utils
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
// PrometheusObserveTimeMilliseconds observes the elapsed time since start, in milliseconds,
// on the specified Prometheus Histogram.
func PrometheusObserveTimeMilliseconds(h prometheus.Histogram, start time.Time) {
h.Observe(float64(time.Since(start).Nanoseconds()) / float64(time.Millisecond))
}