Refactor metrics provider interface for time series. (#8989)

This commit is contained in:
Paul Banks 2020-10-20 16:41:16 +01:00 committed by GitHub
parent da4ec9ff27
commit 4f1c13b38a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 366 additions and 155 deletions

View File

@ -16,7 +16,7 @@ export default class TopologyMetrics extends Component {
constructor(owner, args) {
super(owner, args);
this.hasMetricsProvider = !!this.cfg.get().metrics_provider
this.hasMetricsProvider = !!this.cfg.get().metrics_provider;
}
// =methods

View File

@ -4,6 +4,10 @@
{{on-window 'resize' (action 'redraw')}}
{{#if data.labels}}
<a class="sparkline-key-link" {{action (mut shouldShowKey) true}}>Key</a>
{{/if}}
<div class="sparkline-wrapper">
<div class="tooltip">
<div class="sparkline-time">Timestamp</div>
@ -12,3 +16,36 @@
<svg class="sparkline"></svg>
</div>
{{#if shouldShowKey}}
<ModalDialog
class="sparkline-key"
@onclose={{action (mut shouldShowKey) false}}
as |modal|>
<BlockSlot @name="header">
<h3>Metrics Key</h3>
</BlockSlot>
<BlockSlot @name="body">
<div class="sparkline-key-content">
<p>This key describes the metrics corresponding to the graph tooltip labels in more detail.</p>
<dl>
{{#each-in data.labels as |label desc| }}
<dt>{{label}}</dt>
<dd>{{{desc}}}</dd>
{{/each-in}}
</dl>
{{#unless data.labels}}
<span class="no-data">No metrics loaded.</span>
{{/unless}}
</div>
</BlockSlot>
<BlockSlot @name="actions">
<button
type="button"
class="type-cancel"
onclick={{action modal.close}}
>
Close
</button>
</BlockSlot>
</ModalDialog>
{{/if}}

View File

@ -27,14 +27,15 @@ export default Component.extend({
this.drawGraphs();
},
change: function(evt) {
this.data = evt.data;
this.set('data', evt.data.series);
this.element.querySelector('.sparkline-loader').style.display = 'none';
this.drawGraphs();
this.rerender();
},
},
drawGraphs: function() {
if (!this.data.series) {
if (!this.data) {
return;
}
@ -50,13 +51,13 @@ export default Component.extend({
// To be safe, filter any series that actually have no data points. This can
// happen thanks to our current provider contract allowing empty arrays for
// series data if there is no value.
//
// TODO(banks): switch series provider data to be a single array with series
// values as properties as we need below to enforce sensible alignment of
// timestamps and explicit summing expectations.
let series = ((this.data || {}).series || []).filter(s => s.data.length > 0);
let maybeData = this.data || {};
let series = maybeData.data || [];
let labels = maybeData.labels || {};
let unitSuffix = maybeData.unitSuffix || '';
let keys = Object.keys(labels).filter(l => l != 'Total');
if (series.length == 0) {
if (series.length == 0 || keys.length == 0) {
// Put the graph in an error state that might get fixed if metrics show up
// on next poll.
let loader = this.element.querySelector('.sparkline-loader');
@ -65,32 +66,26 @@ export default Component.extend({
return;
}
// Fill the timestamps for x axis.
let data = series[0].data.map(d => {
return { time: d[0] };
});
let keys = [];
// Initialize zeros
let summed = this.data.series[0].data.map(d => 0);
for (var i = 0; i < series.length; i++) {
let s = series[i];
// Attach the value as a new field to the data grid.
s.data.map((d, idx) => {
data[idx][s.label] = d[1];
summed[idx] += d[1];
});
keys.push(s.label);
}
let st = stack()
.keys(keys)
.order(stackOrderReverse);
let stackData = st(data);
let stackData = st(series);
// Sum all of the values for each point to get max range. Technically
// stackData contains this but I didn't find reliable documentation on
// whether we can rely on the highest stacked area to always be first/last
// in array etc. so this is simpler.
// Total of all stacked values at each point. A missing or non-numeric value
// for a label counts as 0 so that a single gap in one series does not turn
// the whole total for that point into NaN.
let summed = series.map(d =>
  keys.reduce((sum, l) => {
    let v = d[l];
    return typeof v === 'number' && !isNaN(v) ? sum + v : sum;
  }, 0)
);
let x = scaleTime()
.domain(extent(data, d => d.time))
.domain(extent(series, d => d.time))
.range([0, w]);
let y = scaleLinear()
@ -126,6 +121,7 @@ export default Component.extend({
let tooltip = select(this.element.querySelector('.tooltip'));
tooltip.selectAll('.sparkline-tt-legend').remove();
tooltip.selectAll('.sparkline-tt-sum').remove();
for (var k of keys) {
let legend = tooltip.append('div').attr('class', 'sparkline-tt-legend');
@ -137,13 +133,24 @@ export default Component.extend({
legend
.append('span')
.text(k + ': ')
.text(k)
.append('span')
.attr('class', 'sparkline-tt-legend-value');
}
let tipVals = tooltip.selectAll('.sparkline-tt-legend-value');
// Add a label for the summed value
if (keys.length > 1) {
tooltip
.append('div')
.attr('class', 'sparkline-tt-sum')
.append('span')
.text('Total')
.append('span')
.attr('class', 'sparkline-tt-sum-value');
}
let self = this;
svg
.on('mouseover', function(e) {
@ -152,10 +159,30 @@ export default Component.extend({
// We update here since we might redraw the graph with user's cursor
// stationary over it. If that happens mouseover fires but not
// mousemove but the tooltip and cursor are wrong (based on old data).
self.updateTooltip(e, data, stackData, keys, x, tooltip, tipVals, cursor);
self.updateTooltip(
e,
series,
stackData,
summed,
unitSuffix,
x,
tooltip,
tipVals,
cursor
);
})
.on('mousemove', function(e, d, i) {
self.updateTooltip(e, data, stackData, keys, x, tooltip, tipVals, cursor);
.on('mousemove', function(e) {
self.updateTooltip(
e,
series,
stackData,
summed,
unitSuffix,
x,
tooltip,
tipVals,
cursor
);
})
.on('mouseout', function(e) {
tooltip.style('visibility', 'hidden');
@ -168,7 +195,17 @@ export default Component.extend({
this.svg.on('mouseover mousemove mouseout', null);
}
},
updateTooltip: function(e, data, stackData, keys, x, tooltip, tipVals, cursor) {
updateTooltip: function(
e,
series,
stackData,
summed,
unitSuffix,
x,
tooltip,
tipVals,
cursor
) {
let [mouseX] = pointer(e);
cursor.attr('x', mouseX);
@ -176,7 +213,7 @@ export default Component.extend({
var bisectTime = bisector(function(d) {
return d.time;
}).left;
let tipIdx = bisectTime(data, mouseTime);
let tipIdx = bisectTime(series, mouseTime);
tooltip
// 22 px is the correction to align the arrow on the tool tip with
@ -185,23 +222,15 @@ export default Component.extend({
.select('.sparkline-time')
.text(niceTimeWithSeconds(mouseTime));
// Get the summed value - that's the one of the top most stack.
tooltip.select('.sparkline-tt-sum-value').text(`${shortNumStr(summed[tipIdx])}${unitSuffix}`);
tipVals.nodes().forEach((n, i) => {
let val = stackData[i][tipIdx][1] - stackData[i][tipIdx][0];
select(n).text(this.formatTooltip(keys[i], val));
select(n).text(`${shortNumStr(val)}${unitSuffix}`);
});
cursor.attr('x', mouseX);
},
formatTooltip: function(label, val) {
switch (label) {
case 'Data rate received':
// fallthrough
case 'Data rate transmitted':
return dataRateStr(val);
default:
return shortNumStr(val);
}
},
}
});
// Duplicated in vendor/metrics-providers/prometheus.js since we want that to

View File

@ -17,7 +17,7 @@
position: absolute;
z-index: 100;
bottom: 78px;
width: 250px;
width: 217px;
}
.sparkline-tt-legend-color {
@ -37,3 +37,35 @@
}
}
// Key modal
.sparkline-key {
.sparkline-key-content {
width: 500px;
min-height: 100px;
dl {
padding: 10px 0 0 0;
}
dt {
width: 125px;
float: left;
}
dd {
margin: 0 0 12px 135px;
}
}
}
.sparkline-key-link {
visibility: hidden;
float: right;
// TODO: this is a massive hack but we want it to be actually outside of the
// bounding box of this component. We could move it into the parent component
// but it's pretty tied up to the state - should only show if we have metrics
// loaded etc. I expect there is a cleaner way to refactor this though.
margin-top: -35px;
margin-right: 12px;
}
#metrics-container:hover .sparkline-key-link {
visibility: visible;
}

View File

@ -1,31 +1,41 @@
#metrics-container div .sparkline-wrapper {
#metrics-container .sparkline-wrapper {
svg path {
stroke-width: 0;
}
.tooltip {
padding: 5px 10px 10px 10px;
padding: 0 0 10px;
font-size: 0.875em;
line-height: 1.5em;
font-weight: normal;
border: 1px solid #BAC1CC;
border: 1px solid $gray-300;
background: #fff;
border-radius: 2px;
box-sizing: border-box;
box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.05), 0px 4px 4px rgba(0, 0, 0, 0.1);
.sparkline-time {
padding: 0;
padding: 8px 10px;
font-weight: bold;
font-size: 14px;
color: #000;
margin-bottom: 5px;
border-bottom: 1px solid $gray-200;
margin-bottom: 4px;
text-align: center;
}
.sparkline-tt-legend {
.sparkline-tt-legend,
.sparkline-tt-sum {
border: 0;
padding: 3px 10px 0 10px;
}
.sparkline-tt-sum {
border-top: 1px solid $gray-200;
margin-top: 4px;
padding: 8px 10px 0 10px;
}
.sparkline-tt-legend-color {
width: 12px;
height: 12px;
@ -33,6 +43,11 @@
margin: 0 5px 0 0;
padding: 0;
}
.sparkline-tt-legend-value,
.sparkline-tt-sum-value {
float: right;
}
}
div.tooltip:before{
@ -43,7 +58,7 @@
height: 12px;
left: 15px;
bottom: -7px;
border: 1px solid #BAC1CC;
border: 1px solid $gray-300;
border-top: 0;
border-left: 0;
background: #fff;
@ -51,3 +66,37 @@
}
}
// Key modal
.sparkline-key {
h3::before {
@extend %with-info-circle-fill-mask, %as-pseudo;
margin: 2px 3px 0 0;
font-size: 14px;
}
h3 {
color: $gray-900;
font-size: 16px;
}
.sparkline-key-content {
dt {
font-weight: 600;
}
dd {
color: $gray-500;
}
}
}
.sparkline-key-link {
color: $gray-500;
}
.sparkline-key-link:hover {
color: $blue-500;
}
#metrics-container:hover .sparkline-key-link::before {
@extend %with-info-circle-fill-mask, %as-pseudo;
margin: 1px 3px 0 0;
font-size: 12px;
}

View File

@ -11,6 +11,7 @@ const meta = {
export default RepositoryService.extend({
cfg: service('ui-config'),
error: null,
init: function() {
this._super(...arguments);
@ -21,10 +22,21 @@ export default RepositoryService.extend({
opts.metrics_proxy_enabled = uiCfg.metrics_proxy_enabled;
// Inject the base app URL
const provider = uiCfg.metrics_provider || 'prometheus';
this.provider = window.consul.getMetricsProvider(provider, opts);
try {
this.provider = window.consul.getMetricsProvider(provider, opts);
} catch(e) {
this.error = new Error(`metrics provider not initialized: ${e}`);
// Show the user the error once for debugging their provider outside UI
// Dev.
console.error(this.error);
}
},
findServiceSummary: function(protocol, slug, dc, nspace, configuration = {}) {
if (this.error) {
return Promise.reject(this.error);
}
const promises = [
// TODO: support namespaces in providers
this.provider.serviceRecentSummarySeries(slug, protocol, {}),
@ -33,13 +45,16 @@ export default RepositoryService.extend({
return Promise.all(promises).then(function(results) {
return {
meta: meta,
series: results[0].series,
series: results[0],
stats: results[1].stats,
};
});
},
findUpstreamSummary: function(slug, dc, nspace, configuration = {}) {
if (this.error) {
return Promise.reject(this.error);
}
return this.provider.upstreamRecentSummaryStats(slug, {}).then(function(result) {
result.meta = meta;
return result;
@ -47,9 +62,12 @@ export default RepositoryService.extend({
},
findDownstreamSummary: function(slug, dc, nspace, configuration = {}) {
if (this.error) {
return Promise.reject(this.error);
}
return this.provider.downstreamRecentSummaryStats(slug, {}).then(function(result) {
result.meta = meta;
return result;
});
},
}
});

View File

@ -1,54 +1,81 @@
/*eslint no-console: "off"*/
(function () {
var emptySeries = { unitSuffix: "", labels: {}, data: [] }
var prometheusProvider = {
options: {},
/**
* init is called when the provide is first loaded.
* init is called when the provider is first loaded.
*
* options.providerOptions contains any operator configured parameters
* specified in the Consul agent config that is serving the UI.
*
* options.proxy.baseURL contains the base URL if the agent has a metrics
* proxy configured. If it doesn't options.proxy will be null. The provider
* should throw an Exception (TODO: specific type?) if it requires a metrics
* proxy and one is not configured.
* Consul will provide a boolean options.metrics_proxy_enabled to indicate
* whether the agent has a metrics proxy configured.
*
* The provider should throw an Exception if the options are not valid for
* example because it requires a metrics proxy and one is not configured.
*/
init: function(options) {
this.options = options;
if (!this.options.metrics_proxy_enabled) {
throw new Error("prometheus metrics provider currently requires the ui_config.metrics_proxy to be configured in the Consul agent.");
}
},
/**
* serviceRecentSummarySeries should return time series for a recent time
* period summarizing the usage of the named service.
*
* If these metrics aren't available then empty series may be returned.
* If these metrics aren't available then an empty series array may be
* returned.
*
* The period may (later) be specified in options.startTime and
* options.endTime.
*
* The service's protocol must be given as one of Consul's supported
* protocols e.g. "tcp", "http", "http2", "grpc". If it is empty or the
* provider doesn't recognize it it should treat it as "tcp" and provide
* just basic connection stats.
* provider doesn't recognize the protocol, it should treat it as "tcp" and
* provide basic connection stats.
*
* The expected return value is a promise which resolves to an object that
* should look like the following:
*
* {
* series: [
* // The unitSuffix is shown after the value in tooltips. Values will be
* // rounded and shortened. Larger values will already have a suffix
* // like "10k". The suffix provided here is concatenated directly
* // allowing for suffixes like "mbps/kbps" by using a suffix of "bps".
* // If the unit doesn't make sense in this format, include a
* // leading space for example " rps" would show as "1.2k rps".
* unitSuffix: " rps",
*
* // The set of labels to graph. The key should exactly correspond to a
* // property of every data point in the array below except for the
* // special case "Total" which is used to show the sum of all the
* // stacked graph values. The key is displayed in the tooltip so it
* // should be human-friendly but as concise as possible. The value is a
* // longer description that is displayed in the graph's key on request
* // to explain exactly what the metrics mean.
* labels: {
* "Total": "Total inbound requests per second.",
* "Successes": "Successful responses (with an HTTP response code not in the 5xx range) per second.",
* "Errors": "Error responses (with an HTTP response code in the 5xx range) per second.",
* },
*
* data: [
* {
* label: "Requests per second",
* data: [...]
* time: 1600944516286, // milliseconds since Unix epoch
* "Successes": 1234.5,
* "Errors": 2.3,
* },
* ...
* ]
* }
*
* Each time series' data array is simple an array of tuples with the first
* being a Date object and the second a floating point value:
*
* [[Date(1600944516286), 1234.9], [Date(1600944526286), 1234.9], ...]
* Every data point object should have a value for every series label
* (except for "Total") otherwise it will be assumed to be "0".
*/
serviceRecentSummarySeries: function(serviceName, protocol, options) {
// Fetch time-series
@ -62,36 +89,11 @@
options.end = now;
if (this.hasL7Metrics(protocol)) {
series.push(this.fetchRequestRateSeries(serviceName, options))
labels.push("Requests per second")
series.push(this.fetchErrorRateSeries(serviceName, options))
labels.push("Errors per second")
} else {
// Fallback to just L4 metrics.
series.push(this.fetchServiceRxSeries(serviceName, options))
labels.push("Data rate received")
series.push(this.fetchServiceTxSeries(serviceName, options))
labels.push("Data rate transmitted")
return this.fetchRequestRateSeries(serviceName, options);
}
var all = Promise.allSettled(series).
then(function(results){
var data = { series: [] }
for (var i = 0; i < series.length; i++) {
if (results[i].value) {
data.series.push({
label: labels[i],
data: results[i].value
});
} else if (results[i].reason) {
console.log("ERROR processing series", labels[i], results[i].reason)
}
}
return data
})
// Fetch the metrics async, and return a promise to the result.
return all
// Fallback to just L4 metrics.
return this.fetchDataRateSeries(serviceName, options);
},
/**
@ -174,8 +176,8 @@
},
/**
* downstreamRecentSummaryStats should return four summary statistics for each
* downstream service over a recent time period.
* downstreamRecentSummaryStats should return four summary statistics for
* each downstream service over a recent time period.
*
* If these metrics aren't available then an empty array may be returned.
*
@ -188,9 +190,10 @@
* stats: {
* // Each downstream will appear as an entry keyed by the downstream
* // service name. The value is an array of stats with the same
* // format as serviceRecentSummaryStats response.stats. Note that
* // different downstreams might show different stats depending on
* // their protocol.
* // format as serviceRecentSummaryStats response.stats. Different
* // downstreams may display different stats if required although the
* // protocol should be the same for all as it is the target
* // service's protocol that matters here.
* "downstream_name": [
* {label: "SR", desc: "...", value: "99%"},
* ...
@ -276,59 +279,102 @@
return all
},
reformatSeries: function(response) {
// Handle empty results from prometheus.
if (!response || !response.data || !response.data.result
|| response.data.result.length < 1) {
return [];
}
// Reformat the prometheus data to be the format we want which is
// essentially the same but with Date objects instead of unix timestamps.
return response.data.result[0].values.map(function(val){
return [new Date(val[0]*1000), parseFloat(val[1])]
})
reformatSeries: function(unitSuffix, labelMap) {
return function(response) {
// Handle empty result sets gracefully.
if (!response.data || !response.data.result || response.data.result.length == 0
|| !response.data.result[0].values
|| response.data.result[0].values.length == 0) {
return emptySeries;
}
// Reformat the prometheus data to be the format we want with stacked
// values as object properties.
// Populate time values first based on first result since Prometheus will
// always return all the same points for all series in the query.
let series = response.data.result[0].values.map(function(d, i) {
return {
time: Math.round(d[0] * 1000),
};
});
// Then for each series returned populate the labels and values in the
// points.
response.data.result.map(function(d) {
d.values.map(function(p, i) {
series[i][d.metric.label] = parseFloat(p[1]);
});
});
return {
unitSuffix: unitSuffix,
labels: labelMap,
data: series
};
};
},
fetchRequestRateSeries: function(serviceName, options){
var q = `sum(irate(envoy_listener_http_downstream_rq_xx{local_cluster="${serviceName}",envoy_http_conn_manager_prefix="public_listener_http"}[10m]))`
return this.fetchSeries(q, options).then(this.reformatSeries, function(xhr){
// Failure. log to console and return an blank result for now.
console.log("ERROR: failed to fetch requestRate", xhr.responseText)
return []
// We need the sum of all non-500 error rates as one value and the 500
// error rate as a separate series so that they stack to show the full
// request rate. Some creative label replacement makes this possible in
// one query.
var q = `sum by (label) (`+
// The outer label_replace catches 5xx error and relabels them as
// err=yes
`label_replace(`+
// The inner label_replace relabels all !5xx rates as err=no so they
// will get summed together.
`label_replace(`+
// Get rate of requests to the service
`irate(envoy_listener_http_downstream_rq_xx{local_cluster="${serviceName}",envoy_http_conn_manager_prefix="public_listener_http"}[10m])`+
// ... inner replacement matches all code classes except "5" and
// applies err=no
`, "label", "Successes", "envoy_response_code_class", "[^5]")`+
// ... outer replacement matches code=5 and applies err=yes
`, "label", "Errors", "envoy_response_code_class", "5")`+
`)`
var labelMap = {
Total: 'Total inbound requests per second',
Successes: 'Successful responses (with an HTTP response code not in the 5xx range) per second.',
Errors: 'Error responses (with an HTTP response code in the 5xx range) per second.',
};
return this.fetchSeries(q, options)
.then(this.reformatSeries(" rps", labelMap), function(xhr){
// Failure. log to console and return a blank result for now.
console.log('ERROR: failed to fetch requestRate', xhr.responseText)
return emptySeries;
})
},
fetchErrorRateSeries: function(serviceName, options){
// 100 * to get a result in percent
var q = `sum(`+
`irate(envoy_listener_http_downstream_rq_xx{`+
`local_cluster="${serviceName}",`+
`envoy_http_conn_manager_prefix="public_listener_http",`+
`envoy_response_code_class="5"}[10m]`+
`)`+
`)`;
return this.fetchSeries(q, options).then(this.reformatSeries, function(xhr){
// Failure. log to console and return an blank result for now.
console.log("ERROR: failed to fetch errorRate", xhr.responseText)
return []
})
},
fetchServiceRxSeries: function(serviceName, options){
var q = `8 * sum(irate(envoy_tcp_downstream_cx_rx_bytes_total{local_cluster="${serviceName}", envoy_tcp_prefix="public_listener_tcp"}[10m]))`
return this.fetchSeries(q, options).then(this.reformatSeries, function(xhr){
// Failure. log to console and return an blank result for now.
console.log("ERROR: failed to fetch rx data rate", xhr.responseText)
return []
})
},
fetchServiceTxSeries: function(serviceName, options){
var q = `8 * sum(irate(envoy_tcp_downstream_cx_tx_bytes_total{local_cluster="${serviceName}", envoy_tcp_prefix="public_listener_tcp"}[10m]))`
return this.fetchSeries(q, options).then(this.reformatSeries, function(xhr){
// Failure. log to console and return an blank result for now.
console.log("ERROR: failed to fetch tx data rate", xhr.responseText)
return []
fetchDataRateSeries: function(serviceName, options){
// 8 * converts from bytes/second to bits/second
var q = `8 * sum by (label) (`+
// Label replace generates a unique label per rx/tx metric to stop them
// being summed together.
`label_replace(`+
// Get the tx rate
`irate(envoy_tcp_downstream_cx_tx_bytes_total{local_cluster="${serviceName}",envoy_tcp_prefix="public_listener_tcp"}[10m])`+
// Match all and apply the tx label
`, "label", "Outbound", "__name__", ".*"`+
// Union those vectors with the RX ones
`) or label_replace(`+
// Get the rx rate
`irate(envoy_tcp_downstream_cx_rx_bytes_total{local_cluster="${serviceName}",envoy_tcp_prefix="public_listener_tcp"}[10m])`+
// Match all and apply the rx label
`, "label", "Inbound", "__name__", ".*"`+
`)`+
`)`
var labelMap = {
Total: 'Total bandwidth',
Inbound: 'Inbound data rate (data recieved) from the network in bits per second.',
Outbound: 'Outbound data rate (data transmitted) from the network in bits per second.',
};
return this.fetchSeries(q, options)
.then(this.reformatSeries("bps", labelMap), function(xhr){
// Failure. log to console and return a blank result for now.
console.log('ERROR: failed to fetch requestRate', xhr.responseText)
return emptySeries;
})
},