chore: instrument github oauth2 limits (#11532)

* chore: instrument github oauth2 limits

Rate limit information for github oauth2 providers instrumented in prometheus
This commit is contained in:
Steven Masley 2024-01-10 09:29:33 -06:00 committed by GitHub
parent 50b78e3325
commit 3f9da674c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 421 additions and 10 deletions

View File

@ -1802,7 +1802,7 @@ func configureGithubOAuth2(instrument *promoauth.Factory, accessURL *url.URL, cl
}
return &coderd.GithubOAuth2Config{
OAuth2Config: instrument.New("github-login", &oauth2.Config{
OAuth2Config: instrument.NewGithub("github-login", &oauth2.Config{
ClientID: clientID,
ClientSecret: clientSecret,
Endpoint: endpoint,

View File

@ -85,6 +85,8 @@ type FakeIDP struct {
// to test something like PKI auth vs a client_secret.
hookAuthenticateClient func(t testing.TB, req *http.Request) (url.Values, error)
serve bool
// optional middlewares
middlewares chi.Middlewares
}
func StatusError(code int, err error) error {
@ -115,6 +117,12 @@ func WithAuthorizedRedirectURL(hook func(redirectURL string) error) func(*FakeID
}
}
func WithMiddlewares(mws ...func(http.Handler) http.Handler) func(*FakeIDP) {
return func(f *FakeIDP) {
f.middlewares = append(f.middlewares, mws...)
}
}
// WithRefresh is called when a refresh token is used. The email is
// the email of the user that is being refreshed assuming the claims are correct.
func WithRefresh(hook func(email string) error) func(*FakeIDP) {
@ -570,6 +578,7 @@ func (f *FakeIDP) httpHandler(t testing.TB) http.Handler {
t.Helper()
mux := chi.NewMux()
mux.Use(f.middlewares...)
// This endpoint is required to initialize the OIDC provider.
// It is used to get the OIDC configuration.
mux.Get("/.well-known/openid-configuration", func(rw http.ResponseWriter, r *http.Request) {

View File

@ -464,8 +464,13 @@ func ConvertConfig(instrument *promoauth.Factory, entries []codersdk.ExternalAut
oauthConfig = &exchangeWithClientSecret{oc}
}
instrumented := instrument.New(entry.ID, oauthConfig)
if strings.EqualFold(entry.Type, string(codersdk.EnhancedExternalAuthProviderGitHub)) {
instrumented = instrument.NewGithub(entry.ID, oauthConfig)
}
cfg := &Config{
InstrumentedOAuth2Config: instrument.New(entry.ID, oauthConfig),
InstrumentedOAuth2Config: instrumented,
ID: entry.ID,
Regex: regex,
Type: entry.Type,

100
coderd/promoauth/github.go Normal file
View File

@ -0,0 +1,100 @@
package promoauth
import (
"fmt"
"net/http"
"strconv"
"time"
)
type rateLimits struct {
Limit int
Remaining int
Used int
Reset time.Time
Resource string
}
// githubRateLimits checks the returned response headers and
func githubRateLimits(resp *http.Response, err error) (rateLimits, bool) {
if err != nil || resp == nil {
return rateLimits{}, false
}
p := headerParser{header: resp.Header}
// See
// https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#checking-the-status-of-your-rate-limit
limits := rateLimits{
Limit: p.int("x-ratelimit-limit"),
Remaining: p.int("x-ratelimit-remaining"),
Used: p.int("x-ratelimit-used"),
Resource: p.string("x-ratelimit-resource"),
}
if limits.Limit == 0 &&
limits.Remaining == 0 &&
limits.Used == 0 {
// For some requests, github has no rate limit. In which case,
// it returns all 0s. We can just omit these.
return limits, false
}
// Reset is when the rate limit "used" will be reset to 0.
// If it's unix 0, then we do not know when it will reset.
// Change it to a zero time as that is easier to handle in golang.
unix := p.int("x-ratelimit-reset")
resetAt := time.Unix(int64(unix), 0)
if unix == 0 {
resetAt = time.Time{}
}
limits.Reset = resetAt
// Unauthorized requests have their own rate limit, so we should
// track them separately.
if resp.StatusCode == http.StatusUnauthorized {
limits.Resource += "-unauthorized"
}
// A 401 or 429 means too many requests. This might mess up the
// "resource" string because we could hit the unauthorized limit,
// and we do not want that to override the authorized one.
// However, in testing, it seems a 401 is always a 401, even if
// the limit is hit.
if len(p.errors) > 0 {
// If we are missing any headers, then do not try and guess
// what the rate limits are.
return limits, false
}
return limits, true
}
type headerParser struct {
errors map[string]error
header http.Header
}
func (p *headerParser) string(key string) string {
if p.errors == nil {
p.errors = make(map[string]error)
}
v := p.header.Get(key)
if v == "" {
p.errors[key] = fmt.Errorf("missing header %q", key)
}
return v
}
func (p *headerParser) int(key string) int {
v := p.string(key)
if v == "" {
return -1
}
i, err := strconv.Atoi(v)
if err != nil {
p.errors[key] = err
}
return i
}

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
@ -46,11 +47,25 @@ var _ OAuth2Config = (*Config)(nil)
// Primarily to avoid any prometheus errors registering duplicate metrics.
type Factory struct {
metrics *metrics
// optional replace now func
Now func() time.Time
}
// metrics is the reusable metrics for all oauth2 providers.
type metrics struct {
externalRequestCount *prometheus.CounterVec
// if the oauth supports it, rate limit metrics.
// rateLimit is the defined limit per interval
rateLimit *prometheus.GaugeVec
rateLimitRemaining *prometheus.GaugeVec
rateLimitUsed *prometheus.GaugeVec
// rateLimitReset is unix time of the next interval (when the rate limit resets).
rateLimitReset *prometheus.GaugeVec
// rateLimitResetIn is the time in seconds until the rate limit resets.
// This is included because it is sometimes more helpful to know the limit
// will reset in 600seconds, rather than at 1704000000 unix time.
rateLimitResetIn *prometheus.GaugeVec
}
func NewFactory(registry prometheus.Registerer) *Factory {
@ -68,6 +83,53 @@ func NewFactory(registry prometheus.Registerer) *Factory {
"source",
"status_code",
}),
rateLimit: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "oauth2",
Name: "external_requests_rate_limit_total",
Help: "The total number of allowed requests per interval.",
}, []string{
"name",
// Resource allows different rate limits for the same oauth2 provider.
// Some IDPs have different buckets for different rate limits.
"resource",
}),
rateLimitRemaining: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "oauth2",
Name: "external_requests_rate_limit_remaining",
Help: "The remaining number of allowed requests in this interval.",
}, []string{
"name",
"resource",
}),
rateLimitUsed: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "oauth2",
Name: "external_requests_rate_limit_used",
Help: "The number of requests made in this interval.",
}, []string{
"name",
"resource",
}),
rateLimitReset: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "oauth2",
Name: "external_requests_rate_limit_next_reset_unix",
Help: "Unix timestamp for when the next interval starts",
}, []string{
"name",
"resource",
}),
rateLimitResetIn: factory.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "oauth2",
Name: "external_requests_rate_limit_reset_in_seconds",
Help: "Seconds until the next interval",
}, []string{
"name",
"resource",
}),
},
}
}
@ -80,6 +142,44 @@ func (f *Factory) New(name string, under OAuth2Config) *Config {
}
}
// NewGithub returns a new instrumented oauth2 config for github. It tracks
// rate limits as well as just the external request counts.
//
//nolint:bodyclose
func (f *Factory) NewGithub(name string, under OAuth2Config) *Config {
cfg := f.New(name, under)
cfg.interceptors = append(cfg.interceptors, func(resp *http.Response, err error) {
limits, ok := githubRateLimits(resp, err)
if !ok {
return
}
labels := prometheus.Labels{
"name": cfg.name,
"resource": limits.Resource,
}
// Default to -1 for "do not know"
resetIn := float64(-1)
if !limits.Reset.IsZero() {
now := time.Now()
if f.Now != nil {
now = f.Now()
}
resetIn = limits.Reset.Sub(now).Seconds()
if resetIn < 0 {
// If it just reset, just make it 0.
resetIn = 0
}
}
f.metrics.rateLimit.With(labels).Set(float64(limits.Limit))
f.metrics.rateLimitRemaining.With(labels).Set(float64(limits.Remaining))
f.metrics.rateLimitUsed.With(labels).Set(float64(limits.Used))
f.metrics.rateLimitReset.With(labels).Set(float64(limits.Reset.Unix()))
f.metrics.rateLimitResetIn.With(labels).Set(resetIn)
})
return cfg
}
type Config struct {
// Name is a human friendly name to identify the oauth2 provider. This should be
// deterministic from restart to restart, as it is going to be used as a label in
@ -87,6 +187,8 @@ type Config struct {
name string
underlying OAuth2Config
metrics *metrics
// interceptors are called after every request made by the oauth2 client.
interceptors []func(resp *http.Response, err error)
}
func (c *Config) Do(ctx context.Context, source Oauth2Source, req *http.Request) (*http.Response, error) {
@ -169,5 +271,10 @@ func (i *instrumentedTripper) RoundTrip(r *http.Request) (*http.Response, error)
"source": string(i.source),
"status_code": fmt.Sprintf("%d", statusCode),
}).Inc()
// Handle any extra interceptors.
for _, interceptor := range i.c.interceptors {
interceptor(resp, err)
}
return resp, err
}

View File

@ -1,14 +1,24 @@
package promoauth_test
import (
"context"
"fmt"
"io"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
ptestutil "github.com/prometheus/client_golang/prometheus/testutil"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/exp/maps"
"golang.org/x/oauth2"
"github.com/coder/coder/v2/coderd/coderdtest/oidctest"
"github.com/coder/coder/v2/coderd/externalauth"
@ -22,12 +32,25 @@ func TestInstrument(t *testing.T) {
ctx := testutil.Context(t, testutil.WaitShort)
idp := oidctest.NewFakeIDP(t, oidctest.WithServing())
reg := prometheus.NewRegistry()
count := func() int {
return ptestutil.CollectAndCount(reg, "coderd_oauth2_external_requests_total")
t.Cleanup(func() {
if t.Failed() {
t.Log(registryDump(reg))
}
})
const id = "test"
labels := prometheus.Labels{
"name": id,
"status_code": "200",
}
const metricname = "coderd_oauth2_external_requests_total"
count := func(source string) int {
labels["source"] = source
return counterValue(t, reg, "coderd_oauth2_external_requests_total", labels)
}
factory := promoauth.NewFactory(reg)
const id = "test"
cfg := externalauth.Config{
InstrumentedOAuth2Config: factory.New(id, idp.OIDCConfig(t, []string{})),
ID: "test",
@ -35,13 +58,13 @@ func TestInstrument(t *testing.T) {
}
// 0 Requests before we start
require.Equal(t, count(), 0)
require.Nil(t, metricValue(t, reg, metricname, labels), "no metrics at start")
// Exchange should trigger a request
code := idp.CreateAuthCode(t, "foo")
token, err := cfg.Exchange(ctx, code)
require.NoError(t, err)
require.Equal(t, count(), 1)
require.Equal(t, count("Exchange"), 1)
// Force a refresh
token.Expiry = time.Now().Add(time.Hour * -1)
@ -49,17 +72,18 @@ func TestInstrument(t *testing.T) {
refreshed, err := src.Token()
require.NoError(t, err)
require.NotEqual(t, token.AccessToken, refreshed.AccessToken, "token refreshed")
require.Equal(t, count(), 2)
require.Equal(t, count("TokenSource"), 1)
// Try a validate
valid, _, err := cfg.ValidateToken(ctx, refreshed.AccessToken)
require.NoError(t, err)
require.True(t, valid)
require.Equal(t, count(), 3)
require.Equal(t, count("ValidateToken"), 1)
// Verify the default client was not broken. This check is added because we
// extend the http.DefaultTransport. If a `.Clone()` is not done, this can be
// mis-used. It is cheap to run this quick check.
snapshot := registryDump(reg)
req, err := http.NewRequestWithContext(ctx, http.MethodGet,
must[*url.URL](t)(idp.IssuerURL().Parse("/.well-known/openid-configuration")).String(), nil)
require.NoError(t, err)
@ -68,7 +92,137 @@ func TestInstrument(t *testing.T) {
require.NoError(t, err)
_ = resp.Body.Close()
require.Equal(t, count(), 3)
require.NoError(t, compare(reg, snapshot), "no metric changes")
}
func TestGithubRateLimits(t *testing.T) {
t.Parallel()
now := time.Now()
cases := []struct {
Name string
NoHeaders bool
Omit []string
ExpectNoMetrics bool
Limit int
Remaining int
Used int
Reset time.Time
at time.Time
}{
{
Name: "NoHeaders",
NoHeaders: true,
ExpectNoMetrics: true,
},
{
Name: "ZeroHeaders",
ExpectNoMetrics: true,
},
{
Name: "OverLimit",
Limit: 100,
Remaining: 0,
Used: 500,
Reset: now.Add(time.Hour),
at: now,
},
{
Name: "UnderLimit",
Limit: 100,
Remaining: 0,
Used: 500,
Reset: now.Add(time.Hour),
at: now,
},
{
Name: "Partial",
Omit: []string{"x-ratelimit-remaining"},
ExpectNoMetrics: true,
Limit: 100,
Remaining: 0,
Used: 500,
Reset: now.Add(time.Hour),
at: now,
},
}
for _, c := range cases {
c := c
t.Run(c.Name, func(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
idp := oidctest.NewFakeIDP(t, oidctest.WithMiddlewares(
func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
if !c.NoHeaders {
rw.Header().Set("x-ratelimit-limit", fmt.Sprintf("%d", c.Limit))
rw.Header().Set("x-ratelimit-remaining", fmt.Sprintf("%d", c.Remaining))
rw.Header().Set("x-ratelimit-used", fmt.Sprintf("%d", c.Used))
rw.Header().Set("x-ratelimit-resource", "core")
rw.Header().Set("x-ratelimit-reset", fmt.Sprintf("%d", c.Reset.Unix()))
for _, omit := range c.Omit {
rw.Header().Del(omit)
}
}
next.ServeHTTP(rw, r)
})
}))
factory := promoauth.NewFactory(reg)
if !c.at.IsZero() {
factory.Now = func() time.Time {
return c.at
}
}
cfg := factory.NewGithub("test", idp.OIDCConfig(t, []string{}))
// Do a single oauth2 call
ctx := testutil.Context(t, testutil.WaitShort)
ctx = context.WithValue(ctx, oauth2.HTTPClient, idp.HTTPClient(nil))
_, err := cfg.Exchange(ctx, idp.CreateAuthCode(t, "foo"))
require.NoError(t, err)
// Verify
labels := prometheus.Labels{
"name": "test",
"resource": "core",
}
pass := true
if !c.ExpectNoMetrics {
pass = pass && assert.Equal(t, gaugeValue(t, reg, "coderd_oauth2_external_requests_rate_limit_total", labels), c.Limit, "limit")
pass = pass && assert.Equal(t, gaugeValue(t, reg, "coderd_oauth2_external_requests_rate_limit_remaining", labels), c.Remaining, "remaining")
pass = pass && assert.Equal(t, gaugeValue(t, reg, "coderd_oauth2_external_requests_rate_limit_used", labels), c.Used, "used")
if !c.at.IsZero() {
until := c.Reset.Sub(c.at)
// Float accuracy is not great, so we allow a delta of 2
pass = pass && assert.InDelta(t, gaugeValue(t, reg, "coderd_oauth2_external_requests_rate_limit_reset_in_seconds", labels), int(until.Seconds()), 2, "reset in")
}
} else {
pass = pass && assert.Nil(t, metricValue(t, reg, "coderd_oauth2_external_requests_rate_limit_total", labels), "not exists")
}
// Helpful debugging
if !pass {
t.Log(registryDump(reg))
}
})
}
}
func registryDump(reg *prometheus.Registry) string {
h := promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
rec := httptest.NewRecorder()
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil)
h.ServeHTTP(rec, req)
resp := rec.Result()
data, _ := io.ReadAll(resp.Body)
_ = resp.Body.Close()
return string(data)
}
func must[V any](t *testing.T) func(v V, err error) V {
@ -78,3 +232,39 @@ func must[V any](t *testing.T) func(v V, err error) V {
return v
}
}
func gaugeValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
labeled := metricValue(t, reg, metricName, labels)
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
return int(labeled.GetGauge().GetValue())
}
func counterValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
labeled := metricValue(t, reg, metricName, labels)
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
return int(labeled.GetCounter().GetValue())
}
func compare(reg prometheus.Gatherer, compare string) error {
return ptestutil.GatherAndCompare(reg, strings.NewReader(compare))
}
func metricValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) *io_prometheus_client.Metric {
metrics, err := reg.Gather()
require.NoError(t, err)
for _, m := range metrics {
if m.GetName() == metricName {
for _, labeled := range m.GetMetric() {
mLables := make(prometheus.Labels)
for _, v := range labeled.GetLabel() {
mLables[v.GetName()] = v.GetValue()
}
if maps.Equal(mLables, labels) {
return labeled
}
}
}
}
return nil
}