Prometheus Metrics Collection Trigger Mechanism
Source code: https://github.com/prometheus/prometheus
Program Startup
Let's start the analysis from the program startup code. Part of main looks like this:
reloaders := []reloader{
	{
		// The Scrape and notifier managers need to reload before the Discovery manager as
		// they need to read the most updated config when receiving the new targets list.
		name:     "scrape",
		reloader: scrapeManager.ApplyConfig,
	}, {
		name: "scrape_sd",
		reloader: func(cfg *config.Config) error {
			c := make(map[string]discovery.Configs)
			for _, v := range cfg.ScrapeConfigs {
				c[v.JobName] = v.ServiceDiscoveryConfigs
			}
			return discoveryManagerScrape.ApplyConfig(c)
		},
	},
}
For the full code, please refer to the official source. From it we can see that Prometheus uses the oklog/run framework to organize its goroutines, and discoveryManagerScrape.ApplyConfig(c) is executed inside one of those goroutines.
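As a side note, the oklog/run pattern looks roughly like the following. This is a minimal sketch of the library's actor model, not the actual wiring in cmd/prometheus/main.go:

package main

import (
	"context"
	"time"

	"github.com/oklog/run"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	var g run.Group
	// Each actor is a pair of functions: execute (blocks doing work)
	// and interrupt (called when any other actor exits first).
	g.Add(
		func() error {
			// e.g. a manager's Run loop; blocks until canceled.
			<-ctx.Done()
			return ctx.Err()
		},
		func(error) {
			cancel()
		},
	)
	// A second actor that exits after a second, shutting the group down.
	g.Add(
		func() error {
			time.Sleep(time.Second)
			return nil
		},
		func(error) {},
	)
	// Run starts all actors, blocks until the first one returns,
	// then interrupts the rest and waits for them.
	_ = g.Run()
}

With that in mind, let's look at ApplyConfig itself: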
func (m *Manager) ApplyConfig(cfg map[string]Configs) error {
	// (simplified) swap in each provider's new subscriptions and
	// start any provider that is not running yet
	for _, prov := range m.providers {
		prov.mu.Lock()
		prov.subs = prov.newSubs
		prov.newSubs = map[string]struct{}{}
		prov.mu.Unlock()
		if !prov.IsStarted() {
			m.startProvider(m.ctx, prov)
		}
	}
	return nil
}
func (m *Manager) startProvider(ctx context.Context, p *Provider) {
	level.Debug(m.logger).Log("msg", "Starting provider", "provider", p.name, "subs", fmt.Sprintf("%v", p.subs))
	ctx, cancel := context.WithCancel(ctx)
	updates := make(chan []*targetgroup.Group)

	p.cancel = cancel

	go p.d.Run(ctx, updates)
	go m.updater(ctx, p, updates)
}
From the partial code above:
Each provider is started in its own goroutine. A provider here is one of the platforms Prometheus integrates with (Kubernetes, Consul, and so on), and its job is to discover scrape targets there.
A second goroutine (updater) is then started to consume the target-group updates the provider pushes onto a channel, and to signal downstream code (which selects on a trigger channel) to process them; in miniature, the pattern looks like the sketch below.
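Stripped to its essentials, the producer/consumer pair looks like this. The types and function names here are simplified stand-ins for illustration, not Prometheus code:

package main

import (
	"context"
	"fmt"
	"time"
)

type group struct{ source string } // stand-in for targetgroup.Group

// runDiscoverer plays the role of p.d.Run: it discovers targets and
// pushes them onto the updates channel until the context is canceled.
func runDiscoverer(ctx context.Context, updates chan<- []*group) {
	for {
		select {
		case <-ctx.Done():
			return
		case updates <- []*group{{source: "example"}}:
			time.Sleep(time.Second) // pretend discovery work
		}
	}
}

// runUpdater plays the role of m.updater: it drains the channel and
// caches whatever arrives.
func runUpdater(ctx context.Context, updates <-chan []*group) {
	for {
		select {
		case <-ctx.Done():
			return
		case tgs := <-updates:
			fmt.Printf("cached %d group(s)\n", len(tgs))
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	updates := make(chan []*group)
	go runDiscoverer(ctx, updates)
	runUpdater(ctx, updates)
}

Because the channel is unbuffered, the discoverer blocks until the updater is ready, which naturally paces discovery to consumption.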
Fetching the Data
Taking Kubernetes as an example, look at its Run method:
// Run implements the discoverer interface.
func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
	d.Lock()
	namespaces := d.getNamespaces()
	switch d.role {
	case RoleEndpoint:
		for _, namespace := range namespaces {
			e := d.client.CoreV1().Endpoints(namespace)
			elw := &cache.ListWatch{
				ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
					options.FieldSelector = d.selectors.endpoints.field
					options.LabelSelector = d.selectors.endpoints.label
					return e.List(ctx, options)
				},
				WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
					options.FieldSelector = d.selectors.endpoints.field
					options.LabelSelector = d.selectors.endpoints.label
					return e.Watch(ctx, options)
				},
			}
			s := d.client.CoreV1().Services(namespace)
			slw := &cache.ListWatch{
				ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
					options.FieldSelector = d.selectors.service.field
					options.LabelSelector = d.selectors.service.label
					return s.List(ctx, options)
				},
				WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
					options.FieldSelector = d.selectors.service.field
					options.LabelSelector = d.selectors.service.label
					return s.Watch(ctx, options)
				},
			}
			p := d.client.CoreV1().Pods(namespace)
			plw := &cache.ListWatch{
				ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
					options.FieldSelector = d.selectors.pod.field
					options.LabelSelector = d.selectors.pod.label
					return p.List(ctx, options)
				},
				WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
					options.FieldSelector = d.selectors.pod.field
					options.LabelSelector = d.selectors.pod.label
					return p.Watch(ctx, options)
				},
			}
			var nodeInf cache.SharedInformer
			if d.attachMetadata.Node {
				nodeInf = d.newNodeInformer(ctx)
				go nodeInf.Run(ctx.Done())
			}
			eps := NewEndpoints(
				log.With(d.logger, "role", "endpoint"),
				d.newEndpointsByNodeInformer(elw),
				cache.NewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
				cache.NewSharedInformer(plw, &apiv1.Pod{}, resyncDisabled),
				nodeInf,
			)
			d.discoverers = append(d.discoverers, eps)
			go eps.endpointsInf.Run(ctx.Done())
			go eps.serviceInf.Run(ctx.Done())
			go eps.podInf.Run(ctx.Done())
		}
	default:
		level.Error(d.logger).Log("msg", "unknown Kubernetes discovery kind", "role", d.role)
	}

	var wg sync.WaitGroup
	for _, dd := range d.discoverers {
		wg.Add(1)
		go func(d discovery.Discoverer) {
			defer wg.Done()
			d.Run(ctx, ch)
		}(dd)
	}
	d.Unlock()

	wg.Wait()
	<-ctx.Done()
}
The logic of this method is roughly as follows:
For each Kubernetes resource type (endpoints, service, pod, ...), set up the client-go list/watch plumbing and register add/update/delete callbacks; when a resource changes, its key is pushed onto a work queue (see the sketch after this list).
For each resource type, start a goroutine running the client-go informer so the local cache stays in sync.
Invoke each resource discoverer's Run method.
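In client-go terms, the "callbacks that enqueue keys" step looks roughly like the sketch below. The function name watchEndpoints is made up for illustration; Prometheus wires the equivalent up inside NewEndpoints and friends:

import (
	"context"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
)

// watchEndpoints (hypothetical) builds an informer whose event handlers
// push object keys into a work queue, which a worker later drains.
func watchEndpoints(ctx context.Context, lw cache.ListerWatcher) *workqueue.Type {
	informer := cache.NewSharedInformer(lw, &apiv1.Endpoints{}, 0)
	queue := workqueue.New()
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
				queue.Add(key) // key has the form "namespace/name"
			}
		},
		UpdateFunc: func(_, obj interface{}) {
			if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
				queue.Add(key)
			}
		},
		DeleteFunc: func(obj interface{}) {
			if key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj); err == nil {
				queue.Add(key)
			}
		},
	})
	go informer.Run(ctx.Done())
	return queue
}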
Taking the endpoints resource as an example, look at its Run method:
// Run implements the Discoverer interface.
func (e *Endpoints) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
	defer e.queue.ShutDown()

	cacheSyncs := []cache.InformerSynced{e.endpointsInf.HasSynced, e.serviceInf.HasSynced, e.podInf.HasSynced}
	if e.withNodeMetadata {
		cacheSyncs = append(cacheSyncs, e.nodeInf.HasSynced)
	}
	if !cache.WaitForCacheSync(ctx.Done(), cacheSyncs...) {
		if !errors.Is(ctx.Err(), context.Canceled) {
			level.Error(e.logger).Log("msg", "endpoints informer unable to sync cache")
		}
		return
	}

	go func() {
		for e.process(ctx, ch) {
		}
	}()

	// Block until the target provider is explicitly canceled.
	<-ctx.Done()
}
The logic above is roughly:
Wait until the endpoints (and related) informer caches have finished syncing.
Start a goroutine that repeatedly calls process to handle items from the queue.
Now let's look at the process code:
func (e *Endpoints) process(ctx context.Context, ch chan<- []*targetgroup.Group) bool {
	keyObj, quit := e.queue.Get()
	if quit {
		return false
	}
	defer e.queue.Done(keyObj)
	key := keyObj.(string)

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		level.Error(e.logger).Log("msg", "splitting key failed", "key", key)
		return true
	}

	o, exists, err := e.endpointsStore.GetByKey(key)
	if err != nil {
		level.Error(e.logger).Log("msg", "getting object from store failed", "key", key)
		return true
	}
	if !exists {
		send(ctx, ch, &targetgroup.Group{Source: endpointsSourceFromNamespaceAndName(namespace, name)})
		return true
	}

	eps, err := convertToEndpoints(o)
	if err != nil {
		level.Error(e.logger).Log("msg", "converting to Endpoints object failed", "err", err)
		return true
	}
	send(ctx, ch, e.buildEndpoints(eps))
	return true
}

func send(ctx context.Context, ch chan<- []*targetgroup.Group, tg *targetgroup.Group) {
	if tg == nil {
		return
	}
	select {
	case <-ctx.Done():
	case ch <- []*targetgroup.Group{tg}:
	}
}
The logic above is roughly:
Take the next endpoints key off the queue.
Split the key into namespace and name (see the example after this list).
Look the object up in the informer store; if it no longer exists, send an empty target group for that source so its targets get dropped downstream.
Otherwise convert the object to an Endpoints value, build the target group from it, and send it on the channel.
send itself selects against ctx.Done(), so shutdown never blocks on a full channel.
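The key format is the standard client-go "namespace/name" string, so the split step is a one-liner:

// cache.SplitMetaNamespaceKey splits an informer queue key of the form
// "namespace/name" (cluster-scoped objects have no namespace part).
namespace, name, err := cache.SplitMetaNamespaceKey("kube-system/kube-dns")
// namespace = "kube-system", name = "kube-dns", err = nil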
Back to where the provider is started:
func (m *Manager) startProvider(ctx context.Context, p *Provider) {
	level.Debug(m.logger).Log("msg", "Starting provider", "provider", p.name, "subs", fmt.Sprintf("%v", p.subs))
	ctx, cancel := context.WithCancel(ctx)
	updates := make(chan []*targetgroup.Group)

	p.cancel = cancel

	go p.d.Run(ctx, updates)
	go m.updater(ctx, p, updates)
}
The data-acquisition side, p.d.Run, has been covered above. Now let's turn to the updater method:
func (m *Manager) updater(ctx context.Context, p *Provider, updates chan []*targetgroup.Group) {
	// Ensure targets from this provider are cleaned up.
	defer m.cleaner(p)
	for {
		select {
		case <-ctx.Done():
			return
		case tgs, ok := <-updates:
			receivedUpdates.WithLabelValues(m.name).Inc()
			if !ok {
				level.Debug(m.logger).Log("msg", "Discoverer channel closed", "provider", p.name)
				// Wait for provider cancellation to ensure targets are cleaned up when expected.
				<-ctx.Done()
				return
			}

			p.mu.RLock()
			for s := range p.subs {
				m.updateGroup(poolKey{setName: s, provider: p.name}, tgs)
			}
			p.mu.RUnlock()

			select {
			case m.triggerSend <- struct{}{}:
			default:
			}
		}
	}
}
The logic above is roughly:
Loop reading the updates channel, which is fed by the discoverer goroutine described earlier.
Cache the received target groups via updateGroup.
Do a non-blocking send on triggerSend to announce that new data has arrived.
Note that the code signals triggerSend without ever checking whether tgs actually changed in any meaningful way; as we will see at the end, this can become a performance problem.
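A hypothetical mitigation, sketched below purely for illustration (this is not what Prometheus does, and lastFP is a made-up field), would be to fingerprint the incoming groups and only signal when the fingerprint changes:

import (
	"encoding/json"
	"hash/fnv"

	"github.com/prometheus/prometheus/discovery/targetgroup"
)

// groupsFingerprint (hypothetical) hashes a batch of target groups.
// encoding/json sorts map keys, so the encoding is deterministic.
func groupsFingerprint(tgs []*targetgroup.Group) uint64 {
	h := fnv.New64a()
	for _, tg := range tgs {
		b, _ := json.Marshal(tg)
		h.Write(b)
	}
	return h.Sum64()
}

// updater would then keep the last fingerprint per provider and only
// signal on a real change, e.g.:
//
//	if fp := groupsFingerprint(tgs); fp != p.lastFP { // lastFP: made-up field
//		p.lastFP = fp
//		select {
//		case m.triggerSend <- struct{}{}:
//		default:
//		}
//	}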
Sending the Data
// Run starts the background processing.
func (m *Manager) Run() error {
	go m.sender()
	for range m.ctx.Done() {
		m.cancelDiscoverers()
		return m.ctx.Err()
	}
	return nil
}

func (m *Manager) sender() {
	ticker := time.NewTicker(m.updatert)
	defer ticker.Stop()

	for {
		select {
		case <-m.ctx.Done():
			return
		case <-ticker.C: // Some discoverers send updates too often, so we throttle these with the ticker.
			select {
			case <-m.triggerSend:
				sentUpdates.WithLabelValues(m.name).Inc()
				select {
				case m.syncCh <- m.allGroups():
				default:
					delayedUpdates.WithLabelValues(m.name).Inc()
					level.Debug(m.logger).Log("msg", "Discovery receiver's channel was full so will retry the next cycle")
					select {
					case m.triggerSend <- struct{}{}:
					default:
					}
				}
			default:
			}
		}
	}
}
After the target groups have been updated, a signal goes to the triggerSend channel, which in turn makes sender push to the syncCh channel. sender is a background goroutine whose logic is roughly:
On every ticker tick (this throttles discoverers that send updates too often), check triggerSend for a pending signal.
If there is one, call m.allGroups to reshape the cached targets map into the layout the scrape manager expects and send the result to syncCh; if syncCh is full, re-arm triggerSend and retry on the next tick.
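The reshaping that m.allGroups performs is roughly the following. This is a simplified rendering of the method in discovery/manager.go; the real version also updates a few metrics:

// Flatten the two-level cache (poolKey -> source -> group) into the
// job-name -> groups map that the scrape manager consumes.
func (m *Manager) allGroups() map[string][]*targetgroup.Group {
	m.mtx.RLock()
	defer m.mtx.RUnlock()

	tSets := map[string][]*targetgroup.Group{}
	for pkey, tsets := range m.targets {
		for _, tg := range tsets {
			// pkey.setName is the scrape job name the groups belong to.
			tSets[pkey.setName] = append(tSets[pkey.setName], tg)
		}
	}
	return tSets
}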
Receiving the Data
So who reads from the syncCh channel? For that we have to go back to the program startup code at the top of the article, where the following handler is defined:
{
	// Scrape manager.
	g.Add(
		func() error {
			// When the scrape manager receives a new targets list
			// it needs to read a valid config for each job.
			// It depends on the config being in sync with the discovery manager so
			// we wait until the config is fully loaded.
			<-reloadReady.C

			err := scrapeManager.Run(discoveryManagerScrape.SyncCh())
			level.Info(logger).Log("msg", "Scrape manager stopped")
			return err
		},
		func(err error) {
			// Scrape manager needs to be stopped before closing the local TSDB
			// so that it doesn't try to write samples to a closed storage.
			// We should also wait for rule manager to be fully stopped to ensure
			// we don't trigger any false positive alerts for rules using absent().
			level.Info(logger).Log("msg", "Stopping scrape manager...")
			scrapeManager.Stop()
		},
	)
}
As you can see, scrapeManager.Run(discoveryManagerScrape.SyncCh()) is where syncCh gets consumed. Look at that Run method:
// Run receives and saves target set updates and triggers the scraping loops reloading.
// Reloading happens in the background so that it doesn't block receiving targets updates.
func (m *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) error {
	go m.reloader()
	for {
		select {
		case ts := <-tsets:
			m.updateTsets(ts)

			select {
			case m.triggerReload <- struct{}{}:
			default:
			}
		case <-m.graceShut:
			return nil
		}
	}
}
The logic above is roughly:
Start a background goroutine (reloader) to process the data.
Whenever a target-set update arrives, save it with updateTsets, then do a non-blocking send on triggerReload to request a reload.
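Both triggerSend and triggerReload rely on the same coalescing idiom: a buffered channel (capacity 1 in Prometheus's case) plus a non-blocking send, so any burst of updates collapses into at most one pending signal. In isolation it looks like this sketch:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Capacity-1 channel: at most one pending "state changed" signal.
	trigger := make(chan struct{}, 1)

	// Producer side: never blocks. If a signal is already pending, the
	// new one is dropped; both mean "go re-read the shared state".
	notify := func() {
		select {
		case trigger <- struct{}{}:
		default:
		}
	}

	notify()
	notify() // coalesced into the signal already pending

	// Consumer side: one wake-up per batch of notifications.
	go func() {
		for range trigger {
			fmt.Println("reloading from shared state")
		}
	}()

	time.Sleep(100 * time.Millisecond) // let the consumer drain once
}

Dropping the extra sends is safe because the signal carries no payload; the consumer always re-reads the latest shared state when it wakes up.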
Processing the Data
Who receives from triggerReload? The background goroutine started above. Look at the reloader method:
func (m *Manager) reloader() {
	reloadIntervalDuration := m.opts.DiscoveryReloadInterval
	if reloadIntervalDuration < model.Duration(5*time.Second) {
		reloadIntervalDuration = model.Duration(5 * time.Second)
	}
	ticker := time.NewTicker(time.Duration(reloadIntervalDuration))
	defer ticker.Stop()

	for {
		select {
		case <-m.graceShut:
			return
		case <-ticker.C:
			select {
			case <-m.triggerReload:
				m.reload()
			case <-m.graceShut:
				return
			}
		}
	}
}
This is where the actual data-processing logic begins; it deserves a separate article of its own.
The Performance Problem
Now let's look at the problem mentioned above: reload can be triggered so frequently that it hurts performance.
Some endpoints resources in the kube-system namespace are in a permanently changing state, which you can confirm with kubectl get endpoints:
[~]$ date +%F_%R:%S; \
k get endpoints -n kube-system kube-controller-manager -o yaml \
|grep resourceVersion
2022-01-22_15:34:04 resourceVersion: "254417351"
[~]$ date +%F_%R:%S; \
k get endpoints -n kube-system kube-controller-manager -o yaml \
|grep resourceVersion
2022-01-22_15:34:06 resourceVersion: "254417361"
This is by design in Kubernetes: these endpoints are used for control-plane leader election, and the lease is renewed every few seconds, so even though the substantive content of the resource never changes, its resourceVersion keeps being bumped.
And precisely because of this, when Prometheus is deployed in a Kubernetes cluster and watches kube-system resources, its reload frequency suffers: the endpoints change continuously, so reload is triggered continuously.
As noted earlier, the updater function in discovery/manager.go triggers the reload chain without checking whether the targetgroup information substantively changed. So even though these kube-system endpoints only bump their version numbers, they still genuinely drive the whole reload chain.