Files
Keybard-Vagabond-Demo/manifests/infrastructure/openobserve-collector/agent-collector.yaml
Michael DiLeo 7327d77dcd redaction (#1)
Add the redacted source file for demo purposes

Reviewed-on: https://source.michaeldileo.org/michael_dileo/Keybard-Vagabond-Demo/pulls/1
Co-authored-by: Michael DiLeo <michael_dileo@proton.me>
Co-committed-by: Michael DiLeo <michael_dileo@proton.me>
2025-12-24 13:40:47 +00:00

530 lines
18 KiB
YAML

---
# OpenTelemetry Collector agent (daemonset, one pod per node).
# Ships node/pod logs, host & kubelet metrics, and OTLP traces to OpenObserve.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: openobserve-collector-agent
  namespace: openobserve-collector
spec:
  managementState: managed
  image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.127.0
  config:
    exporters:
      otlphttp/openobserve:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${OPENOBSERVE_AUTH}
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        metrics_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/metrics
        traces_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/traces
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192
      # NOTE(review): this exporter is not referenced by any pipeline below —
      # confirm whether a k8s-events pipeline was redacted or this is dead config.
      otlphttp/openobserve_k8s_events:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${OPENOBSERVE_AUTH}
          stream-name: k8s_events
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192
    extensions:
      # NOTE(review): declared but not listed under service.extensions, so the
      # collector will not start it — add `extensions: [zpages]` to `service` if intended.
      zpages: {}
    processors:
      batch:
        send_batch_size: 5000
        timeout: 30s
        send_batch_max_size: 6000
        metadata_keys:
          - k8s.namespace.name
          - k8s.pod.name
      # Enrich telemetry with pod/namespace/node metadata from the API server.
      k8sattributes:
        auth_type: serviceAccount
        extract:
          labels:
            - from: pod
              key: app.kubernetes.io/name
              tag_name: service.name
            - from: pod
              key: app.kubernetes.io/component
              tag_name: k8s.app.component
          metadata:
            - k8s.pod.name
            - k8s.namespace.name
            - k8s.node.name
        filter:
          # Agent mode: only watch pods scheduled on this node.
          node_from_env_var: K8S_NODE_NAME
        passthrough: false
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
              - from: resource_attribute
                name: k8s.node.name
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
          - sources:
              - from: connection
      attributes:
        actions:
          - key: k8s_node_name
            from_attribute: k8s.node.name
            action: upsert
      # NOTE(review): not referenced by any pipeline — confirm or remove.
      groupbyattrs/final:
        keys:
          - k8s_node_name
          - direction
      # Rename dotted metric names to underscore form for OpenObserve dashboards.
      metricstransform:
        transforms:
          - include: system.network.io
            match_type: strict
            action: update
            new_name: system_network_io
          - include: system.cpu.time
            match_type: strict
            action: update
            new_name: k8s_node_cpu_time
          - include: system.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: k8s.node.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: system.memory.usage
            match_type: strict
            action: update
            new_name: system_memory_usage
          - include: system.memory.utilization
            match_type: strict
            action: update
            new_name: k8s_node_memory_utilization
          - include: system.filesystem.utilization
            match_type: strict
            action: update
            new_name: k8s_node_filesystem_utilization
          # NOTE(review): the next two rename a metric to its own name (no-op) —
          # kept as-is; confirm they are placeholders before removing.
          - include: container_fs_reads_total
            match_type: strict
            action: update
            new_name: container_fs_reads_total
          - include: container_fs_writes_total
            match_type: strict
            action: update
            new_name: container_fs_writes_total
          - include: k8s.pod.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_request_utilization
          - include: k8s.pod.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_limit_utilization
          - include: k8s.pod.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_request_utilization
          - include: k8s.pod.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_limit_utilization
          - include: k8s.container.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_request_utilization
          - include: k8s.container.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_limit_utilization
          - include: k8s.container.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_request_utilization
          - include: k8s.container.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_limit_utilization
      # NOTE(review): not referenced by any pipeline — confirm or remove.
      resourcedetection:
        detectors:
          - system
          - env
          - k8snode
        override: true
        system:
          hostname_sources:
            - os
            - dns
      # Filter out high-cardinality, low-value metrics
      filter/drop_noisy_metrics:
        metrics:
          exclude:
            match_type: regexp
            metric_names:
              - ".*_bucket$" # Drop histogram buckets for non-critical metrics
              - "go_.*" # Drop Go runtime metrics
              - "promhttp_.*" # Drop Prometheus HTTP metrics
              - "process_.*" # Drop process metrics
              - "container_spec_.*" # Drop container spec metrics
              - "container_tasks_state" # Drop task state metrics
      # Add intelligent trace sampling to reduce from 100% to ~15-20%
      tail_sampling:
        decision_wait: 10s
        num_traces: 50000
        expected_new_traces_per_sec: 10
        policies:
          # Always sample error traces (100%)
          - name: errors
            type: status_code
            status_code:
              status_codes: [ERROR]
          # Always sample slow traces >1s (100%)
          - name: slow-traces
            type: latency
            latency:
              threshold_ms: 1000
          # Always sample traces from critical namespaces (100%)
          - name: critical-namespaces
            type: string_attribute
            string_attribute:
              key: k8s.namespace.name
              values: [kube-system, openobserve, cert-manager, ingress-nginx, longhorn-system]
          # Sample 5% of normal traces (reduced from 10% for resource optimization)
          - name: probabilistic
            type: probabilistic
            probabilistic:
              sampling_percentage: 5
    receivers:
      # Tail container logs from the node's /var/log/pods tree.
      filelog/std:
        exclude:
          - /var/log/pods/default_daemonset-collector*_*/opentelemetry-collector/*.log
        include:
          - /var/log/pods/*/*/*.log
        include_file_name: false
        include_file_path: true
        operators:
          # Route each line to the parser matching its runtime's log format.
          - id: get-format
            routes:
              - expr: body matches "^\\{"
                output: parser-docker
              - expr: body matches "^[^ Z]+ "
                output: parser-crio
              - expr: body matches "^[^ Z]+Z"
                output: parser-containerd
            type: router
          - id: parser-crio
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              # Quoted: looks like a YAML 1.1 timestamp to some parsers.
              layout: "2006-01-02T15:04:05.999999999Z07:00"
              layout_type: gotime
              parse_from: attributes.time
            type: regex_parser
          - id: parser-containerd
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: regex_parser
          - id: parser-docker
            output: extract_metadata_from_filepath
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: json_parser
          # Derive namespace/pod/uid/container/restart_count from the log path.
          - cache:
              size: 128
            id: extract_metadata_from_filepath
            parse_from: attributes["log.file.path"]
            regex: ^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$
            type: regex_parser
          - from: attributes.log
            to: body
            type: move
          - from: attributes.stream
            to: attributes["log.iostream"]
            type: move
          - from: attributes.container_name
            to: resource["k8s.container.name"]
            type: move
          - from: attributes.namespace
            to: resource["k8s.namespace.name"]
            type: move
          - from: attributes.pod_name
            to: resource["k8s.pod.name"]
            type: move
          - from: attributes.restart_count
            to: resource["k8s.container.restart_count"]
            type: move
          - from: attributes.uid
            to: resource["k8s.pod.uid"]
            type: move
        start_at: end
      hostmetrics:
        collection_interval: 60s
        root_path: /hostfs
        scrapers:
          cpu: {}
          disk: {}
          memory: {}
          filesystem:
            exclude_fs_types:
              fs_types:
                - autofs
                - binfmt_misc
                - bpf
                - cgroup2
                - configfs
                - debugfs
                - devpts
                - devtmpfs
                - fusectl
                - hugetlbfs
                - iso9660
                - mqueue
                - nsfs
                - overlay
                - proc
                - procfs
                - pstore
                - rpc_pipefs
                - securityfs
                - selinuxfs
                - squashfs
                - sysfs
                - tracefs
              match_type: strict
            exclude_mount_points:
              match_type: regexp
              mount_points:
                - /dev/.*
                - /proc/.*
                - /sys/.*
                - /run/k3s/containerd/.*
                - /var/lib/docker/.*
                - /var/lib/kubelet/.*
                - /snap/.*
          load: {}
          network: {}
      kubeletstats:
        auth_type: serviceAccount
        collection_interval: 60s
        endpoint: https://${env:K8S_NODE_IP}:10250
        extra_metadata_labels:
          - container.id
          - k8s.volume.type
        # Kubelet serves a self-signed cert on 10250.
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
          - volume
        metrics:
          k8s.pod.cpu_limit_utilization:
            enabled: true
          k8s.pod.cpu_request_utilization:
            enabled: true
          k8s.pod.memory_limit_utilization:
            enabled: true
          k8s.pod.memory_request_utilization:
            enabled: true
          k8s.container.cpu_limit_utilization:
            enabled: true
          k8s.container.cpu_request_utilization:
            enabled: true
          k8s.container.memory_limit_utilization:
            enabled: true
          k8s.container.memory_request_utilization:
            enabled: true
      otlp:
        protocols:
          grpc: {}
          http: {}
      prometheus:
        config:
          scrape_configs:
            # Self-scrape of the collector's own telemetry endpoint.
            - job_name: otel-collector
              scrape_interval: 30s
              static_configs:
                - targets:
                    - "0.0.0.0:8888"
            - job_name: postgresql-cnpg
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: pod
                  namespaces:
                    names:
                      - postgresql-system
              relabel_configs:
                # Only scrape pods with the cnpg.io/cluster label
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  action: keep
                  regex: postgres-shared
                # Use the metrics port (9187)
                - source_labels: [__meta_kubernetes_pod_container_port_name]
                  action: keep
                  regex: metrics
                # Set the metrics path
                - target_label: __metrics_path__
                  replacement: /metrics
                # Add useful labels
                - source_labels: [__meta_kubernetes_pod_name]
                  target_label: instance
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  target_label: cnpg_cluster
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
            # Celery and Redis metrics - direct scraping
            - job_name: redis-exporter
              scrape_interval: 30s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - redis-system
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: redis-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
            - job_name: celery-metrics-exporter
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - celery-monitoring
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: celery-metrics-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
          # Longhorn metrics still handled by target allocator via ServiceMonitor
    service:
      telemetry:
        metrics:
          address: "0.0.0.0:8888"
      pipelines:
        logs:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
          receivers:
            - filelog/std
        metrics:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
            - attributes
            - filter/drop_noisy_metrics
            - metricstransform
          receivers:
            - kubeletstats
            - hostmetrics
            - prometheus
        traces:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
            - tail_sampling
          receivers:
            - otlp
  env:
    - name: K8S_NODE_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    # Basic-auth header value, injected from a Secret (never inline credentials).
    - name: OPENOBSERVE_AUTH
      valueFrom:
        secretKeyRef:
          name: openobserve-collector-credentials
          key: authorization
  ingress:
    route: {}
  mode: daemonset
  observability:
    metrics:
      enableMetrics: true
  podDisruptionBudget:
    maxUnavailable: 1
  replicas: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 300m
      memory: 512Mi
  # Root is required to read /var/log/pods and host filesystems.
  securityContext:
    runAsUser: 0
    runAsGroup: 0
  serviceAccount: openobserve-collector
  hostNetwork: true
  upgradeStrategy: automatic
  volumeMounts:
    - mountPath: /hostfs
      name: hostfs
      readOnly: true
    - mountPath: /var/log/pods
      name: varlogpods
      readOnly: true
    - mountPath: /hostfs/proc
      name: proc
      readOnly: true
    - mountPath: /hostfs/sys
      name: sys
      readOnly: true
  volumes:
    - hostPath:
        path: /
      name: hostfs
    - hostPath:
        path: /var/log/pods
      name: varlogpods
    - hostPath:
        path: /proc
      name: proc
    - hostPath:
        path: /sys
      name: sys