# Listing metadata from the file viewer: 530 lines, 18 KiB, YAML.
---
# OpenTelemetryCollector "agent" for OpenObserve, managed by the OpenTelemetry
# Operator and run as a DaemonSet (see spec.mode). It tails pod logs, collects
# kubelet / host / Prometheus metrics, accepts OTLP traces, and ships
# everything to the in-cluster OpenObserve service over OTLP/HTTP.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: openobserve-collector-agent
  namespace: openobserve-collector
spec:
  managementState: managed
  image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.127.0
  config:
    exporters:
      # Main exporter used by the logs, metrics and traces pipelines.
      # The Authorization value comes from the OPENOBSERVE_AUTH env var, which
      # spec.env populates from the openobserve-collector-credentials secret.
      otlphttp/openobserve:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${env:OPENOBSERVE_AUTH}
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        metrics_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/metrics
        traces_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/traces
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192
      # Same target, but tags logs with the `k8s_events` stream via a header.
      # Defined here but not referenced by any pipeline in this manifest.
      otlphttp/openobserve_k8s_events:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${env:OPENOBSERVE_AUTH}
          stream-name: k8s_events
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192

    extensions:
      # Configured but not enabled: this manifest has no `service.extensions`
      # list that references it.
      zpages: {}

    processors:
      batch:
        send_batch_size: 5000
        timeout: 30s
        send_batch_max_size: 6000
        # NOTE(review): upstream documents `metadata_keys` as keys of the
        # request's client metadata, not resource attributes - confirm these
        # two entries have the intended effect.
        metadata_keys:
          - k8s.namespace.name
          - k8s.pod.name

      k8sattributes:
        auth_type: serviceAccount
        extract:
          labels:
            - from: pod
              key: app.kubernetes.io/name
              tag_name: service.name
            - from: pod
              key: app.kubernetes.io/component
              tag_name: k8s.app.component
          metadata:
            - k8s.pod.name
            - k8s.namespace.name
            - k8s.node.name
        filter:
          # Node name is read from the K8S_NODE_NAME env var set in spec.env.
          node_from_env_var: K8S_NODE_NAME
        passthrough: false
        # Association rules are listed from most to least specific; the last
        # one falls back to the address of the incoming connection.
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
              - from: resource_attribute
                name: k8s.node.name
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
          - sources:
              - from: connection

      # Copies k8s.node.name into an underscore-named attribute.
      attributes:
        actions:
          - key: k8s_node_name
            from_attribute: k8s.node.name
            action: upsert

      # Defined but not referenced by any pipeline in this manifest.
      groupbyattrs/final:
        keys:
          - k8s_node_name
          - direction

      # Renames dotted metric names to underscore names.
      metricstransform:
        transforms:
          - include: system.network.io
            match_type: strict
            action: update
            new_name: system_network_io
          - include: system.cpu.time
            match_type: strict
            action: update
            new_name: k8s_node_cpu_time
          # The next two source metrics are mapped to the same target name.
          - include: system.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: k8s.node.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: system.memory.usage
            match_type: strict
            action: update
            new_name: system_memory_usage
          - include: system.memory.utilization
            match_type: strict
            action: update
            new_name: k8s_node_memory_utilization
          - include: system.filesystem.utilization
            match_type: strict
            action: update
            new_name: k8s_node_filesystem_utilization
          # The next two entries keep the metric name unchanged.
          - include: container_fs_reads_total
            match_type: strict
            action: update
            new_name: container_fs_reads_total
          - include: container_fs_writes_total
            match_type: strict
            action: update
            new_name: container_fs_writes_total
          - include: k8s.pod.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_request_utilization
          - include: k8s.pod.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_limit_utilization
          - include: k8s.pod.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_request_utilization
          - include: k8s.pod.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_limit_utilization
          - include: k8s.container.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_request_utilization
          - include: k8s.container.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_limit_utilization
          - include: k8s.container.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_request_utilization
          - include: k8s.container.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_limit_utilization

      # Defined but not referenced by any pipeline in this manifest.
      resourcedetection:
        detectors:
          - system
          - env
          - k8snode
        override: true
        system:
          hostname_sources:
            - os
            - dns

      # Filter out high-cardinality, low-value metrics
      filter/drop_noisy_metrics:
        metrics:
          exclude:
            match_type: regexp
            metric_names:
              - ".*_bucket$"  # Drop histogram buckets for non-critical metrics
              - "go_.*"  # Drop Go runtime metrics
              - "promhttp_.*"  # Drop Prometheus HTTP metrics
              - "process_.*"  # Drop process metrics
              - "container_spec_.*"  # Drop container spec metrics
              - "container_tasks_state"  # Drop task state metrics

      # Add intelligent trace sampling to reduce from 100% to ~15-20%
      # NOTE(review): tail sampling decides per collector instance; in
      # daemonset mode spans of one trace may arrive at different agents -
      # confirm that is acceptable.
      tail_sampling:
        decision_wait: 10s
        num_traces: 50000
        expected_new_traces_per_sec: 10
        policies:
          # Always sample error traces (100%)
          - name: errors
            type: status_code
            status_code:
              status_codes: [ERROR]
          # Always sample slow traces >1s (100%)
          - name: slow-traces
            type: latency
            latency:
              threshold_ms: 1000
          # Always sample traces from critical namespaces (100%)
          - name: critical-namespaces
            type: string_attribute
            string_attribute:
              key: k8s.namespace.name
              values: [kube-system, openobserve, cert-manager, ingress-nginx, longhorn-system]
          # Sample 5% of normal traces (reduced from 10% for resource optimization)
          - name: probabilistic
            type: probabilistic
            probabilistic:
              sampling_percentage: 5

    receivers:
      # Tails container log files from the host's /var/log/pods (mounted via
      # spec.volumes) and parses docker / CRI-O / containerd line formats.
      filelog/std:
        exclude:
          # Skip this collector's own pods so it does not re-ingest its own
          # output. The glob assumes the operator names the pods
          # `<metadata.name>-collector-*` in metadata.namespace - verify
          # against the running pod name.
          - /var/log/pods/openobserve-collector_openobserve-collector-agent-collector*_*/*/*.log
        include:
          - /var/log/pods/*/*/*.log
        include_file_name: false
        include_file_path: true
        operators:
          # Route each line to the parser matching its container runtime format.
          - id: get-format
            routes:
              - expr: body matches "^\\{"
                output: parser-docker
              - expr: body matches "^[^ Z]+ "
                output: parser-crio
              - expr: body matches "^[^ Z]+Z"
                output: parser-containerd
            type: router
          - id: parser-crio
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              layout: 2006-01-02T15:04:05.999999999Z07:00
              layout_type: gotime
              parse_from: attributes.time
            type: regex_parser
          - id: parser-containerd
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: regex_parser
          - id: parser-docker
            output: extract_metadata_from_filepath
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: json_parser
          # Derive namespace / pod / uid / container / restart count from the
          # log file path.
          - cache:
              size: 128
            id: extract_metadata_from_filepath
            parse_from: attributes["log.file.path"]
            regex: ^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$
            type: regex_parser
          # Move the parsed fields to their final body / attribute / resource
          # locations.
          - from: attributes.log
            to: body
            type: move
          - from: attributes.stream
            to: attributes["log.iostream"]
            type: move
          - from: attributes.container_name
            to: resource["k8s.container.name"]
            type: move
          - from: attributes.namespace
            to: resource["k8s.namespace.name"]
            type: move
          - from: attributes.pod_name
            to: resource["k8s.pod.name"]
            type: move
          - from: attributes.restart_count
            to: resource["k8s.container.restart_count"]
            type: move
          - from: attributes.uid
            to: resource["k8s.pod.uid"]
            type: move
        start_at: end

      # Host metrics, read through the host root mounted at /hostfs
      # (see spec.volumeMounts).
      hostmetrics:
        collection_interval: 60s
        root_path: /hostfs
        scrapers:
          cpu: {}
          disk: {}
          memory: {}
          filesystem:
            exclude_fs_types:
              fs_types:
                - autofs
                - binfmt_misc
                - bpf
                - cgroup2
                - configfs
                - debugfs
                - devpts
                - devtmpfs
                - fusectl
                - hugetlbfs
                - iso9660
                - mqueue
                - nsfs
                - overlay
                - proc
                - procfs
                - pstore
                - rpc_pipefs
                - securityfs
                - selinuxfs
                - squashfs
                - sysfs
                - tracefs
              match_type: strict
            exclude_mount_points:
              match_type: regexp
              mount_points:
                - /dev/.*
                - /proc/.*
                - /sys/.*
                - /run/k3s/containerd/.*
                - /var/lib/docker/.*
                - /var/lib/kubelet/.*
                - /snap/.*
          load: {}
          network: {}

      # Kubelet stats; the endpoint uses the node IP that spec.env injects
      # as K8S_NODE_IP.
      kubeletstats:
        auth_type: serviceAccount
        collection_interval: 60s
        endpoint: https://${env:K8S_NODE_IP}:10250
        extra_metadata_labels:
          - container.id
          - k8s.volume.type
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
          - volume
        metrics:
          k8s.pod.cpu_limit_utilization:
            enabled: true
          k8s.pod.cpu_request_utilization:
            enabled: true
          k8s.pod.memory_limit_utilization:
            enabled: true
          k8s.pod.memory_request_utilization:
            enabled: true
          k8s.container.cpu_limit_utilization:
            enabled: true
          k8s.container.cpu_request_utilization:
            enabled: true
          k8s.container.memory_limit_utilization:
            enabled: true
          k8s.container.memory_request_utilization:
            enabled: true

      otlp:
        protocols:
          grpc: {}
          http: {}

      # NOTE(review): the kubernetes_sd jobs below select targets by
      # namespace/label only. Because this collector runs as a DaemonSet,
      # every agent presumably scrapes the same targets - confirm whether a
      # node-local filter (or a single-replica collector) is wanted.
      prometheus:
        config:
          scrape_configs:
            # Self-scrape of the collector's own telemetry endpoint
            # (service.telemetry.metrics.address below).
            - job_name: otel-collector
              scrape_interval: 30s
              static_configs:
                - targets:
                    - 0.0.0.0:8888
            - job_name: postgresql-cnpg
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: pod
                  namespaces:
                    names:
                      - postgresql-system
              relabel_configs:
                # Only scrape pods whose cnpg.io/cluster label is postgres-shared
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  action: keep
                  regex: postgres-shared
                # Use the metrics port (9187)
                - source_labels: [__meta_kubernetes_pod_container_port_name]
                  action: keep
                  regex: metrics
                # Set the metrics path
                - target_label: __metrics_path__
                  replacement: /metrics
                # Add useful labels
                - source_labels: [__meta_kubernetes_pod_name]
                  target_label: instance
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  target_label: cnpg_cluster
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
            # Celery and Redis metrics - direct scraping
            - job_name: redis-exporter
              scrape_interval: 30s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - redis-system
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: redis-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
            - job_name: celery-metrics-exporter
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - celery-monitoring
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: celery-metrics-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
            # Longhorn metrics still handled by target allocator via ServiceMonitor

    service:
      telemetry:
        metrics:
          # NOTE(review): upstream has deprecated `address` in favour of
          # `readers` - confirm it is still honoured by collector 0.127.0.
          address: 0.0.0.0:8888
      # Processor order in every pipeline: k8sattributes first, so it still
      # sees the client connection used by its `from: connection` association
      # rule; batch last, so only data that survived filtering / sampling is
      # batched.
      # NOTE(review): no memory_limiter processor is configured; upstream
      # guidance recommends one as the first processor - consider adding it.
      pipelines:
        logs:
          exporters:
            - otlphttp/openobserve
          processors:
            - k8sattributes
            - batch
          receivers:
            - filelog/std
        metrics:
          exporters:
            - otlphttp/openobserve
          processors:
            - k8sattributes
            - attributes
            - filter/drop_noisy_metrics
            - metricstransform
            - batch
          receivers:
            - kubeletstats
            - hostmetrics
            - prometheus
        traces:
          exporters:
            - otlphttp/openobserve
          processors:
            - k8sattributes
            - tail_sampling
            - batch
          receivers:
            - otlp

  # Environment consumed by the collector config above via ${env:...} and
  # by k8sattributes' node_from_env_var.
  env:
    - name: K8S_NODE_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: OPENOBSERVE_AUTH
      valueFrom:
        secretKeyRef:
          name: openobserve-collector-credentials
          key: authorization
  ingress:
    route: {}
  mode: daemonset
  observability:
    metrics:
      enableMetrics: true
  podDisruptionBudget:
    maxUnavailable: 1
  # NOTE(review): `replicas` presumably has no effect in daemonset mode - confirm.
  replicas: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 300m
      memory: 512Mi
  # Runs as root (uid/gid 0).
  # NOTE(review): presumably required to read the host log and /proc mounts
  # below - confirm.
  securityContext:
    runAsUser: 0
    runAsGroup: 0
  serviceAccount: openobserve-collector
  hostNetwork: true
  upgradeStrategy: automatic
  # Read-only host mounts backing the filelog and hostmetrics receivers.
  volumeMounts:
    - mountPath: /hostfs
      name: hostfs
      readOnly: true
    - mountPath: /var/log/pods
      name: varlogpods
      readOnly: true
    - mountPath: /hostfs/proc
      name: proc
      readOnly: true
    - mountPath: /hostfs/sys
      name: sys
      readOnly: true
  volumes:
    - hostPath:
        path: /
      name: hostfs
    - hostPath:
        path: /var/log/pods
      name: varlogpods
    - hostPath:
        path: /proc
      name: proc
    - hostPath:
        path: /sys
      name: sys