Files
Keybard-Vagabond-Demo/manifests/infrastructure/openobserve-collector/agent-collector.yaml
Michael DiLeo 7327d77dcd redaction (#1)
Add the redacted source file for demo purposes

Reviewed-on: https://source.michaeldileo.org/michael_dileo/Keybard-Vagabond-Demo/pulls/1
Co-authored-by: Michael DiLeo <michael_dileo@proton.me>
Co-committed-by: Michael DiLeo <michael_dileo@proton.me>
2025-12-24 13:40:47 +00:00

530 lines
18 KiB
YAML

---
# OpenTelemetry Collector agent (daemonset, one pod per node).
# Ships node/pod logs, host & kubelet metrics, and OTLP traces to OpenObserve.
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: openobserve-collector-agent
  namespace: openobserve-collector
spec:
  managementState: managed
  image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.127.0
  config:
    exporters:
      otlphttp/openobserve:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${OPENOBSERVE_AUTH}
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        metrics_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/metrics
        traces_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/traces
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192
      # NOTE(review): this exporter is not referenced by any pipeline below —
      # confirm whether a k8s-events pipeline was redacted or this is dead config.
      otlphttp/openobserve_k8s_events:
        endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080
        headers:
          Authorization: ${OPENOBSERVE_AUTH}
          stream-name: k8s_events
        logs_endpoint: http://openobserve-openobserve-standalone.openobserve.svc.cluster.local:5080/api/default/v1/logs
        # HTTP client configuration to match OpenObserve HTTP/1.1
        compression: gzip
        max_idle_conns: 50
        max_idle_conns_per_host: 5
        idle_conn_timeout: 120s
        read_buffer_size: 8192
        write_buffer_size: 8192
    extensions:
      # NOTE(review): declared but not listed under service.extensions, so the
      # collector will not start it — add `extensions: [zpages]` to `service` if intended.
      zpages: {}
    processors:
      batch:
        send_batch_size: 5000
        timeout: 30s
        send_batch_max_size: 6000
        metadata_keys:
          - k8s.namespace.name
          - k8s.pod.name
      # Enrich telemetry with pod/namespace/node metadata from the API server.
      k8sattributes:
        auth_type: serviceAccount
        extract:
          labels:
            - from: pod
              key: app.kubernetes.io/name
              tag_name: service.name
            - from: pod
              key: app.kubernetes.io/component
              tag_name: k8s.app.component
          metadata:
            - k8s.pod.name
            - k8s.namespace.name
            - k8s.node.name
        filter:
          # Agent mode: only watch pods scheduled on this node.
          node_from_env_var: K8S_NODE_NAME
        passthrough: false
        pod_association:
          - sources:
              - from: resource_attribute
                name: k8s.pod.uid
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
              - from: resource_attribute
                name: k8s.node.name
          - sources:
              - from: resource_attribute
                name: k8s.pod.ip
          - sources:
              - from: resource_attribute
                name: k8s.pod.name
              - from: resource_attribute
                name: k8s.namespace.name
          - sources:
              - from: connection
      attributes:
        actions:
          - key: k8s_node_name
            from_attribute: k8s.node.name
            action: upsert
      # NOTE(review): not referenced by any pipeline — confirm or remove.
      groupbyattrs/final:
        keys:
          - k8s_node_name
          - direction
      # Rename dotted metric names to underscore form for OpenObserve dashboards.
      metricstransform:
        transforms:
          - include: system.network.io
            match_type: strict
            action: update
            new_name: system_network_io
          - include: system.cpu.time
            match_type: strict
            action: update
            new_name: k8s_node_cpu_time
          - include: system.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: k8s.node.cpu.utilization
            match_type: strict
            action: update
            new_name: k8s_node_cpu_utilization
          - include: system.memory.usage
            match_type: strict
            action: update
            new_name: system_memory_usage
          - include: system.memory.utilization
            match_type: strict
            action: update
            new_name: k8s_node_memory_utilization
          - include: system.filesystem.utilization
            match_type: strict
            action: update
            new_name: k8s_node_filesystem_utilization
          # NOTE(review): the next two rename a metric to its own name (no-op) —
          # kept as-is; confirm they are placeholders before removing.
          - include: container_fs_reads_total
            match_type: strict
            action: update
            new_name: container_fs_reads_total
          - include: container_fs_writes_total
            match_type: strict
            action: update
            new_name: container_fs_writes_total
          - include: k8s.pod.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_request_utilization
          - include: k8s.pod.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_cpu_limit_utilization
          - include: k8s.pod.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_request_utilization
          - include: k8s.pod.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_pod_memory_limit_utilization
          - include: k8s.container.cpu_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_request_utilization
          - include: k8s.container.cpu_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_cpu_limit_utilization
          - include: k8s.container.memory_request_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_request_utilization
          - include: k8s.container.memory_limit_utilization
            match_type: strict
            action: update
            new_name: k8s_container_memory_limit_utilization
      # NOTE(review): not referenced by any pipeline — confirm or remove.
      resourcedetection:
        detectors:
          - system
          - env
          - k8snode
        override: true
        system:
          hostname_sources:
            - os
            - dns
      # Filter out high-cardinality, low-value metrics
      filter/drop_noisy_metrics:
        metrics:
          exclude:
            match_type: regexp
            metric_names:
              - ".*_bucket$" # Drop histogram buckets for non-critical metrics
              - "go_.*" # Drop Go runtime metrics
              - "promhttp_.*" # Drop Prometheus HTTP metrics
              - "process_.*" # Drop process metrics
              - "container_spec_.*" # Drop container spec metrics
              - "container_tasks_state" # Drop task state metrics
      # Add intelligent trace sampling to reduce from 100% to ~15-20%
      tail_sampling:
        decision_wait: 10s
        num_traces: 50000
        expected_new_traces_per_sec: 10
        policies:
          # Always sample error traces (100%)
          - name: errors
            type: status_code
            status_code:
              status_codes: [ERROR]
          # Always sample slow traces >1s (100%)
          - name: slow-traces
            type: latency
            latency:
              threshold_ms: 1000
          # Always sample traces from critical namespaces (100%)
          - name: critical-namespaces
            type: string_attribute
            string_attribute:
              key: k8s.namespace.name
              values: [kube-system, openobserve, cert-manager, ingress-nginx, longhorn-system]
          # Sample 5% of normal traces (reduced from 10% for resource optimization)
          - name: probabilistic
            type: probabilistic
            probabilistic:
              sampling_percentage: 5
    receivers:
      # Tail container logs from the node's /var/log/pods tree.
      filelog/std:
        exclude:
          - /var/log/pods/default_daemonset-collector*_*/opentelemetry-collector/*.log
        include:
          - /var/log/pods/*/*/*.log
        include_file_name: false
        include_file_path: true
        operators:
          # Route each line to the parser matching its runtime's log format.
          - id: get-format
            routes:
              - expr: body matches "^\\{"
                output: parser-docker
              - expr: body matches "^[^ Z]+ "
                output: parser-crio
              - expr: body matches "^[^ Z]+Z"
                output: parser-containerd
            type: router
          - id: parser-crio
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              # Quoted: looks like a YAML 1.1 timestamp to some parsers.
              layout: "2006-01-02T15:04:05.999999999Z07:00"
              layout_type: gotime
              parse_from: attributes.time
            type: regex_parser
          - id: parser-containerd
            output: extract_metadata_from_filepath
            regex: ^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: regex_parser
          - id: parser-docker
            output: extract_metadata_from_filepath
            timestamp:
              layout: "%Y-%m-%dT%H:%M:%S.%LZ"
              parse_from: attributes.time
            type: json_parser
          # Derive namespace/pod/uid/container/restart_count from the log path.
          - cache:
              size: 128
            id: extract_metadata_from_filepath
            parse_from: attributes["log.file.path"]
            regex: ^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$
            type: regex_parser
          - from: attributes.log
            to: body
            type: move
          - from: attributes.stream
            to: attributes["log.iostream"]
            type: move
          - from: attributes.container_name
            to: resource["k8s.container.name"]
            type: move
          - from: attributes.namespace
            to: resource["k8s.namespace.name"]
            type: move
          - from: attributes.pod_name
            to: resource["k8s.pod.name"]
            type: move
          - from: attributes.restart_count
            to: resource["k8s.container.restart_count"]
            type: move
          - from: attributes.uid
            to: resource["k8s.pod.uid"]
            type: move
        start_at: end
      hostmetrics:
        collection_interval: 60s
        root_path: /hostfs
        scrapers:
          cpu: {}
          disk: {}
          memory: {}
          filesystem:
            exclude_fs_types:
              fs_types:
                - autofs
                - binfmt_misc
                - bpf
                - cgroup2
                - configfs
                - debugfs
                - devpts
                - devtmpfs
                - fusectl
                - hugetlbfs
                - iso9660
                - mqueue
                - nsfs
                - overlay
                - proc
                - procfs
                - pstore
                - rpc_pipefs
                - securityfs
                - selinuxfs
                - squashfs
                - sysfs
                - tracefs
              match_type: strict
            exclude_mount_points:
              match_type: regexp
              mount_points:
                - /dev/.*
                - /proc/.*
                - /sys/.*
                - /run/k3s/containerd/.*
                - /var/lib/docker/.*
                - /var/lib/kubelet/.*
                - /snap/.*
          load: {}
          network: {}
      kubeletstats:
        auth_type: serviceAccount
        collection_interval: 60s
        endpoint: https://${env:K8S_NODE_IP}:10250
        extra_metadata_labels:
          - container.id
          - k8s.volume.type
        # Kubelet serves a self-signed cert on 10250.
        insecure_skip_verify: true
        metric_groups:
          - node
          - pod
          - container
          - volume
        metrics:
          k8s.pod.cpu_limit_utilization:
            enabled: true
          k8s.pod.cpu_request_utilization:
            enabled: true
          k8s.pod.memory_limit_utilization:
            enabled: true
          k8s.pod.memory_request_utilization:
            enabled: true
          k8s.container.cpu_limit_utilization:
            enabled: true
          k8s.container.cpu_request_utilization:
            enabled: true
          k8s.container.memory_limit_utilization:
            enabled: true
          k8s.container.memory_request_utilization:
            enabled: true
      otlp:
        protocols:
          grpc: {}
          http: {}
      prometheus:
        config:
          scrape_configs:
            # Self-scrape of the collector's own telemetry endpoint.
            - job_name: otel-collector
              scrape_interval: 30s
              static_configs:
                - targets:
                    - "0.0.0.0:8888"
            - job_name: postgresql-cnpg
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: pod
                  namespaces:
                    names:
                      - postgresql-system
              relabel_configs:
                # Only scrape pods with the cnpg.io/cluster label
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  action: keep
                  regex: postgres-shared
                # Use the metrics port (9187)
                - source_labels: [__meta_kubernetes_pod_container_port_name]
                  action: keep
                  regex: metrics
                # Set the metrics path
                - target_label: __metrics_path__
                  replacement: /metrics
                # Add useful labels
                - source_labels: [__meta_kubernetes_pod_name]
                  target_label: instance
                - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
                  target_label: cnpg_cluster
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
            # Celery and Redis metrics - direct scraping
            - job_name: redis-exporter
              scrape_interval: 30s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - redis-system
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: redis-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
            - job_name: celery-metrics-exporter
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: endpoints
                  namespaces:
                    names:
                      - celery-monitoring
              relabel_configs:
                - source_labels: [__meta_kubernetes_service_name]
                  action: keep
                  regex: celery-metrics-exporter
                - source_labels: [__meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: metrics
                - source_labels: [__meta_kubernetes_namespace]
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_service_name]
                  target_label: kubernetes_service_name
          # Longhorn metrics still handled by target allocator via ServiceMonitor
    service:
      telemetry:
        metrics:
          address: "0.0.0.0:8888"
      pipelines:
        logs:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
          receivers:
            - filelog/std
        metrics:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
            - attributes
            - filter/drop_noisy_metrics
            - metricstransform
          receivers:
            - kubeletstats
            - hostmetrics
            - prometheus
        traces:
          exporters:
            - otlphttp/openobserve
          processors:
            - batch
            - k8sattributes
            - tail_sampling
          receivers:
            - otlp
  env:
    - name: K8S_NODE_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    # Basic-auth header value, injected from a Secret (never inline credentials).
    - name: OPENOBSERVE_AUTH
      valueFrom:
        secretKeyRef:
          name: openobserve-collector-credentials
          key: authorization
  ingress:
    route: {}
  mode: daemonset
  observability:
    metrics:
      enableMetrics: true
  podDisruptionBudget:
    maxUnavailable: 1
  replicas: 1
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 300m
      memory: 512Mi
  # Root is required to read /var/log/pods and host filesystems.
  securityContext:
    runAsUser: 0
    runAsGroup: 0
  serviceAccount: openobserve-collector
  hostNetwork: true
  upgradeStrategy: automatic
  volumeMounts:
    - mountPath: /hostfs
      name: hostfs
      readOnly: true
    - mountPath: /var/log/pods
      name: varlogpods
      readOnly: true
    - mountPath: /hostfs/proc
      name: proc
      readOnly: true
    - mountPath: /hostfs/sys
      name: sys
      readOnly: true
  volumes:
    - hostPath:
        path: /
      name: hostfs
    - hostPath:
        path: /var/log/pods
      name: varlogpods
    - hostPath:
        path: /proc
      name: proc
    - hostPath:
        path: /sys
      name: sys