# manifests/infrastructure/postgresql/cluster-shared.yaml

---
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres-shared
  namespace: postgresql-system
  labels:
    app: postgresql-shared
    # Quoted so the label value stays a string instead of a YAML boolean
    backup.longhorn.io/enable: 'true'
spec:
  # Number of PostgreSQL instances in this cluster
  instances: 3

  # PostGIS-enabled image built to be compatible with CloudNativePG.
  # Standard image, kept for reference:
  #   imageName: ghcr.io/cloudnative-pg/postgresql:16.6
  imageName: registry.keyboardvagabond.com/library/cnpg-postgis:16.6-3.4-v2

  # Bootstrap with initial database and user
  bootstrap:
    initdb:
      # Initial database and the role that owns it
      database: shared_db
      owner: shared_user

      # Encoding and locale for the new cluster
      encoding: UTF8
      localeCType: en_US.UTF-8
      localeCollate: en_US.UTF-8

      # PostGIS extensions installed into the template database so they
      # are available to all databases. List order is kept as written:
      # fuzzystrmatch comes before postgis_tiger_geocoder.
      postInitTemplateSQL:
        - 'CREATE EXTENSION IF NOT EXISTS postgis;'
        - 'CREATE EXTENSION IF NOT EXISTS postgis_topology;'
        - 'CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;'
        - 'CREATE EXTENSION IF NOT EXISTS postgis_tiger_geocoder;'


  # PostgreSQL server parameters, sized conservatively against the pod's
  # 4Gi memory limit and 2-CPU limit. All values are passed as strings.
  postgresql:
    parameters:
      # ---- Connections ----
      # Reduced based on actual usage (7 connections observed)
      max_connections: '150'

      # ---- Memory ----
      # 25% of the 4GB memory limit
      shared_buffers: '1GB'
      # ~75% of the 4GB memory limit
      effective_cache_size: '3GB'
      # Scaled for the 4GB memory limit
      maintenance_work_mem: '256MB'
      # Increased from 14MB; 150 connections x 24MB = 3.6GB.
      # NOTE(review): 3.6GB plus shared_buffers (1GB) is above the 4Gi
      # memory limit -- confirm this is acceptable for the observed load.
      work_mem: '24MB'

      # ---- WAL and checkpoints ----
      wal_buffers: '24MB'
      min_wal_size: '1GB'
      max_wal_size: '6GB'
      checkpoint_completion_target: '0.9'

      # ---- Planner ----
      default_statistics_target: '100'
      # Good for SSD storage
      random_page_cost: '1.1'
      effective_io_concurrency: '200'

      # ---- Worker processes (tuned for the 2-core limit) ----
      max_worker_processes: '8'
      # Increased for better OLTP workload
      max_parallel_workers: '6'
      # Max 3 workers per query
      max_parallel_workers_per_gather: '3'
      # For maintenance operations
      max_parallel_maintenance_workers: '3'

      # ---- Replication ----
      # Timeouts increased from 5s for the slower 100Mbps VLAN
      wal_sender_timeout: '10s'
      wal_receiver_timeout: '10s'
      # Favor data integrity.
      # NOTE(review): no synchronous-replica settings are visible in this
      # manifest, so replication itself appears asynchronous -- confirm.
      synchronous_commit: 'on'

      # ---- Logging ----
      # Log only slow queries: anything running longer than 5 seconds
      log_min_duration_statement: '5000'
      log_statement: 'none'
      log_line_prefix: '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '

      # ---- Activity tracking ----
      # 8KB, so complex queries show their full text in pg_stat_activity
      track_activity_query_size: '8192'

  
  # Data volume, provisioned from the PostgreSQL-optimized storage class
  storage:
    storageClass: longhorn-postgresql
    size: 50Gi

  # WAL kept on its own volume for better I/O performance
  walStorage:
    storageClass: longhorn-postgresql
    size: 10Gi
  
  # Pod anti-affinity for the HA cluster: distribute instances across
  # nodes, keyed on the node hostname
  affinity:
    topologyKey: kubernetes.io/hostname
    enablePodAntiAffinity: true
  
  # CPU and memory for each instance: capped at 2 CPUs / 4Gi, with
  # 750m CPU / 1.5Gi requested
  resources:
    limits:
      cpu: 2000m
      memory: 4Gi
    requests:
      cpu: 750m
      memory: 1.5Gi

  # Enable superuser access for maintenance.
  # NOTE(review): this widens privileged access to the cluster; confirm it
  # is still required outside maintenance windows.
  enableSuperuserAccess: true
  
  # TLS certificates, managed through cert-manager
  certificates:
    # Server side: the same secret is referenced for the certificate and
    # for its CA
    serverCASecret: postgresql-shared-server-cert
    serverTLSSecret: postgresql-shared-server-cert
    # Client side: the same secret is referenced for the client CA and for
    # replication TLS
    clientCASecret: postgresql-shared-client-cert
    replicationTLSSecret: postgresql-shared-client-cert
  
  # Replication slots, turned on for the HA cluster
  replicationSlots:
    # Replica synchronization for HA
    synchronizeReplicas:
      enabled: true
    # HA replication slots for the multi-instance cluster
    highAvailability:
      enabled: true
    
  # Prometheus metrics
  monitoring:
    # ConfigMaps (name + key) holding custom metric queries, kept for
    # dashboard compatibility
    customQueriesConfigMap:
      - name: postgresql-dashboard-metrics
        key: queries
      - name: postgresql-connection-metrics
        key: custom-queries
    enablePodMonitor: true
  
  # Startup delay and health-probe timings for the 3-instance HA cluster
  startDelay: 30
  probes:
    # Startup: wait 60s so PostgreSQL can start and begin recovery, then
    # allow 90 failures x 10s period = 15 minutes for replica recovery
    # with Longhorn storage.
    startup:
      failureThreshold: 90
      initialDelaySeconds: 60
      periodSeconds: 10
      timeoutSeconds: 10
    # Readiness: 30s head start so the instance manager can initialize
    readiness:
      failureThreshold: 3
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
    # Liveness: held back 120s to allow full startup before checks begin
    liveness:
      failureThreshold: 3
      initialDelaySeconds: 120
      periodSeconds: 30
      timeoutSeconds: 10
  
  # Primary updates: use switchover instead of restart to prevent restart
  # loops
  primaryUpdateStrategy: unsupervised
  primaryUpdateMethod: switchover

  # S3 backup configuration for CloudNativePG - TEMPORARILY DISABLED
  # backup:
  #   # Backup retention policy
  #   retentionPolicy: "30d"  # Keep backups for 30 days
  #   
  #   # S3 backup configuration for Backblaze B2
  #   barmanObjectStore:
  #     destinationPath: s3://postgresql-backups/cnpg
  #     s3Credentials:
  #       accessKeyId:
  #         name: postgresql-s3-backup-credentials
  #         key: AWS_ACCESS_KEY_ID
  #       secretAccessKey:
  #         name: postgresql-s3-backup-credentials
  #         key: AWS_SECRET_ACCESS_KEY
  #     endpointURL: https://s3.eu-central-003.backblazeb2.com
  #     
  #     # Backblaze B2 specific configuration
  #     data:
  #       compression: gzip
  #       encryption: AES256
  #       immediateCheckpoint: true
  #       jobs: 2  # Parallel backup jobs
  #     
  #     wal:
  #       compression: gzip
  #       encryption: AES256
  #       maxParallel: 2  # Parallel WAL archiving