# File: Keybard-Vagabond-Demo/manifests/infrastructure/postgresql/cluster-shared.yaml
# Size: 178 lines, 5.9 KiB (YAML)
# Commit timestamp shown by the repository viewer: 2025-12-24 14:35:17 +01:00
---
# CloudNativePG Cluster "postgres-shared": a 3-instance PostgreSQL cluster
# built on a PostGIS-enabled image, with separate data and WAL volumes,
# cert-manager issued TLS secrets and Prometheus monitoring.
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres-shared
  namespace: postgresql-system
  labels:
    app: postgresql-shared
    backup.longhorn.io/enable: "true"
spec:
  instances: 3

  # Use CloudNativePG-compatible PostGIS image
  # imageName: ghcr.io/cloudnative-pg/postgresql:16.6  # Standard image
  imageName: registry.keyboardvagabond.com/library/cnpg-postgis:16.6-3.4-v2

  # Bootstrap with initial database and user
  bootstrap:
    initdb:
      database: shared_db
      owner: shared_user
      encoding: UTF8
      localeCollate: en_US.UTF-8
      localeCType: en_US.UTF-8
      # Install PostGIS extensions in template database (available to all databases)
      postInitTemplateSQL:
        - CREATE EXTENSION IF NOT EXISTS postgis;
        - CREATE EXTENSION IF NOT EXISTS postgis_topology;
        - CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
        - CREATE EXTENSION IF NOT EXISTS postgis_tiger_geocoder;

  # PostgreSQL configuration for conservative scaling (4GB memory limit)
  postgresql:
    parameters:
      # Performance optimizations for 4GB memory limit
      # Reduced max_connections based on actual usage (7 connections observed)
      # NOTE(review): 150 is still far above the 7 observed connections;
      # confirm this headroom is intended.
      max_connections: "150"
      shared_buffers: "1GB"  # 25% of 4GB memory limit
      effective_cache_size: "3GB"  # ~75% of 4GB memory limit
      maintenance_work_mem: "256MB"  # Scaled for 4GB memory limit
      checkpoint_completion_target: "0.9"
      wal_buffers: "24MB"
      default_statistics_target: "100"
      random_page_cost: "1.1"  # Good for SSD storage
      effective_io_concurrency: "200"
      # Increased from 14MB. work_mem applies per sort/hash operation, not per
      # connection, so "150 connections x 24MB = 3.6GB" is only a rough
      # estimate, not a hard ceiling.
      # NOTE(review): 3.6GB plus shared_buffers (1GB) already exceeds the 4Gi
      # container memory limit below; confirm this is acceptable or lower it.
      work_mem: "24MB"
      min_wal_size: "1GB"
      max_wal_size: "6GB"
      # Additional optimizations for your hardware (tuned for 2-core limit)
      # NOTE(review): the worker counts below are larger than the 2-CPU limit
      # set under resources.limits; confirm the oversubscription is intended.
      max_worker_processes: "8"
      max_parallel_workers: "6"  # Increased for better OLTP workload
      max_parallel_workers_per_gather: "3"  # Max 3 workers per query
      max_parallel_maintenance_workers: "3"  # For maintenance operations
      # Network timeout adjustments for 100Mbps VLAN
      wal_sender_timeout: "10s"  # Increased from 5s for slower network
      wal_receiver_timeout: "10s"  # Increased from 5s for slower network
      # Multi-instance HA configuration with asynchronous replication.
      # No synchronous-replica settings are present in this manifest, so
      # synchronous_commit "on" is expected to wait for the local WAL flush
      # only, not for a standby (NOTE(review): confirm this is the intent).
      synchronous_commit: "on"  # favor data integrity
      # Log long running queries
      log_min_duration_statement: "5000"  # Log queries > 5 seconds
      log_line_prefix: "%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h "
      log_statement: "none"  # Only log slow queries, not all
      # Query activity tracking - increase limit for complex queries
      track_activity_query_size: "8192"  # 8KB - allows full query text in pg_stat_activity

  # Storage configuration using PostgreSQL-optimized storage class
  storage:
    size: 50Gi
    storageClass: longhorn-postgresql

  # Separate WAL storage for better I/O performance
  walStorage:
    size: 10Gi
    storageClass: longhorn-postgresql

  # Enable pod anti-affinity for HA cluster (distribute across nodes)
  affinity:
    enablePodAntiAffinity: true
    topologyKey: kubernetes.io/hostname

  resources:
    requests:
      cpu: 750m
      memory: 1.5Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Enable superuser access for maintenance
  enableSuperuserAccess: true

  # Certificate configuration using cert-manager
  # The server secret doubles as the server CA secret, and the client secret
  # doubles as the client CA secret.
  certificates:
    serverTLSSecret: postgresql-shared-server-cert
    serverCASecret: postgresql-shared-server-cert
    clientCASecret: postgresql-shared-client-cert
    replicationTLSSecret: postgresql-shared-client-cert

  # Replication slot configuration - enabled for HA cluster
  replicationSlots:
    highAvailability:
      enabled: true  # Enable HA replication slots for multi-instance cluster
    synchronizeReplicas:
      enabled: true  # Enable replica synchronization for HA

  # Monitoring configuration for Prometheus metrics
  monitoring:
    enablePodMonitor: true
    # Custom metrics for dashboard compatibility
    customQueriesConfigMap:
      - name: postgresql-dashboard-metrics
        key: queries
      - name: postgresql-connection-metrics
        key: custom-queries

  # Reasonable startup delay for stable 3-instance cluster
  startDelay: 30

  probes:
    startup:
      initialDelaySeconds: 60  # Allow PostgreSQL to start and begin recovery
      periodSeconds: 10
      timeoutSeconds: 10
      failureThreshold: 90  # 90 x 10s = 15 minutes total for replica recovery with Longhorn storage
    readiness:
      initialDelaySeconds: 30  # Allow instance manager to initialize
      periodSeconds: 10
      timeoutSeconds: 10
      failureThreshold: 3
    liveness:
      initialDelaySeconds: 120  # Allow full startup before liveness checks
      periodSeconds: 30
      timeoutSeconds: 10
      failureThreshold: 3

  primaryUpdateMethod: switchover  # Use switchover instead of restart to prevent restart loops
  primaryUpdateStrategy: unsupervised

  # S3 backup configuration for CloudNativePG - TEMPORARILY DISABLED
  # NOTE(review): while this stays commented out, the visible manifest
  # configures no CloudNativePG object-store backup or WAL archiving.
  # backup:
  #   # Backup retention policy
  #   retentionPolicy: "30d"  # Keep backups for 30 days
  #
  #   # S3 backup configuration for Backblaze B2
  #   barmanObjectStore:
  #     destinationPath: s3://postgresql-backups/cnpg
  #     s3Credentials:
  #       accessKeyId:
  #         name: postgresql-s3-backup-credentials
  #         key: AWS_ACCESS_KEY_ID
  #       secretAccessKey:
  #         name: postgresql-s3-backup-credentials
  #         key: AWS_SECRET_ACCESS_KEY
  #     endpointURL: https://s3.eu-central-003.backblazeb2.com
  #
  #     # Backblaze B2 specific configuration
  #     data:
  #       compression: gzip
  #       encryption: AES256
  #       immediateCheckpoint: true
  #       jobs: 2  # Parallel backup jobs
  #
  #     wal:
  #       compression: gzip
  #       encryption: AES256
  #       maxParallel: 2  # Parallel WAL archiving