221 lines
8.9 KiB
YAML
221 lines
8.9 KiB
YAML
# Kept for reference only: the manifests below are commented out and are not applied.
|
|
|
|
# ---
|
|
# # OpenObserve Alert Configuration for Celery Queue Monitoring
|
|
# # This file contains the alert configurations that should be imported into OpenObserve
|
|
# apiVersion: v1
|
|
# kind: ConfigMap
|
|
# metadata:
|
|
# name: openobserve-alert-configs
|
|
# namespace: celery-monitoring
|
|
# labels:
|
|
# app.kubernetes.io/name: openobserve-alerts
|
|
# app.kubernetes.io/component: monitoring
|
|
# data:
|
|
# celery-queue-alerts.json: |
|
|
# {
|
|
# "alerts": [
|
|
# {
|
|
# "name": "PieFed Celery Queue High",
|
|
# "description": "PieFed Celery queue has more than 10,000 pending tasks",
|
|
# "query": "SELECT avg(celery_queue_length) as avg_queue_length FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '5 minutes'",
|
|
# "condition": "avg_queue_length > 10000",
|
|
# "frequency": "5m",
|
|
# "severity": "warning",
|
|
# "enabled": true,
|
|
# "actions": [
|
|
# {
|
|
# "type": "webhook",
|
|
# "webhook_url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
|
|
# "message": "🚨 PieFed Celery queue is high: {{avg_queue_length}} tasks pending"
|
|
# }
|
|
# ]
|
|
# },
|
|
# {
|
|
# "name": "PieFed Celery Queue Critical",
|
|
# "description": "PieFed Celery queue has more than 50,000 pending tasks",
|
|
# "query": "SELECT avg(celery_queue_length) as avg_queue_length FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '5 minutes'",
|
|
# "condition": "avg_queue_length > 50000",
|
|
# "frequency": "2m",
|
|
# "severity": "critical",
|
|
# "enabled": true,
|
|
# "actions": [
|
|
# {
|
|
# "type": "webhook",
|
|
# "webhook_url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
|
|
# "message": "🔥 CRITICAL: PieFed Celery queue is critically high: {{avg_queue_length}} tasks pending. Consider scaling workers!"
|
|
# }
|
|
# ]
|
|
# },
|
|
# {
|
|
# "name": "BookWyrm Celery Queue High",
|
|
# "description": "BookWyrm Celery queue has more than 1,000 pending tasks",
|
|
# "query": "SELECT avg(celery_queue_length) as avg_queue_length FROM metrics WHERE queue_name='total' AND database='bookwyrm' AND _timestamp >= now() - interval '5 minutes'",
|
|
# "condition": "avg_queue_length > 1000",
|
|
# "frequency": "5m",
|
|
# "severity": "warning",
|
|
# "enabled": true,
|
|
# "actions": [
|
|
# {
|
|
# "type": "webhook",
|
|
# "webhook_url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
|
|
# "message": "📚 BookWyrm Celery queue is high: {{avg_queue_length}} tasks pending"
|
|
# }
|
|
# ]
|
|
# },
|
|
# {
|
|
# "name": "Redis Connection Lost",
|
|
# "description": "Redis connection is down for Celery monitoring",
|
|
# "query": "SELECT avg(redis_connection_status) as connection_status FROM metrics WHERE _timestamp >= now() - interval '2 minutes'",
|
|
# "condition": "connection_status < 1",
|
|
# "frequency": "1m",
|
|
# "severity": "critical",
|
|
# "enabled": true,
|
|
# "actions": [
|
|
# {
|
|
# "type": "webhook",
|
|
# "webhook_url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
|
|
# "message": "💥 CRITICAL: Redis connection lost for Celery monitoring!"
|
|
# }
|
|
# ]
|
|
# },
|
|
# {
|
|
# "name": "Celery Queue Processing Stalled",
|
|
# "description": "Latest PieFed Celery queue length is higher than it was 15 minutes earlier",
|
|
# "query": "SELECT celery_queue_length FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '15 minutes' ORDER BY _timestamp DESC LIMIT 1",
|
|
# "condition": "celery_queue_length > (SELECT celery_queue_length FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '20 minutes' AND _timestamp < now() - interval '15 minutes' ORDER BY _timestamp DESC LIMIT 1)",
|
|
# "frequency": "10m",
|
|
# "severity": "warning",
|
|
# "enabled": true,
|
|
# "actions": [
|
|
# {
|
|
# "type": "webhook",
|
|
# "webhook_url": "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK",
|
|
# "message": "⚠️ Celery queue processing appears stalled. Queue size is higher than it was 15 minutes ago."
|
|
# }
|
|
# ]
|
|
# }
|
|
# ]
|
|
# }
|
|
|
|
# dashboard-config.json: |
|
|
# {
|
|
# "dashboard": {
|
|
# "title": "Celery Queue Monitoring",
|
|
# "description": "Monitor Celery queue sizes and processing rates for PieFed and BookWyrm",
|
|
# "panels": [
|
|
# {
|
|
# "title": "PieFed Queue Length",
|
|
# "type": "line",
|
|
# "query": "SELECT _timestamp, celery_queue_length FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '24 hours'",
|
|
# "x_axis": "_timestamp",
|
|
# "y_axis": "celery_queue_length"
|
|
# },
|
|
# {
|
|
# "title": "BookWyrm Total Queue Length",
|
|
# "type": "line",
|
|
# "query": "SELECT _timestamp, celery_queue_length FROM metrics WHERE queue_name='total' AND database='bookwyrm' AND _timestamp >= now() - interval '24 hours'",
|
|
# "x_axis": "_timestamp",
|
|
# "y_axis": "celery_queue_length"
|
|
# },
|
|
# {
|
|
# "title": "Queue Processing Rate (PieFed)",
|
|
# "type": "line",
|
|
# "query": "SELECT _timestamp, celery_queue_length - LAG(celery_queue_length, 1) OVER (ORDER BY _timestamp) as processing_rate FROM metrics WHERE queue_name='celery' AND database='piefed' AND _timestamp >= now() - interval '6 hours'",
|
|
# "x_axis": "_timestamp",
|
|
# "y_axis": "processing_rate"
|
|
# },
|
|
# {
|
|
# "title": "Redis Connection Status",
|
|
# "type": "stat",
|
|
# "query": "SELECT redis_connection_status FROM metrics WHERE _timestamp >= now() - interval '5 minutes' ORDER BY _timestamp DESC LIMIT 1"
|
|
# },
|
|
# {
|
|
# "title": "Current Queue Sizes",
|
|
# "type": "table",
|
|
# "query": "SELECT queue_name, database, max(celery_queue_length) AS celery_queue_length FROM metrics WHERE _timestamp >= now() - interval '5 minutes' GROUP BY queue_name, database ORDER BY celery_queue_length DESC"
|
|
# }
|
|
# ]
|
|
# }
|
|
# }
|
|
|
|
# ---
|
|
# # Instructions ConfigMap
|
|
# apiVersion: v1
|
|
# kind: ConfigMap
|
|
# metadata:
|
|
# name: openobserve-setup-instructions
|
|
# namespace: celery-monitoring
|
|
# data:
|
|
# README.md: |
|
|
# # OpenObserve Celery Queue Monitoring Setup
|
|
|
|
# ## 1. Import Alerts
|
|
|
|
# 1. Access your OpenObserve dashboard
|
|
# 2. Go to Alerts → Import
|
|
# 3. Copy the contents of `celery-queue-alerts.json` from the `openobserve-alert-configs` ConfigMap
|
|
# 4. Paste and import the alert configurations
|
|
|
|
# ## 2. Configure Webhooks
|
|
|
|
# Update the webhook URLs in the alert configurations:
|
|
# - Replace `https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK` with your actual Slack webhook URL
|
|
# - Or configure other notification methods (email, Discord, etc.)
|
|
|
|
# ## 3. Import Dashboard
|
|
|
|
# 1. Go to Dashboards → Import
|
|
# 2. Copy the contents of `dashboard-config.json` from the `openobserve-alert-configs` ConfigMap
|
|
# 3. Paste and import the dashboard configuration
|
|
|
|
# ## 4. Verify Metrics
|
|
|
|
# Check that metrics are being collected:
|
|
# ```sql
|
|
# SELECT * FROM metrics WHERE __name__ LIKE 'celery_%' ORDER BY _timestamp DESC LIMIT 10
|
|
# ```
|
|
|
|
# ## 5. Alert Thresholds
|
|
|
|
# Current alert thresholds:
|
|
# - **PieFed Warning**: > 10,000 tasks
|
|
# - **PieFed Critical**: > 50,000 tasks
|
|
# - **BookWyrm Warning**: > 1,000 tasks
|
|
# - **Redis Connection**: Connection lost
|
|
|
|
# Adjust these thresholds based on your normal queue sizes and processing capacity.
|
|
|
|
# ## 6. Monitoring Queries
|
|
|
|
# Useful queries for monitoring:
|
|
|
|
# ### Current queue sizes (peak per queue over the last 5 minutes):
|
|
# ```sql
|
|
# SELECT queue_name, database, MAX(celery_queue_length) AS celery_queue_length
|
|
# FROM metrics
|
|
# WHERE _timestamp >= now() - interval '5 minutes'
|
|
# GROUP BY queue_name, database
|
|
# ORDER BY celery_queue_length DESC
|
|
# ```
|
|
|
|
# ### Queue length change between consecutive samples (negative values mean the queue is draining):
|
|
# ```sql
|
|
# SELECT _timestamp,
|
|
# celery_queue_length - LAG(celery_queue_length, 1) OVER (ORDER BY _timestamp) as processing_rate
|
|
# FROM metrics
|
|
# WHERE queue_name='celery' AND database='piefed'
|
|
# AND _timestamp >= now() - interval '1 hour'
|
|
# ```
|
|
|
|
# ### Average queue size over time:
|
|
# ```sql
|
|
# SELECT DATE_TRUNC('hour', _timestamp) as hour,
|
|
# AVG(celery_queue_length) as avg_queue_length
|
|
# FROM metrics
|
|
# WHERE queue_name='celery' AND database='piefed'
|
|
# AND _timestamp >= now() - interval '24 hours'
|
|
# GROUP BY hour
|
|
# ORDER BY hour
|
|
# ```
|