-
Notifications
You must be signed in to change notification settings - Fork 85
ENG-1948 - Celery healthcheck HTTP endpoint #7091
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
af5161d
3f84e92
8da5830
765b06f
bb8edb4
1706854
6ad8c77
6af3d6d
4153a68
195868e
8da1f48
b6a4eee
073e0b8
b2648c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| # fmt: off | ||
| # type: ignore | ||
| # pylint: skip-file | ||
| # isort:off | ||
|
|
||
|
|
||
| from .server import HealthCheckServer | ||
|
|
||
|
|
||
def register(celery_app):
    """Add the HTTP health check bootstep to the app's worker steps.

    ``HealthCheckServer`` is added to ``celery_app.steps["worker"]``.
    """
    worker_steps = celery_app.steps["worker"]
    worker_steps.add(HealthCheckServer)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| import json | ||
| import threading | ||
| from http.server import HTTPServer, SimpleHTTPRequestHandler | ||
| from typing import Any, Optional | ||
|
|
||
| from celery import bootsteps | ||
| from celery.worker import WorkController | ||
| from loguru import logger | ||
|
|
||
| HEALTHCHECK_DEFAULT_PORT = 9000 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should use a consistent value; I think 9001 would be OK. It's 9000 in some places (
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is 9001 in the compose file explicitly, to ensure that the config overrides the default and works as expected. But we can just keep it at 9000 everywhere as the default. |
||
| HEALTHCHECK_DEFAULT_PING_TIMEOUT = 2.0 | ||
| HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT = 2.0 | ||
|
|
||
|
|
||
class HealthcheckHandler(SimpleHTTPRequestHandler):
    """HTTP request handler that answers GET requests with a Celery ping result.

    In addition to the standard handler arguments it carries the worker
    (``parent``) to ping and the timeout to use for that ping.
    """

    # NOTE(review): the base class is SimpleHTTPRequestHandler, whose inherited
    # HEAD handler serves files from the working directory. Only GET is
    # overridden here -- confirm whether BaseHTTPRequestHandler was intended.

    def __init__(
        self, parent: WorkController, healthcheck_ping_timeout: float, *args: Any
    ):
        """Store the ping configuration, then hand over to the base handler.

        Args:
            parent: Worker whose app and hostname are used for the ping.
            healthcheck_ping_timeout: Timeout passed to the inspect call.
            *args: Positional arguments forwarded unchanged to the base
                handler (supplied by the HTTP server for each request).
        """
        # These attributes must exist before the base constructor runs,
        # because the base constructor processes the request (and therefore
        # calls do_GET) before it returns.
        self.parent = parent
        self.healthcheck_ping_timeout = healthcheck_ping_timeout
        super().__init__(*args)

    def do_GET(self) -> None:
        """Handle GET requests by pinging this worker and reporting the result.

        Responses:
            200: ``{"status": "ok", "data": <ping result>}`` when the ping call
                and the response write complete without raising.
            503: ``{"status": "error", "data": <exception text>}`` when
                anything in the ping/response path raises.
            500: empty body, when producing the 503 response itself fails.
        """
        try:
            try:
                worker = self.parent
                # Restrict the ping to this worker's own hostname.
                inspector = worker.app.control.inspect(
                    destination=[worker.hostname],
                    timeout=self.healthcheck_ping_timeout,
                )
                result = inspector.ping()
                # NOTE(review): ping() may return None when no reply arrives
                # within the timeout; that is still reported as "ok" here --
                # confirm whether an empty result should map to 503 instead.

                data = json.dumps({"status": "ok", "data": result})
                logger.debug(f"Healthcheck ping result: {data}")

                self.send_response(200)
                self.send_header("Content-type", "application/json")
                self.end_headers()
                self.wfile.write(bytes(data, "utf-8"))
            except Exception as e:
                logger.warning(f"Healthcheck ping exception: {e}")
                response = {"status": "error", "data": str(e)}
                self.send_response(503)
                self.send_header("Content-type", "application/json")
                self.end_headers()
                self.wfile.write(bytes(json.dumps(response), "utf-8"))
        except Exception as ex:
            logger.exception("HealthcheckHandler exception", exc_info=ex)
            self.send_response(500)
            # send_response only buffers the status line; end_headers() is
            # what terminates the header block and flushes it to the client.
            self.end_headers()
|
|
||
|
|
||
class HealthCheckServer(bootsteps.StartStopStep):
    """Worker bootstep that serves the HTTP health check endpoint.

    ``start`` binds an ``HTTPServer`` on ``healthcheck_port`` and runs it in a
    daemon thread; ``stop`` shuts the server down, waits for the thread and
    releases the listening socket.

    Configuration is read from ``parent.app.conf`` with module defaults:
    ``healthcheck_port``, ``healthcheck_ping_timeout`` and ``shutdown_timeout``.
    """

    # ignore kwargs type
    def __init__(self, parent: WorkController, **kwargs):  # type: ignore [arg-type, no-untyped-def]
        """Read the health check configuration from the worker's app config.

        Args:
            parent: Worker this step is attached to.
            **kwargs: Forwarded to the base step initializer.
        """
        # Both stay None until start() runs.
        self.thread: Optional[threading.Thread] = None
        self.http_server: Optional[HTTPServer] = None

        self.parent = parent

        # config
        conf = parent.app.conf
        self.healthcheck_port = int(
            getattr(conf, "healthcheck_port", HEALTHCHECK_DEFAULT_PORT)
        )
        self.healthcheck_ping_timeout = float(
            getattr(
                conf,
                "healthcheck_ping_timeout",
                HEALTHCHECK_DEFAULT_PING_TIMEOUT,
            )
        )
        self.shutdown_timeout = float(
            getattr(
                conf,
                "shutdown_timeout",
                HEALTHCHECK_DEFAULT_HTTP_SERVER_SHUTDOWN_TIMEOUT,
            )
        )

        # NOTE(review): ``parent`` is not forwarded to the base initializer --
        # confirm this matches the base step's ``__init__`` signature.
        super().__init__(**kwargs)

    # The mypy hints for an HTTP handler are strange, so ignoring them here
    def http_handler(self, *args) -> None:  # type: ignore [arg-type, no-untyped-def]
        """Build a ``HealthcheckHandler`` for one incoming request.

        The server calls this with its usual handler arguments; the worker and
        ping timeout are prepended. Nothing is returned because constructing
        the handler is what processes the request.
        """
        HealthcheckHandler(self.parent, self.healthcheck_ping_timeout, *args)

    def start(self, parent: WorkController) -> None:
        """Bind the HTTP server on all interfaces and serve it in a daemon thread."""
        # Ignore mypy hints here as the constructed object immediately handles the request
        # (if you look in the source code for SimpleHTTPRequestHandler, specifically the finalize request method)
        self.http_server = HTTPServer(
            ("0.0.0.0", self.healthcheck_port), self.http_handler  # type: ignore [arg-type]
        )

        self.thread = threading.Thread(
            target=self.http_server.serve_forever, daemon=True
        )
        self.thread.start()

    def stop(self, parent: WorkController) -> None:
        """Stop serving, wait for the server thread and release the port."""
        server = self.http_server
        serving_thread = self.thread

        if server is None:
            logger.warning(
                "Requested stop of HTTP healthcheck server, but no server was started"
            )
        else:
            logger.info(
                f"Stopping health check server with a timeout of {self.shutdown_timeout} seconds"
            )
            server.shutdown()

        # Really this should not happen if the HTTP server is None, but just in case, we should check.
        if serving_thread is None:
            logger.warning("No thread in HTTP healthcheck server to shutdown...")
        else:
            serving_thread.join(self.shutdown_timeout)

        if server is not None:
            # shutdown() only ends the serve_forever loop; the listening socket
            # stays bound until server_close() is called. Without this the port
            # is leaked and a later start() cannot bind it again.
            server.server_close()

        # Drop the stale references so a repeated stop() takes the
        # "nothing started" path and a later start() begins from a clean state.
        self.http_server = None
        self.thread = None

        logger.info(f"Health check server stopped on port {self.healthcheck_port}")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| import pytest | ||
|
|
||
| import pytest | ||
| import requests | ||
| from loguru import logger | ||
|
|
||
|
|
||
class TestCeleryHealthCheckServer:
    """Checks that the health check endpoint answers while the worker runs."""

    def test_responds_to_ping_properly(self, celery_session_app, celery_session_worker):
        """GET on the health check port returns 200 with ``status == "ok"``."""
        try:
            # Always bound the wait: without a timeout a server that accepts
            # the connection but never answers would hang the whole test run.
            response = requests.get("http://127.0.0.1:9000/", timeout=5)
        except requests.exceptions.ConnectionError:
            pytest.fail("Connection error")

        assert response.status_code == 200
        assert response.json()["status"] == "ok"
|
|
||
|
|
||
class TestCeleryHealthCheckWorker:
    """Checks that stopping the worker also takes the health check endpoint down."""

    @pytest.fixture(autouse=True)
    def setup_teardown(self):
        """Teardown-only fixture: after the test, the endpoint must not answer."""
        yield
        # Once the test body has run, a request to the endpoint is expected to raise.
        endpoint = "http://127.0.0.1:9000/"
        with pytest.raises(Exception):
            requests.get(endpoint, timeout=1)

    def test_shutdown_gracefully(self, celery_session_app, celery_session_worker):
        """Stopping the session worker must not raise."""
        worker = celery_session_worker
        try:
            logger.info("Shutdown gracefully")
            worker.stop()
            logger.info("Shutdown gracefully finished")
        except Exception:
            pytest.fail("Failed to stop health check server")
|
|
||
|
|
||
|
|
||
|
|
||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice consolidation work here! Should we add the HTTP health check to
`worker-other` so the worker services that extend this one get the new health check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could but really it's just in the one below to guarantee something exercises it