nameko 是有重连机制的,但是也还是会经常发现无法重连的情况

比如下面的报错:

<ServiceContainer [newspaper_service] at 0x7f81fc095480> thread exited with error
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 475, in _handle_thread_exited
    gt.wait()
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 181, in wait
    return self._exit_event.wait()
  File "/usr/local/lib/python3.10/site-packages/eventlet/event.py", line 132, in wait
    current.throw(*self._exc)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 221, in main
    result = function(*args, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 407, in _run_worker
    result, exc_info = handle_result(
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 473, in handle_result
    self.handle_message_processed(message, result, exc_info)
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 481, in handle_message_processed
    self.queue_consumer.ack_message(message)
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 315, in ack_message
    message.ack()
  File "/usr/local/lib/python3.10/site-packages/kombu/message.py", line 126, in ack
    self.channel.basic_ack(self.delivery_tag, multiple=multiple)
  File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 1407, in basic_ack
    return self.send_method(
  File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 70, in send_method
    conn.frame_writer(1, self.channel_id, sig, args, content)
  File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 186, in write_frame
    write(buffer_store.view[:offset])
  File "/usr/local/lib/python3.10/site-packages/amqp/transport.py", line 347, in write
    self._write(s)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 407, in sendall
    tail = self.send(data, flags)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 401, in send
    return self._send_loop(self.fd.send, data, flags)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 388, in _send_loop
    return send_method(data, *args)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/eventlet/hubs/poll.py", line 111, in wait
    listener.cb(fileno)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 221, in main
    result = function(*args, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 407, in _run_worker
    result, exc_info = handle_result(
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 473, in handle_result
    self.handle_message_processed(message, result, exc_info)
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 481, in handle_message_processed
    self.queue_consumer.ack_message(message)
  File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 315, in ack_message
    message.ack()
  File "/usr/local/lib/python3.10/site-packages/kombu/message.py", line 126, in ack
    self.channel.basic_ack(self.delivery_tag, multiple=multiple)
  File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 1407, in basic_ack
    return self.send_method(
  File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 70, in send_method
    conn.frame_writer(1, self.channel_id, sig, args, content)
  File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 186, in write_frame
    write(buffer_store.view[:offset])
  File "/usr/local/lib/python3.10/site-packages/amqp/transport.py", line 347, in write
    self._write(s)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 407, in sendall
    tail = self.send(data, flags)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 401, in send
    return self._send_loop(self.fd.send, data, flags)
  File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 388, in _send_loop
    return send_method(data, *args)
BrokenPipeError: [Errno 32] Broken pipe
Removing descriptor: 148

还有下面的报错:

POST http://172.16.21.153:9200/novel_sample/_count [status:200 request:0.066s]
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
    for _ in self.consume(limit=None, **kwargs):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
    conn.drain_events(timeout=safety_interval)
  File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
    return self.transport.drain_events(self.connection, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
    return connection.drain_events(**kwargs)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
    while not self.blocking_read(timeout):
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
    return self.on_inbound_frame(frame)
  File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
    callback(channel, method_sig, buf, None)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
    return self.channels[channel_id].dispatch_method(
  File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
    listener(*args)
  File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
    raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
    for _ in self.consume(limit=None, **kwargs):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
    conn.drain_events(timeout=safety_interval)
  File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
    return self.transport.drain_events(self.connection, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
    return connection.drain_events(**kwargs)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
    while not self.blocking_read(timeout):
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
    return self.on_inbound_frame(frame)
  File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
    callback(channel, method_sig, buf, None)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
    return self.channels[channel_id].dispatch_method(
  File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
    listener(*args)
  File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
    raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
    for _ in self.consume(limit=None, **kwargs):
  File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
    conn.drain_events(timeout=safety_interval)
  File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
    return self.transport.drain_events(self.connection, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
    return connection.drain_events(**kwargs)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
    while not self.blocking_read(timeout):
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
    return self.on_inbound_frame(frame)
  File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
    callback(channel, method_sig, buf, None)
  File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
    return self.channels[channel_id].dispatch_method(
  File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
    listener(*args)
  File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
    raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it

但是这些问题怎么解决我还不知道


产生的原因我大概知道了,基本都是因为执行超时(至少 amqp.exceptions.PreconditionFailed 铁定是的)

然后对于这个问题,我的方案是检查标准输出,发现 amqp.exceptions 或者 BrokenPipeError 开头的日志就删除这个进程

因为我的服务是用 k8s 运行的,所以对我来说就是监测 pod 的标准输出

有需要的可以参考我下面的代码

from utils.dingtalk_helpers import DingTalk
from typing import List
from kubernetes.client.models.v1_pod import V1Pod
from kubernetes.client.models.v1_pod_list import V1PodList
from kubernetes.client.models.v1_object_meta import V1ObjectMeta
from kubernetes.client import CoreV1Api
from kubernetes import client, config
from loguru import logger

# Kubeconfig contexts to scan, and the namespaces inspected within each context.
contexts = ["ali-test", "ali-prod", "gcp-test", "gcp-prod"]
namespaced_names = ["mediawise", "video-tracker"]


def get_last_n_lines(log: str, n: int) -> List[str]:
    """Return the last ``n`` lines of ``log``.

    Leading and trailing whitespace (including blank lines) is stripped from
    ``log`` before it is split into lines.

    Args:
        log: Raw log text.
        n: Maximum number of trailing lines to return.

    Returns:
        At most ``n`` lines, oldest first. The list is empty when ``n`` is
        not positive or when ``log`` contains only whitespace.
    """
    if n <= 0:
        # ``lines[-0:]`` is the whole list, so a non-positive count has to be
        # handled explicitly instead of being passed to the slice.
        return []
    return log.strip().splitlines()[-n:]


def process_pods_in_context(context: str):
    """Delete running Pods whose latest log lines show a broker failure.

    Loads the kubeconfig for ``context`` and, for every namespace in
    ``namespaced_names``, lists the Pods, skips those whose phase is not
    ``Running``, and reads the last 3 log lines of each remaining Pod. When
    any of those lines starts with ``amqp.exceptions`` or ``BrokenPipeError``
    the Pod is deleted and a DingTalk notification is sent.

    Error handling:
        * A failure while reading or parsing one Pod's log, deleting it, or
          sending the notification is logged with a message specific to that
          step, and processing continues with the next Pod.
        * Any other failure inside a namespace (e.g. listing Pods) is logged
          with its traceback and processing continues with the next
          namespace.
        * A failure to load the kubeconfig propagates to the caller.

    Args:
        context: Name of the kubeconfig context to operate on.
    """
    # Log-line prefixes treated as "the broker connection is broken".
    error_prefixes = ("amqp.exceptions", "BrokenPipeError")
    # Number of trailing log lines requested from the API and inspected.
    tail_size = 3

    logger.info(f"切换到上下文: {context}")
    config.load_kube_config(context=context)
    v1 = CoreV1Api()

    for namespaced_name in namespaced_names:
        try:
            # Fetch every Pod in the namespace.
            pod_list: V1PodList = v1.list_namespaced_pod(namespaced_name)

            for pod in pod_list.items:
                pod: V1Pod
                metadata: V1ObjectMeta = pod.metadata
                pod_name = metadata.name

                if pod.status.phase != "Running":
                    continue

                # Step 1: read the tail of the Pod's log. Only failures of
                # this step are reported as "cannot fetch the log".
                try:
                    log: str = v1.read_namespaced_pod_log(
                        name=pod_name, namespace=namespaced_name, tail_lines=tail_size
                    )
                    last_lines: List[str] = get_last_n_lines(log, tail_size)
                except Exception as log_error:
                    logger.warning(f"无法获取 Pod {pod_name} 的日志: {log_error}")
                    continue

                # str.startswith accepts a tuple, so one pass covers both prefixes.
                if not any(line.startswith(error_prefixes) for line in last_lines):
                    continue

                logger.debug(f"{pod_name} 日志最后 {tail_size} 行: {last_lines}")
                logger.warning(
                    f"发现 amqp.exceptions 或者 BrokenPipeError 日志,删除 Pod: {pod_name}")

                # Step 2: delete the Pod; a failure here is reported as a
                # delete failure rather than as a log-fetch failure.
                try:
                    v1.delete_namespaced_pod(pod_name, namespaced_name)
                except Exception as delete_error:
                    logger.warning(f"删除 Pod {pod_name} 失败: {delete_error}")
                    continue
                logger.info(f"删除成功: {pod_name}")

                # Step 3: notify. The Pod is already gone at this point, so a
                # notification failure must not be reported as anything else.
                # NOTE(review): `waring` looks like a typo of `warning`; it is
                # kept as-is because it must match utils.dingtalk_helpers — confirm.
                try:
                    DingTalk.waring(
                        f"发现 amqp.exceptions 或者 BrokenPipeError 日志,删除 Pod: {pod_name}")
                except Exception as notify_error:
                    logger.warning(
                        f"Pod {pod_name} 已删除,但钉钉通知发送失败: {notify_error}")
        except Exception as error:
            logger.exception(f"在命名空间 {namespaced_name} 处理时发生错误: {error}")


def _run():
    """Run the Pod check once for every kubeconfig context in ``contexts``.

    An exception raised while processing one context is reported through
    ``logger.exception`` and the loop moves on to the next context.
    """
    for context in contexts:
        try:
            process_pods_in_context(context)
        except Exception as exc:
            logger.exception(f"在上下文 {context} 中处理时发生错误: {exc}")


# Script entry point: perform a single pass over all configured contexts.
if __name__ == "__main__":
    _run()

这相当于给 nameko 的进程做了一个健康检查机制,发现有病的直接 delete pod,让 k8s 重新生成一个新的 pod

这不是解决问题的根本之道,但是 nameko 这个开源项目本身已经不维护了,且这个问题不知道怎么 fix,所以只能打上这样的补丁了


universe_king
3.4k 声望680 粉丝