nameko 是有重连机制的,但是也还是会经常发现无法重连的情况
比如下面的报错:
<ServiceContainer [newspaper_service] at 0x7f81fc095480> thread exited with error
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 475, in _handle_thread_exited
gt.wait()
File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 181, in wait
return self._exit_event.wait()
File "/usr/local/lib/python3.10/site-packages/eventlet/event.py", line 132, in wait
current.throw(*self._exc)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 221, in main
result = function(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 407, in _run_worker
result, exc_info = handle_result(
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 473, in handle_result
self.handle_message_processed(message, result, exc_info)
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 481, in handle_message_processed
self.queue_consumer.ack_message(message)
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 315, in ack_message
message.ack()
File "/usr/local/lib/python3.10/site-packages/kombu/message.py", line 126, in ack
self.channel.basic_ack(self.delivery_tag, multiple=multiple)
File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 1407, in basic_ack
return self.send_method(
File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 70, in send_method
conn.frame_writer(1, self.channel_id, sig, args, content)
File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 186, in write_frame
write(buffer_store.view[:offset])
File "/usr/local/lib/python3.10/site-packages/amqp/transport.py", line 347, in write
self._write(s)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 407, in sendall
tail = self.send(data, flags)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 401, in send
return self._send_loop(self.fd.send, data, flags)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 388, in _send_loop
return send_method(data, *args)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/eventlet/hubs/poll.py", line 111, in wait
listener.cb(fileno)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenthread.py", line 221, in main
result = function(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/nameko/containers.py", line 407, in _run_worker
result, exc_info = handle_result(
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 473, in handle_result
self.handle_message_processed(message, result, exc_info)
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 481, in handle_message_processed
self.queue_consumer.ack_message(message)
File "/usr/local/lib/python3.10/site-packages/nameko/messaging.py", line 315, in ack_message
message.ack()
File "/usr/local/lib/python3.10/site-packages/kombu/message.py", line 126, in ack
self.channel.basic_ack(self.delivery_tag, multiple=multiple)
File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 1407, in basic_ack
return self.send_method(
File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 70, in send_method
conn.frame_writer(1, self.channel_id, sig, args, content)
File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 186, in write_frame
write(buffer_store.view[:offset])
File "/usr/local/lib/python3.10/site-packages/amqp/transport.py", line 347, in write
self._write(s)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 407, in sendall
tail = self.send(data, flags)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 401, in send
return self._send_loop(self.fd.send, data, flags)
File "/usr/local/lib/python3.10/site-packages/eventlet/greenio/base.py", line 388, in _send_loop
return send_method(data, *args)
BrokenPipeError: [Errno 32] Broken pipe
Removing descriptor: 148
还有下面的报错:
POST http://172.16.21.153:9200/novel_sample/_count [status:200 request:0.066s]
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
for _ in self.consume(limit=None, **kwargs):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
conn.drain_events(timeout=safety_interval)
File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
return self.transport.drain_events(self.connection, **kwargs)
File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
return connection.drain_events(**kwargs)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
while not self.blocking_read(timeout):
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
return self.on_inbound_frame(frame)
File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
callback(channel, method_sig, buf, None)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
return self.channels[channel_id].dispatch_method(
File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
listener(*args)
File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
for _ in self.consume(limit=None, **kwargs):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
conn.drain_events(timeout=safety_interval)
File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
return self.transport.drain_events(self.connection, **kwargs)
File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
return connection.drain_events(**kwargs)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
while not self.blocking_read(timeout):
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
return self.on_inbound_frame(frame)
File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
callback(channel, method_sig, buf, None)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
return self.channels[channel_id].dispatch_method(
File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
listener(*args)
File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it
Connection to broker lost, trying to re-establish connection...
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 171, in run
for _ in self.consume(limit=None, **kwargs):
File "/usr/local/lib/python3.10/site-packages/kombu/mixins.py", line 193, in consume
conn.drain_events(timeout=safety_interval)
File "/usr/local/lib/python3.10/site-packages/kombu/connection.py", line 316, in drain_events
return self.transport.drain_events(self.connection, **kwargs)
File "/usr/local/lib/python3.10/site-packages/kombu/transport/pyamqp.py", line 169, in drain_events
return connection.drain_events(**kwargs)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 525, in drain_events
while not self.blocking_read(timeout):
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 531, in blocking_read
return self.on_inbound_frame(frame)
File "/usr/local/lib/python3.10/site-packages/amqp/method_framing.py", line 53, in on_frame
callback(channel, method_sig, buf, None)
File "/usr/local/lib/python3.10/site-packages/amqp/connection.py", line 537, in on_inbound_method
return self.channels[channel_id].dispatch_method(
File "/usr/local/lib/python3.10/site-packages/amqp/abstract_channel.py", line 156, in dispatch_method
listener(*args)
File "/usr/local/lib/python3.10/site-packages/amqp/channel.py", line 293, in _on_close
raise error_for_code(
amqp.exceptions.PreconditionFailed: (0, 0): (406) PRECONDITION_FAILED - delivery acknowledgement on channel 1 timed out. Timeout value used: 1800000 ms. This timeout value can be configured, see consumers doc guide to learn more
Connected to amqp://pon-it:**@172.16.36.108:5672/pon-it
但是这些问题怎么解决我还不知道
产生的原因我大概知道了,基本都是因为任务执行超时(至少 amqp.exceptions.PreconditionFailed 铁定是的:从上面的日志可以看到,消息超过 1800000 ms,也就是 30 分钟还没有 ack,broker 就会主动关闭 channel)
所以我目前用的是一个临时方案:检查标准输出,只要发现以 amqp.exceptions 或者 BrokenPipeError 开头的日志,就把对应的进程干掉,让它重新启动
因为我的服务是跑在 k8s 上的,所以对我来说就是监测 pod 的标准输出,发现问题就删除这个 pod
有需要的可以参考我下面的代码
from utils.dingtalk_helpers import DingTalk
from typing import List
from kubernetes.client.models.v1_pod import V1Pod
from kubernetes.client.models.v1_pod_list import V1PodList
from kubernetes.client.models.v1_object_meta import V1ObjectMeta
from kubernetes.client import CoreV1Api
from kubernetes import client, config
from loguru import logger
# Kubeconfig contexts to scan, and the namespaces to check inside each context.
contexts = ["ali-test", "ali-prod", "gcp-test", "gcp-prod"]
namespaced_names = ["mediawise", "video-tracker"]
def get_last_n_lines(log: str, n: int) -> List[str]:
    """Return the last ``n`` lines of ``log``.

    Leading and trailing whitespace of the whole text is stripped before it
    is split, so trailing blank lines are not counted.

    Args:
        log: Raw log text. An empty (or ``None``) value yields ``[]``.
        n: Maximum number of lines to return, counted from the end.

    Returns:
        At most ``n`` lines in their original order; ``[]`` when ``n <= 0``.
    """
    # Guard first: ``lines[-0:]`` is ``lines[0:]`` and would return every
    # line, and a negative ``n`` would drop lines from the front instead.
    if not log or n <= 0:
        return []
    return log.strip().splitlines()[-n:]
def process_pods_in_context(context: str):
    """Delete Running pods whose latest log lines show a broken AMQP link.

    Loads the kubeconfig for ``context`` and, for every namespace in
    ``namespaced_names``, lists the pods, skips those whose phase is not
    ``Running`` and reads the last 3 log lines of each remaining pod. When
    one of those lines starts with ``amqp.exceptions`` or
    ``BrokenPipeError`` the pod is deleted and a DingTalk warning is sent.

    Errors while reading logs, deleting, or notifying are logged per pod and
    the loop moves on to the next pod. Any other error inside a namespace is
    logged with its traceback and the next namespace is processed. Errors
    from loading the kubeconfig are not caught here.

    Args:
        context: Name of the kubeconfig context to operate on.
    """
    # ``str.startswith`` accepts a tuple, so one call covers both markers.
    error_prefixes = ("amqp.exceptions", "BrokenPipeError")

    logger.info(f"切换到上下文: {context}")
    config.load_kube_config(context=context)
    v1 = CoreV1Api()

    for namespaced_name in namespaced_names:
        try:
            # Fetch every pod in the namespace.
            pod_list: V1PodList = v1.list_namespaced_pod(namespaced_name)
            for pod in pod_list.items:
                pod: V1Pod
                metadata: V1ObjectMeta = pod.metadata
                pod_name = metadata.name
                if pod.status.phase != "Running":
                    continue

                # Only the log retrieval is covered by this handler, so that
                # its "cannot fetch logs" message is never printed for an
                # unrelated delete or notification failure.
                try:
                    log: str = v1.read_namespaced_pod_log(
                        name=pod_name, namespace=namespaced_name, tail_lines=3
                    )
                    last_lines: List[str] = get_last_n_lines(log, 3)
                except Exception as log_error:
                    logger.warning(f"无法获取 Pod {pod_name} 的日志: {log_error}")
                    continue

                if not any(line.startswith(error_prefixes) for line in last_lines):
                    continue

                logger.debug(f"{pod_name} 日志最后 3 行: {last_lines}")
                logger.warning(
                    f"发现 amqp.exceptions 或者 BrokenPipeError 日志,删除 Pod: {pod_name}")
                try:
                    v1.delete_namespaced_pod(pod_name, namespaced_name)
                except Exception as delete_error:
                    logger.warning(f"删除 Pod {pod_name} 失败: {delete_error}")
                    continue
                logger.info(f"删除成功: {pod_name}")

                # The pod is already gone at this point; a failing
                # notification must not be reported as a failed deletion.
                try:
                    # NOTE(review): `waring` looks like a typo for `warning`;
                    # kept because the helper is defined outside this file —
                    # confirm against utils.dingtalk_helpers.
                    DingTalk.waring(
                        f"发现 amqp.exceptions 或者 BrokenPipeError 日志,删除 Pod: {pod_name}")
                except Exception as notify_error:
                    logger.warning(
                        f"Pod {pod_name} 已删除,但钉钉通知发送失败: {notify_error}")
        except Exception as error:
            logger.exception(f"在命名空间 {namespaced_name} 处理时发生错误: {error}")
def _run():
    """Entry point: process every configured kube context in turn.

    An exception raised while handling one context is logged together with
    its traceback, after which the loop moves on to the next context.
    """
    for ctx in contexts:
        try:
            process_pods_in_context(ctx)
        except Exception as exc:
            logger.exception(f"在上下文 {ctx} 中处理时发生错误: {exc}")


# Only run the scan when this file is executed directly as a script.
if __name__ == "__main__":
    _run()
这相当于给 nameko 的进程做了一个健康检查机制,发现有病的直接 delete pod,让 k8s 重新生成一个新的 pod
这不是解决问题的根本之道,但是 nameko 这个开源项目本身已经不维护了,且这个问题不知道怎么 fix,所以只能打上这样的补丁了
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。