接着上篇的雪崩检测,回顾下LongLinkTaskManager::__RunOnStartTask:
/mars-master/mars/stn/src/longlink_task_manager.cc
// Scans the pending-task list and submits every runnable task to the long link.
// For each task: serialize the request (Req2Buf), run the anti-avalanche check,
// make sure the link is connected, fill in the per-task timeout profile, and
// finally enqueue the packet via longlink_->Send().
void LongLinkTaskManager::__RunOnStartTask() {
std::list<TaskProfile>::iterator first = lst_cmd_.begin();
std::list<TaskProfile>::iterator last = lst_cmd_.end();
bool ismakesureauthruned = false;
bool ismakesureauthsuccess = false;
uint64_t curtime = ::gettickcount();
// retries are throttled: allowed only once retry_interval_ has elapsed since the last batch error
bool canretry = curtime - lastbatcherrortime_ >= retry_interval_;
bool canprint = true;
int sent_count = 0;
while (first != last) {
// grab the successor now: __SingleRespHandle may erase `first` from lst_cmd_
std::list<TaskProfile>::iterator next = first;
++next;
......
// first pass for this task: serialize and run the anti-avalanche check exactly once
if (!first->antiavalanche_checked) {
if (!Req2Buf(first->task.taskid, first->task.user_context, bufreq, error_code, Task::kChannelLong)) {
// serialization failed -> terminate the task with an encode/decode error
__SingleRespHandle(first, kEctEnDecode, error_code, kTaskFailHandleTaskEnd, longlink_->Profile());
first = next;
continue;
}
// anti-avalanche check
xassert2(fun_anti_avalanche_check_);
if (!fun_anti_avalanche_check_(first->task, bufreq.Ptr(), (int)bufreq.Length())) {
__SingleRespHandle(first, kEctLocal, kEctLocalAntiAvalanche, kTaskFailHandleTaskEnd, longlink_->Profile());
first = next;
continue;
}
first->antiavalanche_checked = true;
}
// stop scanning altogether if the long link cannot be (re)established
if (!longlinkconnectmon_->MakeSureConnected()) {
break;
}
// bufreq is empty when the check already ran on an earlier pass: re-serialize here
if (0 == bufreq.Length()) {
if (!Req2Buf(first->task.taskid, first->task.user_context, bufreq, error_code, Task::kChannelLong)) {
__SingleRespHandle(first, kEctEnDecode, error_code, kTaskFailHandleTaskEnd, longlink_->Profile());
first = next;
continue;
}
// anti-avalanche check (skipped when it already passed above)
xassert2(fun_anti_avalanche_check_);
if (!first->antiavalanche_checked && !fun_anti_avalanche_check_(first->task, bufreq.Ptr(), (int)bufreq.Length())) {
__SingleRespHandle(first, kEctLocal, kEctLocalAntiAvalanche, kTaskFailHandleTaskEnd, longlink_->Profile());
first = next;
continue;
}
}
// fill in the transfer profile: timeouts depend on payload size and dynamic-timeout status
first->transfer_profile.loop_start_task_time = ::gettickcount();
first->transfer_profile.first_pkg_timeout = __FirstPkgTimeout(first->task.server_process_cost, bufreq.Length(), sent_count, dynamic_timeout_.GetStatus());
first->current_dyntime_status = (first->task.server_process_cost <= 0) ? dynamic_timeout_.GetStatus() : kEValuating;
first->transfer_profile.read_write_timeout = __ReadWriteTimeout(first->transfer_profile.first_pkg_timeout);
first->transfer_profile.send_data_size = bufreq.Length();
// hand the packed buffer to the long link; running_id identifies the in-flight send
first->running_id = longlink_->Send((const unsigned char*) bufreq.Ptr(), (unsigned int)bufreq.Length(), first->task.cmdid, first->task.taskid,
first->task.send_only ? "":first->task.cgi);
if (!first->running_id) {
xwarn2(TSF"task add into longlink readwrite fail cgi:%_, cmdid:%_, taskid:%_", first->task.cgi, first->task.cmdid, first->task.taskid);
first = next;
continue;
}
xinfo2(TSF"task add into longlink readwrite suc cgi:%_, cmdid:%_, taskid:%_, size:%_, timeout(firstpkg:%_, rw:%_, task:%_), retry:%_",
first->task.cgi, first->task.cmdid, first->task.taskid, first->transfer_profile.send_data_size, first->transfer_profile.first_pkg_timeout / 1000,
first->transfer_profile.read_write_timeout / 1000, first->task_timeout / 1000, first->remain_retry_count);
// fire-and-forget tasks complete as soon as they are queued
if (first->task.send_only) {
__SingleRespHandle(first, kEctOK, 0, kTaskFailHandleNoError, longlink_->Profile());
}
++sent_count;
first = next;
}
}
其实后面就剩下一个longlink_->Send,这个才是真正的发送函数,前面的是一堆参数的设定。好吧,我们来看看:
/mars-master/mars/stn/src/longlink.cc
// Thread-safe entry point for queueing a packet onto the long link.
// Fails fast (returns false) unless the connection is currently established.
bool LongLink::Send(const unsigned char* _pbuf, size_t _len, uint32_t _cmdid, uint32_t _taskid, const std::string& _task_info) {
ScopedLock lock(mutex_);
return (kConnected == connectstatus_) ? __Send(_pbuf, _len, _cmdid, _taskid, _task_info) : false;
}
bool LongLink::__Send(const unsigned char* _pbuf, size_t _len, uint32_t _cmdid, uint32_t _taskid, const std::string& _task_info) {
lstsenddata_.push_back(LongLinkSendData());
lstsenddata_.back().cmdid = _cmdid;
lstsenddata_.back().taskid = _taskid;
longlink_pack(_cmdid, _taskid, _pbuf, _len, lstsenddata_.back().data);
lstsenddata_.back().data.Seek(0, AutoBuffer::ESeekStart);
lstsenddata_.back().task_info = _task_info;
readwritebreak_.Break();
return true;
}
可以直接看__Send方法了,就是将需要传输的数据以LongLinkSendData为载体压入队列中,然后执行了SocketSelectBreaker::Break:
/mars-master/mars/comm/windows/SocketSelect/SocketSelect.cpp
// Wakes the select() loop by sending one byte through the breaker's loopback
// UDP socket pair. Returns true when the wakeup byte was already pending or
// was delivered; on send failure the socket pair is recreated and false is
// returned so the caller knows the wakeup did not land.
bool SocketSelectBreaker::Break() {
ScopedLock lock(m_mutex);
if (m_broken) return true;
const char dummy[] = "1";
// cache the length once; avoids the signed/unsigned mismatch of `ret != strlen(...)`
const int dummy_len = (int)strlen(dummy);
// pass the array itself (decays to const char*); `&dummy` is a char(*)[2],
// which is not the buffer-pointer type sendto expects
int ret = sendto(m_socket_w, dummy, dummy_len, 0, (sockaddr*)&m_sendin, m_sendinlen);
m_broken = true;
if (ret < 0 || ret != dummy_len) {
xerror2(TSF"sendto Ret:%_, errno:(%_, %_)", ret, errno, WSAGetLastError());
m_broken = false;
// rebuild the socket pair so subsequent Break() calls can succeed
ReCreate();
}
return m_broken;
}
这里可以看到,实际上只是向本地的 breaker 回环 socket 发送了一个字符"1"。这个发送并不是业务数据,而是用来唤醒阻塞在 select 上的读写线程(类似 self-pipe 的技巧),通知它队列里有新数据待发送。
也就是说,每次入队一个待发送数据时,都会这样唤醒一次读写循环。那么后面肯定有队列的自我运转机制来进行真实的数据发送。那么我们来找找线索吧。
在LongLink的构造时候,已经将LongLink::__Run通过boost::bind赋值给了thread_。那么LongLink::MakeSureConnected里面又执行了thread_.start(&newone);可以看到是个线程在运转着__Run函数。那么在哪里调用的LongLink::MakeSureConnected,找到的一个线索链:StnLogic.java::makesureLongLinkConnected->stn_logic.cc::MakesureLonglinkConnected->NetCore::MakeSureLongLinkConnect->LongLink::MakeSureConnected。我们把这个调用线索代码贴到下面:
public class StnLogic {
/**
 * Checks the long link connection status; if it is not connected,
 * a reconnect attempt is triggered (native implementation).
 */
public static native void makesureLongLinkConnected();
}
// stn_logic.cc
// JNI-facing shim: forwards the "make sure long link is connected" request
// to the active NetCore instance through the weak-reference call macro.
void MakesureLonglinkConnected() {
xinfo2(TSF "make sure longlink connect");
STN_WEAK_CALL(MakeSureLongLinkConnect());
}
// Triggers a long-link (re)connect attempt; compiled to a no-op when
// USE_LONG_LINK is not defined.
void NetCore::MakeSureLongLinkConnect() {
#ifdef USE_LONG_LINK
longlink_task_manager_->LongLinkChannel().MakeSureConnected();
#endif
}
// Ensures the connect/read-write worker thread is running. Returns true only
// when the link is already connected; otherwise starts the thread (which runs
// __Run) and returns false. *_newone is set to true when a brand-new thread —
// and therefore a fresh connection attempt — was started by this call.
bool LongLink::MakeSureConnected(bool* _newone) {
if (_newone) *_newone = false;
ScopedLock lock(mutex_);
if (kConnected == ConnectStatus()) return true;
bool newone = false;
thread_.start(&newone);   // no-op if already running; newone reports whether a thread was created
if (newone) {
// brand-new thread: reset all per-connection state before __Run starts using it
connectstatus_ = kConnectIdle;
conn_profile_.Reset();
identifychecker_.Reset();
disconnectinternalcode_ = kNone;
readwritebreak_.Clear();
connectbreak_.Clear();
}
if (_newone) *_newone = newone;
return false;
}
最后会被上层samples的MarsServiceNative调用:
@Override
public void onCreate() {
super.onCreate();
// build the service profile (long/short link hosts, ports, product id)
final MarsServiceProfile profile = gFactory.createMarsServiceProfile();
stub = new MarsServiceStub(this, profile);
// set callback
AppLogic.setCallBack(stub);
StnLogic.setCallBack(stub);
SdtLogic.setCallBack(stub);
// Initialize the Mars PlatformComm
Mars.init(getApplicationContext(), new Handler(Looper.getMainLooper()));
// Initialize the Mars
StnLogic.setLonglinkSvrAddr(profile.longLinkHost(), profile.longLinkPorts());
StnLogic.setShortlinkSvrAddr(profile.shortLinkPort());
StnLogic.setClientVersion(profile.productID());
Mars.onCreate(true);
// !!! this is the call that kicks off the long-link connection !!!
StnLogic.makesureLongLinkConnected();
//
Log.d(TAG, "mars service native created");
}
总之就是最后启动一个线程来执行,线程函数是LongLink::__Run:
/mars-master/mars/stn/src/longlink.cc
// Worker-thread entry point: runs one full connection lifecycle —
// connect (__RunConnect), then pump reads/writes (__RunReadWrite) until the
// link breaks, then record the disconnect profile and report any error.
void LongLink::__Run() {
// sync to MakeSureConnected data reset
{
ScopedLock lock(mutex_);
}
uint64_t cur_time = gettickcount();
xinfo_function(TSF"LongLink Rebuild span:%_, net:%_", conn_profile_.disconn_time != 0 ? cur_time - conn_profile_.disconn_time : 0, getNetInfo());
// fresh profile for this connection attempt; carries over the last disconnect reason
ConnectProfile conn_profile;
conn_profile.start_time = cur_time;
conn_profile.conn_reason = conn_profile_.disconn_errcode;
getCurrNetLabel(conn_profile.net_type);
conn_profile.tid = xlogger_tid();
__UpdateProfile(conn_profile);
#ifdef ANDROID
// hold a wakelock long enough to complete the connect attempt
wakelock_.Lock(30 * 1000);
#endif
SOCKET sock = __RunConnect(conn_profile);
#ifdef ANDROID
wakelock_.Lock(1000);
#endif
if (INVALID_SOCKET == sock) {
// connect failed: record the disconnect info and bail out
conn_profile.disconn_time = ::gettickcount();
conn_profile.disconn_signal = ::getSignal(::getNetInfo() == kWifi);
__UpdateProfile(conn_profile);
return;
}
ErrCmdType errtype = kEctOK;
int errcode = 0;
// blocks here until the connection is torn down or an error occurs
__RunReadWrite(sock, errtype, errcode, conn_profile);
socket_close(sock);
conn_profile.disconn_time = ::gettickcount();
conn_profile.disconn_errtype = errtype;
conn_profile.disconn_errcode = errcode;
conn_profile.disconn_signal = ::getSignal(::getNetInfo() == kWifi);
__ConnectStatus(kDisConnected);
__UpdateProfile(conn_profile);
if (kEctOK != errtype) __RunResponseError(errtype, errcode, conn_profile);
#ifdef ANDROID
wakelock_.Lock(1000);
#endif
}
我们只看重点吧:
1.__RunConnect,连接;
2.__RunReadWrite,执行读写(阻塞不断执行);
__RunConnect的代码就不贴了,核心的就是com_connect.ConnectImpatient。
/mars-master/mars/comm/socket/complexconnect.cc
// Races TCP connects across all candidate addresses and returns the first
// socket that both connects and passes the observer's verify check; returns
// INVALID_SOCKET when none succeeds. _breaker lets another thread abort the wait.
SOCKET ComplexConnect::ConnectImpatient(const std::vector<socket_address>& _vecaddr, SocketSelectBreaker& _breaker, MComplexConnect* _observer) {
......
// build one ConnectCheckFSM per candidate address
for (unsigned int i = 0; i < _vecaddr.size(); ++i) {
xinfo2(TSF"complex.conn %_", _vecaddr[i].url());
ConnectCheckFSM* ic = new ConnectCheckFSM(_vecaddr[i], timeout_, i, _observer);
vecsocketfsm.push_back(ic);
}
......
do {
......
// pre-select preparation
SocketSelect sel(_breaker);
sel.PreSelect();
......
// let each live FSM register its fd interest and shrink the select timeout
for (unsigned int i = 0; i < index; ++i) {
if (NULL == vecsocketfsm[i]) continue;
xgroup2_define(group);
vecsocketfsm[i]->PreSelect(sel, group);
xgroup2_if(!group.Empty(), TSF"index:%_, @%_, ", i, this) << group;
timeout = std::min(timeout, vecsocketfsm[i]->Timeout());
}
......
// run select, bounded by the smallest per-FSM timeout if one exists
if (INT_MAX == timeout) {
ret = sel.Select();
} else {
timeout = std::max(0, timeout);
ret = sel.Select(timeout);
}
......
for (unsigned int i = 0; i < index; ++i) {
if (NULL == vecsocketfsm[i]) continue;
xgroup2_define(group);
vecsocketfsm[i]->AfterSelect(sel, group);
xgroup2_if(!group.Empty(), TSF"index:%_, @%_, ", i, this) << group;
// case 1: FSM reached its terminal state (connect failed/closed) -> drop it, keep racing
if (TcpClientFSM::EEnd == vecsocketfsm[i]->Status()) {
if (_observer) _observer->OnFinished(i, socket_address(&vecsocketfsm[i]->Address()), vecsocketfsm[i]->Socket(), vecsocketfsm[i]->Error(),
vecsocketfsm[i]->Rtt(), vecsocketfsm[i]->TotalRtt(), (int)(gettickcount() - starttime));
vecsocketfsm[i]->Close();
delete vecsocketfsm[i];
vecsocketfsm[i] = NULL;
lasterror = -1;
continue;
}
// case 2: connected but the verify check failed -> drop it too
if (TcpClientFSM::EReadWrite == vecsocketfsm[i]->Status() && ConnectCheckFSM::ECheckFail == vecsocketfsm[i]->CheckStatus()) {
if (_observer) _observer->OnFinished(i, socket_address(&vecsocketfsm[i]->Address()), vecsocketfsm[i]->Socket(), vecsocketfsm[i]->Error(),
vecsocketfsm[i]->Rtt(), vecsocketfsm[i]->TotalRtt(), (int)(gettickcount() - starttime));
vecsocketfsm[i]->Close();
delete vecsocketfsm[i];
vecsocketfsm[i] = NULL;
lasterror = -1;
continue;
}
// case 3: connected AND verified -> winner; stop scanning (break, not continue)
if (TcpClientFSM::EReadWrite == vecsocketfsm[i]->Status() && ConnectCheckFSM::ECheckOK == vecsocketfsm[i]->CheckStatus()) {
if (_observer) _observer->OnFinished(i, socket_address(&vecsocketfsm[i]->Address()), vecsocketfsm[i]->Socket(), vecsocketfsm[i]->Error(),
vecsocketfsm[i]->Rtt(), vecsocketfsm[i]->TotalRtt(), (int)(gettickcount() - starttime));
xinfo2(TSF"index:%_, sock:%_, suc ConnectImpatient:%_:%_, RTT:(%_, %_), @%_", i, vecsocketfsm[i]->Socket(),
vecsocketfsm[i]->IP(), vecsocketfsm[i]->Port(), vecsocketfsm[i]->Rtt(), vecsocketfsm[i]->TotalRtt(), this);
retsocket = vecsocketfsm[i]->Socket();
index_ = i;
index_conn_rtt_ = vecsocketfsm[i]->Rtt();
index_conn_totalcost_ = vecsocketfsm[i]->TotalRtt();
// detach the socket so deleting the FSM does not close the returned fd
vecsocketfsm[i]->Socket(INVALID_SOCKET);
delete vecsocketfsm[i];
vecsocketfsm[i] = NULL;
break;
}
}
......
} while (true);
}
1.根据传递进来的一个地址数组,来生成ConnectCheckFSM的一个数组;
2.进入一个do while的死循环;
3.根据入口的SocketSelectBreaker创建SocketSelect,并执行PreSelect方法,执行一个前期准备工作;
4.对地址池中的每个ConnectCheckFSM进行连接,如果状态不是要进行连接,则执行别的前置操作。在这个过程中,会将连接的socket保存在SocketSelect中(这里有必要在后面看下深入的代码);
5.执行连接的select操作,异步检测是否有数据可从通道上读取;
6.之后的for循环,做select后的数据读取等事情,将地址集对应的ConnectCheckFSM所有对象都执行一下AfterSelect,并根据返回的状态,调用回调通知观察者;
下面我们来看一下TcpClientFSM::AfterSelect:
/mars-master/mars/comm/socket/tcpclient_fsm.cc
// Dispatches post-select handling according to the FSM state, then fires the
// close callback if the handler just moved the machine into its terminal state.
void TcpClientFSM::AfterSelect(SocketSelect& _sel, XLogger& _log) {
switch (status_) {
case EConnecting:
AfterConnectSelect(_sel, _log);
break;
case EReadWrite:
AfterReadWriteSelect(_sel, _log);
break;
default:
break;
}
if (EEnd == status_ && INVALID_SOCKET != sock_) _OnClose(last_status_, error_, false);
}
根据状态的不同调用不同的函数执行,如果是连接,调用AfterConnectSelect,如果是读写,调用AfterReadWriteSelect。
下面看下AfterConnectSelect:
// Post-select handling while in EConnecting: classifies the socket as failed
// (exception / SO_ERROR / timeout -> EEnd) or established (writable with no
// error -> EReadWrite, fires _OnConnected with the measured RTT).
void TcpClientFSM::AfterConnectSelect(const SocketSelect& _sel, XLogger& _log) {
xassert2(EConnecting == status_, "%d", status_);
int timeout = ConnectTimeout();
xinfo2(TSF"sock:%_, (%_:%_), ", sock_, addr_.ip(), addr_.port()) >> _log;
// exception set: fetch the pending socket error and terminate the FSM
if (_sel.Exception_FD_ISSET(sock_)) {
socklen_t len = sizeof(error_);
if (0 != getsockopt(sock_, SOL_SOCKET, SO_ERROR, &error_, &len)) { error_ = socket_errno; }
xwarn2(TSF"close connect exception: (%_, %_)", sock_, error_, socket_strerror(error_)) >> _log;
end_connecttime_ = gettickcount();
last_status_ = status_;
status_ = EEnd;
return;
}
// SO_ERROR reports a failed non-blocking connect
error_ = socket_error(sock_);
if (0 != error_) {
xwarn2(TSF"close connect error:(%_, %_), ", error_, socket_strerror(error_)) >> _log;
end_connecttime_ = gettickcount();
last_status_ = status_;
status_ = EEnd;
return;
}
// writable with no pending error: the non-blocking connect has completed
if (0 == error_ && _sel.Write_FD_ISSET(sock_)){
end_connecttime_ = gettickcount();
last_status_ = status_;
status_ = EReadWrite;
xinfo2(TSF"connected Rtt:%_, ", Rtt()) >> _log;
_OnConnected(Rtt());
return;
}
// still pending: give up once the connect timeout budget is exhausted
if (0 >= timeout) {
end_connecttime_ = gettickcount();
xwarn2(TSF"close connect timeout:(%_, %_), (%_, %_)", ConnectAbsTimeout(), -timeout, SOCKET_ERRNO(ETIMEDOUT), socket_strerror(SOCKET_ERRNO(ETIMEDOUT))) >> _log;
error_ = SOCKET_ERRNO(ETIMEDOUT);
last_status_ = status_;
status_ = EEnd;
return;
}
}
如果成功,调用_OnConnected。然后通过他会调回到继承者的同名虚函数中,在这里就是ConnectCheckFSM:
virtual void _OnConnected(int _rtt) {
m_checkfintime = ::gettickcount();
if (!m_observer) return;
m_observer->OnConnected(m_index, addr_, sock_, 0, _rtt);
if (ECheckOK == CheckStatus()) {
return;
}
if (!m_observer->OnVerifySend(m_index, addr_, sock_, send_buf_)) {
m_check_status = ECheckFail;
}
}
这里首先调用了观察者的OnConnected,这个观察者就是LongLinkConnectObserver。
我们回来看ConnectImpatient,在循环里执行了AfterSelect,之后根据每个ConnectCheckFSM的状态更新vecsocketfsm数组。在for循环的下面会有3段代码来做这个根据状态更新数组的操作,前两段是如果连接已经关闭的处理和错误的情况处理,都需要从数组中将该项目置为null。第三段是成功完成的情况处理。注意,前两段是continue,而第三段是break。怎么理解这里呢?我的解释是,本身是有个地址池的连接方式,如果其中一个能够成功连接上并且能够正常收发,那么其余的就不需要再尝试了,因此这里做了break处理。可以看到这里的3种情况处理了TcpClientFSM::EEnd、TcpClientFSM::EReadWrite,那么如果是EStart和EConnecting的情况下,是不会清除这个数组元素的。
再接着看,是这个for循环之后的处理,循环判断所有的连接是否都是无效的,如果都是无效的,继续执行这个while死循环,否则如果有一个是有效的,那么跳出来。也就是说,再次执行的时候index也会进行上面的自增++运算,那么继续往后尝试下一个连接。
再往后看,是跳出了while死循环的情况,又把这些连接依次close,然后清除了数组。再然后是返回了retsocket。这玩意儿的唯一赋值是在上面的for循环中的第三段判定中,这段判定才是一个关键,就是说一个可用的连接出现了。那么直接带来的就是返回一个可用的socket,否则返回的将是个INVALID_SOCKET。
稍微总结下这里,还是挺巧妙的,可以理解为从地址池中找到可用的连接,不是漫无目的的尝试,而是递进式,并且将无效的随时置为null。不过说实话,应当可以写的更简洁,这里实在是有些晦涩。
至此,连接部分分析完毕。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。