ID: 298 - fix LANplus retry

``I had submitted a patch back on Nov 19, 2013 regarding a fix to lanplus retry.
This had resolved a problem whereby a retry of a payload type of
IPMI_PAYLOAD_TYPE_IPMI first removed the request from the queue before going
back for a retry of the message. I have been able to determine why this fix
works correctly. More importantly I have been able to resolve other retry
problems in lanplus where assertion panics were hitting on certain retry
operations. A new, replacement patch for resolving both of these types of retry
bugs follows.

The first bug,where the ipmi_lanplus_send_payload() is sending a payload type of
IPMI_PAYLOAD_TYPE_IPMI is retryable, however I found in testing that it did not
remove the previous request entry from the list of requests chain. If the
original message had timed out, a second message sent, the second reply would
not match up to the right entry on the list as the req command and sequence
numbers are the same. By first removing the first request from the chain this
resolves it. The consequence of not removing the stale entry was random errors.

The second bug is when waiting for a message response times out during the
ipmi_lanplus_send_payload types IPMI_PAYLOAD_TYPE(s) RCMP_OPEN_REQUEST, RAKP1,
RAKP_3. In various testing where the message timed out on either of these three
payload types, ipmitool would assertion panic upon retry as the session_state
was wrong. The timeout could be due to the message never getting to the BMC, the
BMC never acting/responding to the message, or the reply message packet dropped
(it is UDP after all). If the BMC had acted on the message but the reply was not
received, the BMC state would had advanced, and a retry of any of these three
commands would error. It is not knowable at retry time if the BMC had acted on
the message or not. The solution is upon message timeout failure, retry all
three commands in the sequence. This has shown to be reliable and does not
result in assertions or any unexpected BMC behaviors. Should the original
message response eventually arrive very late, it is just discarded.

The testing for these problems was elusive until we found a moderately slow BMC
and had separate sessions direct a fusillade of nmap operations on the BMC, then
run simple ipmitool commands. This caused sufficient loading of the network and
BMC to cause lengthy delays and outright packet drops. The general approach on
the second fix is to return a timeout error code back through ipmi_lanplus_open
where the sequence can be retried.''

Patch-by: Pat Donlin
This commit is contained in:
Zdenek Styblik 2014-03-23 08:15:51 +01:00
parent cab450a052
commit 176774cf9f

View File

@ -2099,6 +2099,7 @@ ipmi_lanplus_send_payload(
uint8_t * msg_data; uint8_t * msg_data;
int msg_length; int msg_length;
struct ipmi_session * session = intf->session; struct ipmi_session * session = intf->session;
struct ipmi_rq_entry * entry = NULL;
int try = 0; int try = 0;
int xmit = 1; int xmit = 1;
time_t ltime; time_t ltime;
@ -2123,7 +2124,6 @@ ipmi_lanplus_send_payload(
/* /*
* Build an IPMI v1.5 or v2 command * Build an IPMI v1.5 or v2 command
*/ */
struct ipmi_rq_entry * entry;
struct ipmi_rq * ipmi_request = payload->payload.ipmi_request.request; struct ipmi_rq * ipmi_request = payload->payload.ipmi_request.request;
lprintf(LOG_DEBUG, ""); lprintf(LOG_DEBUG, "");
@ -2250,11 +2250,17 @@ ipmi_lanplus_send_payload(
{ {
case IPMI_PAYLOAD_TYPE_RMCP_OPEN_REQUEST: case IPMI_PAYLOAD_TYPE_RMCP_OPEN_REQUEST:
session->v2_data.session_state = LANPLUS_STATE_OPEN_SESSION_SENT; session->v2_data.session_state = LANPLUS_STATE_OPEN_SESSION_SENT;
/* not retryable for timeouts, force no retry */
try = session->retry;
break; break;
case IPMI_PAYLOAD_TYPE_RAKP_1: case IPMI_PAYLOAD_TYPE_RAKP_1:
session->v2_data.session_state = LANPLUS_STATE_RAKP_1_SENT; session->v2_data.session_state = LANPLUS_STATE_RAKP_1_SENT;
/* not retryable for timeouts, force no retry */
try = session->retry;
break; break;
case IPMI_PAYLOAD_TYPE_RAKP_3: case IPMI_PAYLOAD_TYPE_RAKP_3:
/* not retryable for timeouts, force no retry */
try = session->retry;
session->v2_data.session_state = LANPLUS_STATE_RAKP_3_SENT; session->v2_data.session_state = LANPLUS_STATE_RAKP_3_SENT;
break; break;
} }
@ -2305,6 +2311,10 @@ ipmi_lanplus_send_payload(
if (rsp) if (rsp)
break; break;
/* This payload type is retryable for timeouts. */
if ((payload->payload_type == IPMI_PAYLOAD_TYPE_IPMI) && entry) {
ipmi_req_remove_entry( entry->rq_seq, entry->req.msg.cmd);
}
} }
/* only timeout if time exceeds the timeout value */ /* only timeout if time exceeds the timeout value */
@ -2771,6 +2781,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
struct ipmi_session * session = intf->session; struct ipmi_session * session = intf->session;
uint8_t * msg; uint8_t * msg;
struct ipmi_rs * rsp; struct ipmi_rs * rsp;
/* 0 = success, 1 = error, 2 = timeout */
int rc = 0; int rc = 0;
@ -2780,7 +2791,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
msg = (uint8_t*)malloc(IPMI_OPEN_SESSION_REQUEST_SIZE); msg = (uint8_t*)malloc(IPMI_OPEN_SESSION_REQUEST_SIZE);
if (msg == NULL) { if (msg == NULL) {
lprintf(LOG_ERR, "ipmitool: malloc failure"); lprintf(LOG_ERR, "ipmitool: malloc failure");
return -1; return 1;
} }
memset(msg, 0, IPMI_OPEN_SESSION_REQUEST_SIZE); memset(msg, 0, IPMI_OPEN_SESSION_REQUEST_SIZE);
@ -2810,7 +2821,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
session->cipher_suite_id); session->cipher_suite_id);
free(msg); free(msg);
msg = NULL; msg = NULL;
return -1; return 1;
} }
@ -2866,14 +2877,17 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
if (verbose) if (verbose)
lanplus_dump_open_session_response(rsp); lanplus_dump_open_session_response(rsp);
if (rsp == NULL ) {
lprintf(LOG_DEBUG, "Timeout in open session response message.");
return 2;
}
if (rsp->payload.open_session_response.rakp_return_code != if (rsp->payload.open_session_response.rakp_return_code !=
IPMI_RAKP_STATUS_NO_ERRORS) IPMI_RAKP_STATUS_NO_ERRORS)
{ {
lprintf(LOG_WARNING, "Error in open session response message : %s\n", lprintf(LOG_WARNING, "Error in open session response message : %s\n",
val2str(rsp->payload.open_session_response.rakp_return_code, val2str(rsp->payload.open_session_response.rakp_return_code,
ipmi_rakp_return_codes)); ipmi_rakp_return_codes));
return -1; return 1;
} }
else else
{ {
@ -2907,7 +2921,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
"not what we requested 0x%02x\n", "not what we requested 0x%02x\n",
rsp->payload.open_session_response.auth_alg, rsp->payload.open_session_response.auth_alg,
session->v2_data.requested_auth_alg); session->v2_data.requested_auth_alg);
rc = -1; rc = 1;
} }
else if (rsp->payload.open_session_response.integrity_alg != else if (rsp->payload.open_session_response.integrity_alg !=
session->v2_data.requested_integrity_alg) session->v2_data.requested_integrity_alg)
@ -2916,7 +2930,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
"not what we requested 0x%02x\n", "not what we requested 0x%02x\n",
rsp->payload.open_session_response.integrity_alg, rsp->payload.open_session_response.integrity_alg,
session->v2_data.requested_integrity_alg); session->v2_data.requested_integrity_alg);
rc = -1; rc = 1;
} }
else if (rsp->payload.open_session_response.crypt_alg != else if (rsp->payload.open_session_response.crypt_alg !=
session->v2_data.requested_crypt_alg) session->v2_data.requested_crypt_alg)
@ -2925,7 +2939,7 @@ ipmi_lanplus_open_session(struct ipmi_intf * intf)
"not what we requested 0x%02x\n", "not what we requested 0x%02x\n",
rsp->payload.open_session_response.crypt_alg, rsp->payload.open_session_response.crypt_alg,
session->v2_data.requested_crypt_alg); session->v2_data.requested_crypt_alg);
rc = -1; rc = 1;
} }
} }
@ -2959,7 +2973,7 @@ ipmi_lanplus_rakp1(struct ipmi_intf * intf)
struct ipmi_session * session = intf->session; struct ipmi_session * session = intf->session;
uint8_t * msg; uint8_t * msg;
struct ipmi_rs * rsp; struct ipmi_rs * rsp;
int rc = 0; int rc = 0; /* 0 = success, 1 = error, 2 = timeout */
/* /*
* Build a RAKP 1 message * Build a RAKP 1 message
@ -3039,8 +3053,8 @@ ipmi_lanplus_rakp1(struct ipmi_intf * intf)
if (rsp == NULL) if (rsp == NULL)
{ {
lprintf(LOG_INFO, "> Error: no response from RAKP 1 message"); lprintf(LOG_WARNING, "> Error: no response from RAKP 1 message");
return 1; return 2;
} }
session->v2_data.session_state = LANPLUS_STATE_RAKP_2_RECEIVED; session->v2_data.session_state = LANPLUS_STATE_RAKP_2_RECEIVED;
@ -3212,8 +3226,8 @@ ipmi_lanplus_rakp3(struct ipmi_intf * intf)
} }
else if (rsp == NULL) else if (rsp == NULL)
{ {
lprintf(LOG_INFO, "> Error: no response from RAKP 3 message"); lprintf(LOG_WARNING, "> Error: no response from RAKP 3 message");
return 1; return 2;
} }
@ -3337,6 +3351,7 @@ int
ipmi_lanplus_open(struct ipmi_intf * intf) ipmi_lanplus_open(struct ipmi_intf * intf)
{ {
int rc; int rc;
int retry;
struct get_channel_auth_cap_rsp auth_cap; struct get_channel_auth_cap_rsp auth_cap;
struct ipmi_session *session; struct ipmi_session *session;
@ -3363,7 +3378,6 @@ ipmi_lanplus_open(struct ipmi_intf * intf)
/* Setup our lanplus session state */ /* Setup our lanplus session state */
session->v2_data.session_state = LANPLUS_STATE_PRESESSION;
session->v2_data.auth_alg = IPMI_AUTH_RAKP_NONE; session->v2_data.auth_alg = IPMI_AUTH_RAKP_NONE;
session->v2_data.crypt_alg = IPMI_CRYPT_NONE; session->v2_data.crypt_alg = IPMI_CRYPT_NONE;
session->v2_data.console_id = 0x00; session->v2_data.console_id = 0x00;
@ -3407,40 +3421,55 @@ ipmi_lanplus_open(struct ipmi_intf * intf)
goto fail; goto fail;
} }
/*
* If the open/rakp1/rakp3 sequence encounters a timeout, the whole sequence
* needs to restart. The individual messages are not individually retryable,
* as the session state is advancing.
*/
for (retry = 0; retry < IPMI_LAN_RETRY; retry++) {
session->v2_data.session_state = LANPLUS_STATE_PRESESSION;
/* /*
* Open session * Open session
*/ */
if (ipmi_lanplus_open_session(intf)){ if ((rc = ipmi_lanplus_open_session(intf)) == 1) {
intf->close(intf); intf->close(intf);
goto fail; goto fail;
} }
if (rc == 2) {
lprintf(LOG_DEBUG, "Retry lanplus open session, %d", retry);
continue;
}
/* /*
* RAKP 1 * RAKP 1
*/ */
if (ipmi_lanplus_rakp1(intf)){ if ((rc = ipmi_lanplus_rakp1(intf)) == 1) {
intf->close(intf); intf->close(intf);
goto fail; goto fail;
} }
if (rc == 2) {
lprintf(LOG_DEBUG, "Retry lanplus rakp1, %d", retry);
continue;
}
/* /*
* RAKP 3 * RAKP 3
*/ */
if (ipmi_lanplus_rakp3(intf)){ if ((rc = ipmi_lanplus_rakp3(intf)) == 1) {
intf->close(intf); intf->close(intf);
goto fail; goto fail;
} }
if (rc == 0) break;
lprintf(LOG_DEBUG,"Retry lanplus rakp3, %d", retry);
}
lprintf(LOG_DEBUG, "IPMIv2 / RMCP+ SESSION OPENED SUCCESSFULLY\n"); lprintf(LOG_DEBUG, "IPMIv2 / RMCP+ SESSION OPENED SUCCESSFULLY\n");
if (!ipmi_oem_active(intf, "i82571spt")) { if (!ipmi_oem_active(intf, "i82571spt")) {
rc = ipmi_set_session_privlvl_cmd(intf); rc = ipmi_set_session_privlvl_cmd(intf);
if (rc < 0) if (rc < 0) {
intf->close(intf);
goto fail; goto fail;
} }
}
intf->manufacturer_id = ipmi_get_oem(intf); intf->manufacturer_id = ipmi_get_oem(intf);
bridgePossible = 1; bridgePossible = 1;