mlflow / mlflow

Open source platform for the machine learning lifecycle
https://mlflow.org
Apache License 2.0
18.74k stars 4.23k forks source link

SSLError when using mlflow tracking server #9551

Open xianqiangHub opened 1 year ago

xianqiangHub commented 1 year ago

Summary

在2.6版本中,使用mlflow server --app-name basic-auth启动后,通过 export MLFLOW_TRACKING_USERNAME=username export MLFLOW_TRACKING_PASSWORD=password

import mlflow

mlflow.set_tracking_uri("https://<host>:<port>/")
with mlflow.start_run(): ...

报错:requests.exceptions.SSLError: HTTPSConnectionPool(host='172.18.86.87', port=5000): Max retries exceeded with url: /api/2.0/mlflow/users/create (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))

Notes

harupy commented 1 year ago

@xianqiangHub Please share the code that can reproduce the issue.

xianqiangHub commented 1 year ago

> @xianqiangHub Please share the code that can reproduce the issue.

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os

os.environ['MLFLOW_TRACKING_USERNAME'] = "xx" os.environ['MLFLOW_TRACKING_PASSWORD'] = "x"

设置远程MLflow服务器地址

mlflow.set_tracking_uri("https://172.18.86.87:5000/")

生成示例数据

np.random.seed(42) X = np.random.rand(100, 2) y = (X[:, 0] + X[:, 1] > 1).astype(int)

划分训练集和测试集

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

创建MLflow实验

mlflow.set_experiment("Logistic_Regression_Demo")

开始MLflow运行,并设置运行名称

with mlflow.start_run(run_name="MyRun") as run:

记录参数

mlflow.log_param("random_seed", 42)
mlflow.log_param("test_size", 0.2)

# 创建并训练逻辑回归模型
model = LogisticRegression()
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)

# 计算精确度
accuracy = accuracy_score(y_test, y_pred)

# 记录精确度指标
mlflow.log_metric("accuracy", accuracy)

# 保存模型
mlflow.sklearn.log_model(model, "logistic_regression_model")
harupy commented 1 year ago

@xianqiangHub Thanks, can you share the full stack trace?

xianqiangHub commented 1 year ago

> @xianqiangHub Thanks, can you share the full stack trace?

I changed the URL to HTTP today and it worked, but I still get an error when using HTTPS. Here is the full stack trace.

SSLEOFError Traceback (most recent call last) File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:670, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 669 # Make the request on the httplib connection object. --> 670 httplib_response = self._make_request( 671 conn, 672 method, 673 url, 674 timeout=timeout_obj, 675 body=body, 676 headers=headers, 677 chunked=chunked, 678 ) 680 # If we're going to release the connection in finally:, then 681 # the response doesn't need to know about the connection. Otherwise 682 # it will also try to release it and we'll have a double-release 683 # mess.

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:381, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 380 try: --> 381 self._validate_conn(conn) 382 except (SocketTimeout, BaseSSLError) as e: 383 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:978, in HTTPSConnectionPool._validate_conn(self, conn) 977 if not getattr(conn, "sock", None): # AppEngine might not have .sock --> 978 conn.connect() 980 if not conn.is_verified:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connection.py:362, in HTTPSConnection.connect(self) 360 context.load_default_certs() --> 362 self.sock = ssl_wrap_socket( 363 sock=conn, 364 keyfile=self.key_file, 365 certfile=self.cert_file, 366 key_password=self.key_password, 367 ca_certs=self.ca_certs, 368 ca_cert_dir=self.ca_cert_dir, 369 ca_cert_data=self.ca_cert_data, 370 server_hostname=server_hostname, 371 ssl_context=context, 372 ) 374 if self.assert_fingerprint:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/util/ssl_.py:399, in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data) 388 warnings.warn( 389 "An HTTPS request has been made, but the SNI (Server Name " 390 "Indication) extension to TLS is not available on this platform. " (...) 396 SNIMissingWarning, 397 ) --> 399 return context.wrap_socket(sock)

File ~/anaconda3/lib/python3.11/ssl.py:517, in SSLContext.wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session) 511 def wrap_socket(self, sock, server_side=False, 512 do_handshake_on_connect=True, 513 suppress_ragged_eofs=True, 514 server_hostname=None, session=None): 515 # SSLSocket class handles server_hostname encoding before it calls 516 # ctx._wrap_socket() --> 517 return self.sslsocket_class._create( 518 sock=sock, 519 server_side=server_side, 520 do_handshake_on_connect=do_handshake_on_connect, 521 suppress_ragged_eofs=suppress_ragged_eofs, 522 server_hostname=server_hostname, 523 context=self, 524 session=session 525 )

File ~/anaconda3/lib/python3.11/ssl.py:1075, in SSLSocket._create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session) 1074 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets") -> 1075 self.do_handshake() 1076 except (OSError, ValueError):

File ~/anaconda3/lib/python3.11/ssl.py:1346, in SSLSocket.do_handshake(self, block) 1345 self.settimeout(None) -> 1346 self._sslobj.do_handshake() 1347 finally:

SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1002)

During handling of the above exception, another exception occurred:

MaxRetryError Traceback (most recent call last) File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 485 try: --> 486 resp = conn.urlopen( 487 method=request.method, 488 url=url, 489 body=request.body, 490 headers=request.headers, 491 redirect=False, 492 assert_same_host=False, 493 preload_content=False, 494 decode_content=False, 495 retries=self.max_retries, 496 timeout=timeout, 497 chunked=chunked, 498 ) 500 except (ProtocolError, OSError) as err:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:754, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, response_kw) 751 log.warning( 752 "Retrying (%r) after connection broken by '%r': %s", retries, err, url 753 ) --> 754 return self.urlopen( 755 method, 756 url, 757 body, 758 headers, 759 retries, 760 redirect, 761 assert_same_host, 762 timeout=timeout, 763 pool_timeout=pool_timeout, 764 release_conn=release_conn, 765 chunked=chunked, 766 body_pos=body_pos, 767 response_kw 768 ) 770 # Handle redirect?

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:754, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, response_kw) 751 log.warning( 752 "Retrying (%r) after connection broken by '%r': %s", retries, err, url 753 ) --> 754 return self.urlopen( 755 method, 756 url, 757 body, 758 headers, 759 retries, 760 redirect, 761 assert_same_host, 762 timeout=timeout, 763 pool_timeout=pool_timeout, 764 release_conn=release_conn, 765 chunked=chunked, 766 body_pos=body_pos, 767 response_kw 768 ) 770 # Handle redirect?

[... skipping similar frames: HTTPConnectionPool.urlopen at line 754 (2 times)]

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:754, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, response_kw) 751 log.warning( 752 "Retrying (%r) after connection broken by '%r': %s", retries, err, url 753 ) --> 754 return self.urlopen( 755 method, 756 url, 757 body, 758 headers, 759 retries, 760 redirect, 761 assert_same_host, 762 timeout=timeout, 763 pool_timeout=pool_timeout, 764 release_conn=release_conn, 765 chunked=chunked, 766 body_pos=body_pos, 767 response_kw 768 ) 770 # Handle redirect?

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:726, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 724 e = ProtocolError("Connection aborted.", e) --> 726 retries = retries.increment( 727 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2] 728 ) 729 retries.sleep()

File ~/anaconda3/lib/python3.11/site-packages/urllib3/util/retry.py:446, in Retry.increment(self, method, url, response, error, _pool, _stacktrace) 445 if new_retry.is_exhausted(): --> 446 raise MaxRetryError(_pool, url, error or ResponseError(cause)) 448 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)

MaxRetryError: HTTPSConnectionPool(host='172.18.86.87', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Logistic_Regression_Demo (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))

During handling of the above exception, another exception occurred:

SSLError Traceback (most recent call last) File ~/anaconda3/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:93, in http_request(host_creds, endpoint, method, max_retries, backoff_factor, extra_headers, retry_codes, timeout, kwargs) 92 try: ---> 93 return _get_http_response_with_retries( 94 method, 95 url, 96 max_retries, 97 backoff_factor, 98 retry_codes, 99 headers=headers, 100 verify=host_creds.verify, 101 timeout=timeout, 102 kwargs, 103 ) 104 except requests.exceptions.Timeout as to:

File ~/anaconda3/lib/python3.11/site-packages/mlflow/utils/request_utils.py:131, in _get_http_response_with_retries(method, url, max_retries, backoff_factor, retry_codes, kwargs) 130 session = _get_request_session(max_retries, backoff_factor, retry_codes) --> 131 return session.request(method, url, kwargs)

File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 588 send_kwargs.update(settings) --> 589 resp = self.send(prep, **send_kwargs) 591 return resp

File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, kwargs) 702 # Send the request --> 703 r = adapter.send(request, kwargs) 705 # Total elapsed time of the request (approximately)

File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:517, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 515 if isinstance(e.reason, _SSLError): 516 # This branch is for urllib3 v1.22 and later. --> 517 raise SSLError(e, request=request) 519 raise ConnectionError(e, request=request)

SSLError: HTTPSConnectionPool(host='172.18.86.87', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Logistic_Regression_Demo (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))

During handling of the above exception, another exception occurred:

MlflowException Traceback (most recent call last) Cell In[2], line 10 7 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 9 # 创建MLflow实验 ---> 10 mlflow.set_experiment("Logistic_Regression_Demo")

File ~/anaconda3/lib/python3.11/site-packages/mlflow/tracking/fluent.py:129, in set_experiment(experiment_name, experiment_id) 127 client = MlflowClient() 128 if experiment_id is None: --> 129 experiment = client.get_experiment_by_name(experiment_name) 130 if not experiment: 131 _logger.info( 132 "Experiment with name '%s' does not exist. Creating a new experiment.", 133 experiment_name, 134 )

File ~/anaconda3/lib/python3.11/site-packages/mlflow/tracking/client.py:507, in MlflowClient.get_experiment_by_name(self, name) 476 def get_experiment_by_name(self, name: str) -> Optional[Experiment]: 477 """ 478 Retrieve an experiment by experiment name from the backend store 479 (...) 505 Lifecycle_stage: active 506 """ --> 507 return self._tracking_client.get_experiment_by_name(name)

File ~/anaconda3/lib/python3.11/site-packages/mlflow/tracking/_tracking_service/client.py:222, in TrackingServiceClient.get_experiment_by_name(self, name) 217 def get_experiment_by_name(self, name): 218 """ 219 :param name: The experiment name. 220 :return: :py:class:mlflow.entities.Experiment 221 """ --> 222 return self.store.get_experiment_by_name(name)

File ~/anaconda3/lib/python3.11/site-packages/mlflow/store/tracking/rest_store.py:307, in RestStore.get_experiment_by_name(self, experiment_name) 305 try: 306 req_body = message_to_json(GetExperimentByName(experiment_name=experiment_name)) --> 307 response_proto = self._call_endpoint(GetExperimentByName, req_body) 308 return Experiment.from_proto(response_proto.experiment) 309 except MlflowException as e:

File ~/anaconda3/lib/python3.11/site-packages/mlflow/store/tracking/rest_store.py:59, in RestStore._call_endpoint(self, api, json_body) 57 endpoint, method = _METHOD_TO_INFO[api] 58 response_proto = api.Response() ---> 59 return call_endpoint(self.get_host_creds(), endpoint, method, json_body, response_proto)

File ~/anaconda3/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:199, in call_endpoint(host_creds, endpoint, method, json_body, response_proto, extra_headers) 197 if method == "GET": 198 call_kwargs["params"] = json_body --> 199 response = http_request(**call_kwargs) 200 else: 201 call_kwargs["json"] = json_body

File ~/anaconda3/lib/python3.11/site-packages/mlflow/utils/rest_utils.py:113, in http_request(host_creds, endpoint, method, max_retries, backoff_factor, extra_headers, retry_codes, timeout, **kwargs) 111 raise InvalidUrlException(f"Invalid url: {url}") from iu 112 except Exception as e: --> 113 raise MlflowException(f"API request to {url} failed with exception {e}")

MlflowException: API request to https://172.18.86.87:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPSConnectionPool(host='172.18.86.87', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Logistic_Regression_Demo (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1002)')))

github-actions[bot] commented 1 year ago

@mlflow/mlflow-team Please assign a maintainer and start triaging this issue.

walternat1ve commented 10 months ago

I have also experienced this issue at least 3 times by now, and each time it crashed my experiments. How can I prevent a failed MLflow request from bringing the whole experiment down with it?

Max retries exceeded with url: /abc/api/2.0/mlflow/runs/get?run_uuid=XXY&run_id=XXX (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))