Closed achave11-ucsc closed 4 months ago
This patch was produced during PL:
Subject: [PATCH] [r] Index dcp35 in prod (#5949)
---
Index: src/azul/service/async_manifest_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/service/async_manifest_service.py b/src/azul/service/async_manifest_service.py
--- a/src/azul/service/async_manifest_service.py (revision 9c69f828897a5b9b1c37f96dff7aaf94b04aca27)
+++ b/src/azul/service/async_manifest_service.py (date 1707945397482)
@@ -125,7 +125,7 @@
token = Token.first(execution_id)
input = json.dumps(input)
try:
- # If there already is an exception of the given name, and if that
+ # If there already is an execution of the given name, and if that
# execution is still ongoing and was given the same input as what we
# pass here, `start_execution` will succeed idempotently
execution = self._sfn.start_execution(stateMachineArn=self.machine_arn,
@@ -133,12 +133,12 @@
input=input)
except self._sfn.exceptions.ExecutionAlreadyExists:
# This exception indicates that there is already an execution with
- # the given name but that it is has ended, or that its input differs
+ # the given name but that it has ended, or that its input differs
# from what we were passing now. The latter case is unexpected and
# therefore constitues an error. In the former case we return the
# token so that the client has to make another request to actually
# obtain the resulting manifest. Strictly speaking, we could return
- # the manifest here but it keeps the control flow simpler. This is
+ # the manifest here, but it keeps the control flow simpler. This is
# benevolent race is rare enough to not worry about optimizing.
execution = self._sfn.describe_execution(executionArn=execution_arn)
if execution['input'] != input:
Index: src/azul/service/manifest_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py
--- a/src/azul/service/manifest_service.py (revision 9c69f828897a5b9b1c37f96dff7aaf94b04aca27)
+++ b/src/azul/service/manifest_service.py (date 1707945678727)
@@ -743,8 +743,15 @@
try:
encoded_file_name = tagging[self.file_name_tag]
except KeyError:
- # Can't be absent under S3's strong consistency
- assert False, (object_key, self.file_name_tag)
+ # Some manifest generators apply the tag after creating
+ # the object. If we happened to be right inbetween the
+ # creation and application of tags, we pretend that the
+ # manifest object doesn't exist, assuming that the caller
+ # will tell the client to check back later. Note that this
+ # might even involve the attempted creation of a step-function
+ # execution but because step-functions are idempotent the result
+ # of the … <TODO>
+ return None
else:
encoded_file_name = encoded_file_name.encode('ascii')
return base64.urlsafe_b64decode(encoded_file_name).decode('utf-8')
Assignee to consider next steps.
This race is rare so a fix might be difficult to verify. To increase the probability of the race, a drop! commit should be added, containing a time.sleep()
call between the creation of the manifest object and the application tags to it. The integration_test.IndexingIntegrationTest._test_manifest
test case already covers multiple concurrent requests to the same manifest, and should fail in the presence of that sleep
.
Spike to perform above experiment.
The integration_test.IndexingIntegrationTest._test_manifest test case already covers multiple concurrent requests to the same manifest, and should fail in the presence of that sleep.
@hannes-ucsc: "That's not actually the case because all concurrent requests are made at the same time, observe the object to be missing and get a token presenting the ongoing step function execution. The race is observed when one thread starts the generation and the second thread, a little latter observes the manifest object to exists but not yet tagged. This requires that the initial request made by the two threads is staggered and not at the same time. Assignee to create a dedicated IT test method for this purpose."
This new integration test that reproduces the race conditions for the paged manifest generators, in addition to the drop!
commit make the test fail as expected when the manifest requested is available but not yet tagged.
Index: src/azul/service/manifest_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py
--- a/src/azul/service/manifest_service.py (revision d280daece2fba9f83380079d466188820acec31f)
+++ b/src/azul/service/manifest_service.py (date 1709063955081)
@@ -1290,6 +1290,7 @@
file_name = self.file_name(manifest_key, base_name=partition.file_name)
tagging = self.tagging(file_name)
if tagging is not None:
+ time.sleep(16) # Exposes race between manifest completion and tagging
self.storage.put_object_tagging(object_key, tagging)
return partition.last(file_name)
else:
Index: test/integration_test.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/integration_test.py b/test/integration_test.py
--- a/test/integration_test.py (revision d280daece2fba9f83380079d466188820acec31f)
+++ b/test/integration_test.py (date 1709063938522)
@@ -490,6 +490,8 @@
bundle_fqids=catalog.bundles)
self._test_single_entity_response(catalog=catalog.name)
+ catalog = self.random.choice(catalogs)
+ self._test_manifest_tagging_race(catalog=catalog.name)
for catalog in catalogs:
self._test_manifest(catalog.name)
self._test_dos_and_drs(catalog.name)
@@ -569,29 +571,74 @@
with self.subTest('other_endpoints', endpoint=endpoint, path=path, args=args):
self._check_endpoint(GET, path, args=args, endpoint=endpoint)
- def _test_manifest(self, catalog: CatalogName):
+ def _test_manifest_tagging_race(self, catalog: CatalogName):
supported_formats = self.metadata_plugin(catalog).manifest_formats
assert supported_formats
validators: dict[ManifestFormat, Callable[[str, bytes], None]] = {
ManifestFormat.compact: self._check_compact_manifest,
- ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest,
- ManifestFormat.terra_pfb: self._check_terra_pfb_manifest,
- ManifestFormat.curl: self._check_curl_manifest
+ ManifestFormat.curl: self._check_compact_manifest,
}
- for format in [None, *supported_formats]:
- # IT catalogs with just one public source are always indexed
- # completely if that source contains less than the minimum number of
- # bundles required. So regardless of any randomness employed by this
- # test, manifests derived from these catalogs will always be based
- # on the same content hash. Since the resulting reuse of cached
- # manifests interferes with this test, we need another means of
- # randomizing the manifest key: a random but all-inclusive filter.
- tibi_byte = 1024 ** 4
- filters = {
- self._file_size_facet(catalog): {
- 'within': [[0, tibi_byte + self.random.randint(0, tibi_byte)]]
- }
- }
+ filters = self._init_manifest_filters(catalog)
+
+ format = self.random.choice([*validators])
+
+ endpoint_url = config.service_endpoint
+ manifeset_url = endpoint_url.set(path='/manifest/files')
+
+ with self.subTest('manifest-tagging', catalog=catalog, format=format):
+ args = dict(catalog=catalog, filters=json.dumps(filters))
+ validator = validators[format]
+ args['format'] = format.value
+ manifeset_url.args = args
+
+ responses = list()
+ while True:
+ response = self._get_url(PUT, manifeset_url)
+ responses.append(response)
+ if response.status == 301:
+ # Request the same manifest without following the redirect
+ # in order to expose the race condition that returns an
+ # untagged cached manifest. This happens when a step
+ # function execution has generated a manifest but is still
+ # in the process of tagging the object.
+ time.sleep(2)
+ elif response.status == 302:
+ signed_url = furl(response.headers['Location'])
+ response = self._get_url(GET, signed_url)
+ break
+
+ validator(catalog, response.data)
+
+ execution_ids = self._manifest_execution_ids(responses, fetch=False)
+ self.assertEqual(1, len(execution_ids))
+
+
+ def _init_manifest_filters(self, catalog):
+ # IT catalogs with just one public source are always indexed
+ # completely if that source contains less than the minimum number of
+ # bundles required. So regardless of any randomness employed by this
+ # test, manifests derived from these catalogs will always be based
+ # on the same content hash. Since the resulting reuse of cached
+ # manifests interferes with this test, we need another means of
+ # randomizing the manifest key: a random but all-inclusive filter.
+ tibi_byte = 1024 ** 4
+ return {
+ self._file_size_facet(catalog): {
+ 'within': [[0, tibi_byte + self.random.randint(0, tibi_byte)]]
+ }
+ }
+
+ def _test_manifest(self, catalog: CatalogName):
+ supported_formats = self.metadata_plugin(catalog).manifest_formats
+ assert supported_formats
+ validators: dict[ManifestFormat, Callable[[str, bytes], None]] = {
+ ManifestFormat.compact: self._check_compact_manifest,
+ ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest,
+ ManifestFormat.terra_pfb: self._check_terra_pfb_manifest,
+ ManifestFormat.curl: self._check_curl_manifest
+ }
+ for format in [None, *supported_formats]:
+ filters = self._init_manifest_filters(catalog)
first_fetch = bool(self.random.getrandbits(1))
for fetch in [first_fetch, not first_fetch]:
with self.subTest('manifest', catalog=catalog, format=format, fetch=fetch):
This is the behavior, as per the service & manifest-service lambda logs,
CloudWatch Logs Insights
region: us-east-1
log-group-names: /aws/lambda/azul-service-abrahamsc, /aws/lambda/azul-service-abrahamsc-manifest
start-time: 2024-02-27T18:36:46.273Z
end-time: 2024-02-27T18:37:10.392Z
query-string:
fields @timestamp
| parse @log "lambda/*" as lambda
| filter @message like /Received|Returning|s3\.Create/
| filter @requestId like /c70a6273|aec38af5|c3f42697|be7613d3|752d2d63|cae7bef7|0f78b4c6|70453983|a5016217/
| parse "Received * request for '*'" as method, path
| parse 'catalog": "*"' as catalog
| parse 'Returning * response' as status
| parse 'Location": "*"' as location
| parse "KeyError: '*'" as assertion_error_on
| parse 'azul.boto3*' as manifest_gen
| sort @timestamp asc
| limit 1000
@timestamp | lambda | method | path | catalog | status | location | assertion_error_on | manifest_gen |
---|---|---|---|---|---|---|---|---|
2024-02-27 18:36:57.384 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:36:58.012 | azul-service-abrahamsc | 301 | https://service.abrahamsc.dev.singlecell.gi.ucsc.edu/manifest/files/k8Qgg4BHvIzANeCb4CFIaHbwsvQ6MNd5mWtxkFB9IIcbCF8AAQ== | |||||
2024-02-27 18:37:00.137 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:00.234 | azul-service-abrahamsc | 301 | https://service.abrahamsc.dev.singlecell.gi.ucsc.edu/manifest/files/k8Qgg4BHvIzANeCb4CFIaHbwsvQ6MNd5mWtxkFB9IIcbCF8AAQ== | |||||
2024-02-27 18:37:01.674 | azul-service-abrahamsc-manifest | s3.CreateMultipartUpload: Making POST request to https://s3.amazonaws.com/edu-ucsc-gi-platform-hca-dev-storage-abrahamsc.us-east-1/manifests/aa79a215-1050-536f-a18a-09ac1dbe1248.ad14b32b-dea0-555e-b745-8ebe5bb0958d.tsv?uploads | ||||||
2024-02-27 18:37:01.675 | azul-service-abrahamsc-manifest | s3.CreateMultipartUpload: … without request body | ||||||
2024-02-27 18:37:01.702 | azul-service-abrahamsc-manifest | s3.CreateMultipartUpload: Got 200 response | ||||||
2024-02-27 18:37:01.702 | azul-service-abrahamsc-manifest | s3.CreateMultipartUpload: … with response body b'<?xml version="1.0" encoding="UTF-8"?>\n |
||||||
2024-02-27 18:37:02.360 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:02.439 | azul-service-abrahamsc | 500 | azul_file_name | |||||
2024-02-27 18:37:02.575 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:02.629 | azul-service-abrahamsc | 500 | azul_file_name | |||||
2024-02-27 18:37:02.740 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:02.795 | azul-service-abrahamsc | 500 | azul_file_name | |||||
2024-02-27 18:37:02.912 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:03.005 | azul-service-abrahamsc | 500 | azul_file_name | |||||
2024-02-27 18:37:03.258 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:03.321 | azul-service-abrahamsc | 500 | azul_file_name | |||||
2024-02-27 18:37:03.433 | azul-service-abrahamsc | PUT | /manifest/files | dcp3-it | ||||
2024-02-27 18:37:03.484 | azul-service-abrahamsc | 500 | azul_file_name |
For demo, disable the fix in a personal deployment, re-deploy and run the IT. It should fail. It might help to prime the IT using the delete
and index
variables so we don't have to wait for indexing part of the IT to complete.
The following logs exhibit the race when a service request for a manifest, with an ongoing generation, fails due to the manifest missing the name tag, which are then applied after the service request had return a 500 status code.
CloudWatch Logs Insights
region: us-east-1
log-group-names: /aws/lambda/azul-service-prod, /aws/lambda/azul-service-prod-manifest
start-time: -172800s
end-time: 0s
query-string:
~~~^^^^^^^^^^^^^^^^^^^^ KeyError: 'azul_file_name' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/var/task/chalice/app.py", line 1917, in _get_view_function_response response = view_function(**function_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/app.py", line 1557, in fetch_file_manifest_get return _file_manifest(fetch=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/app.py", line 1583, in _file_manifest return app.manifest_controller.get_manifest_async(query_params=query_params, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/azul/service/manifest_controller.py", line 139, in get_manifest_async manifest = self.service.get_cached_manifest(format=format, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/azul/service/manifest_service.py", line 651, in get_cached_manifest return self._get_cached_manifest(generator_cls, manifest_key) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/azul/service/manifest_service.py", line 690, in _get_cached_manifest file_name = self._get_cached_manifest_file_name(generator_cls, manifest_key) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/azul/service/manifest_service.py", line 747, in _get_cached_manifest_file_name assert False, (object_key, self.file_name_tag) AssertionError: ('manifests/52f339c2-0874-58b0-82a3-eef96ee01af5.58e28026-9ea8-5083-928d-0517707b35a2.tsv', 'azul_file_name')~~~^^^^^^^^^^^^^^^^^^^^ KeyError: 'azul_file_name' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/var/task/chalice/app.py", line 1917, in _get_view_function_response response = view_function(**function_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/app.py", line 1557, in fetch_file_manifest_get return _file_manifest(fetch=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/app.py", line 1583, in _file_manifest return app.manifest_controller.get_manifest_async(query_params=query_params, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/azul/service/manifest_controller.py", line 139, in get_manifest_async manifest = self.service.get_cached_m