Closed yugangzhang closed 8 years ago
Indeed this is the same issue showing up again. Thanks for your patience while we try to figure this out.
attn @tacaswell @arkilic, see also https://github.com/NSLS-II/Bug-Reports/issues/47
@tacaswell Did reIndex() work last time?
Is someone doing testing now? The situation has become even worse: I can't load the data even after trying ten times.
ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)
I got
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
240 try:
--> 241 datum = _DATUM_CACHE[eid]
242 except KeyError:
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/boltons/cacheutils.py in __getitem__(self, key)
157 try:
--> 158 link = self.link_map[key]
159 except KeyError:
KeyError: 'e623242f-4ce0-476d-b1d1-0d0181b176b3'
During handling of the above exception, another exception occurred:
OperationFailure Traceback (most recent call last)
<ipython-input-45-d419f1cc1c9b> in <module>()
----> 1 ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in get_events(headers, fields, fill)
301 del event.timestamps[field]
302 if fill:
--> 303 fill_event(event)
304 yield event
305
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in fill_event(event)
213 if is_external[data_key]:
214 # Retrieve a numpy array from filestore
--> 215 event.data[data_key] = fs.retrieve(value)
216
217
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in inner(*args, **kwargs)
19 port = int(conf.connection_config['port'])
20 db_connect(database=database, host=host, port=port)
---> 21 return func(*args, **kwargs)
22 return inner
23
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in retrieve(eid)
136 The requested data as a numpy array
137 """
--> 138 return _get_data(eid)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
244 d_objs = Datum._get_collection()
245 # find the current document
--> 246 edoc = d_objs.find_one({'datum_id': eid})
247 if edoc is None:
248 raise Datum.DoesNotExist()
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in find_one(self, spec_or_id, *args, **kwargs)
1082 *args, **kwargs).max_time_ms(max_time_ms)
1083
-> 1084 for result in cursor.limit(-1):
1085 return result
1086 return None
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in __next__(self)
1147 raise StopIteration
1148 db = self.__collection.database
-> 1149 if len(self.__data) or self._refresh():
1150 if self.__manipulate:
1151 return db._fix_outgoing(self.__data.popleft(),
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in _refresh(self)
1079 self.__skip, ntoreturn,
1080 self.__query_spec(), self.__fields,
-> 1081 self.__codec_options.uuid_representation))
1082 if not self.__id:
1083 self.__killed = True
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in __send_message(self, message)
1021 self.__codec_options.tz_aware,
1022 self.__codec_options.uuid_representation,
-> 1023 self.__compile_re)
1024 except OperationFailure:
1025 self.__killed = True
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/helpers.py in _unpack_response(response, cursor_id, as_class, tz_aware, uuid_subtype, compile_re)
142 error_object.get("$err"),
143 error_object.get("code"),
--> 144 error_object)
145
146 result = {}
OperationFailure: database error: getFile(): bad file number value (corrupt db?): run repair
@yugangzhang To my knowledge, nobody is working on this right now. Has there been an unclean shutdown or something of that nature at CHX since @tacaswell fixed the issue? This sort of error appears only if the server is shut down abruptly.
It seems the server was shut down incorrectly. It is working after the fix.
This problem occurs again!
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
240 try:
--> 241 datum = _DATUM_CACHE[eid]
242 except KeyError:
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/boltons/cacheutils.py in __getitem__(self, key)
157 try:
--> 158 link = self.link_map[key]
159 except KeyError:
KeyError: '269bf22b-b15c-4359-9324-c473ed6fd8a6'
During handling of the above exception, another exception occurred:
OperationFailure Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
1481 read_preference=ReadPreference.PRIMARY,
-> 1482 indexes=[index])
1483 except OperationFailure as exc:
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
534 uuid_subtype, compile_re, read_preference,
--> 535 codec_options, **kwargs)[0]
536
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in _command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
439 helpers._check_command_response(result, self.connection.disconnect,
--> 440 msg, allowable_errors)
441
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/helpers.py in _check_command_response(response, reset, msg, allowable_errors)
212 msg = msg or "%s"
--> 213 raise OperationFailure(msg % errmsg, code, response)
214
OperationFailure: command SON([('createIndexes', 'datum'), ('indexes', [{'name': '_id_-1', 'background': False, 'dropDups': False, 'key': SON([('_id', -1)])}])]) on namespace filestore.$cmd failed: no such cmd: createIndexes
During handling of the above exception, another exception occurred:
OperationFailure Traceback (most recent call last)
<ipython-input-9-6ed165265c77> in <module>()
1 if BlueScan:
2 hdr = db[uid]
----> 3 ev, = get_events( hdr, [detector] )
4 imgs = ev['data'][detector]
5 else:
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in get_events(headers, fields, fill)
301 del event.timestamps[field]
302 if fill:
--> 303 fill_event(event)
304 yield event
305
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in fill_event(event)
213 if is_external[data_key]:
214 # Retrieve a numpy array from filestore
--> 215 event.data[data_key] = fs.retrieve(value)
216
217
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in inner(*args, **kwargs)
19 port = int(conf.connection_config['port'])
20 db_connect(database=database, host=host, port=port)
---> 21 return func(*args, **kwargs)
22 return inner
23
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in retrieve(eid)
136 The requested data as a numpy array
137 """
--> 138 return _get_data(eid)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
242 except KeyError:
243 keys = ['datum_kwargs', 'resource']
--> 244 d_objs = Datum._get_collection()
245 # find the current document
246 edoc = d_objs.find_one({'datum_id': eid})
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in _get_collection(cls)
175 cls._collection = db[collection_name]
176 if cls._meta.get('auto_create_index', True):
--> 177 cls.ensure_indexes()
178 return cls._collection
179
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in ensure_indexes(cls)
571 opts.update(spec)
572 collection.ensure_index(fields, background=background,
--> 573 drop_dups=drop_dups, **opts)
574
575 # If _cls is being used (for polymorphism), it needs an index,
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in ensure_index(self, key_or_list, cache_for, **kwargs)
1597 if not self.__database.connection._cached(self.__database.name,
1598 self.__name, name):
-> 1599 return self.create_index(key_or_list, cache_for, **kwargs)
1600 return None
1601
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
1486 self.__database.system.indexes.insert(index, manipulate=False,
1487 check_keys=False,
-> 1488 **self._get_wc_override())
1489 else:
1490 raise
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in insert(self, doc_or_docs, manipulate, safe, check_keys, continue_on_error, **kwargs)
544 message._do_batched_insert(self.__full_name, gen(), check_keys,
545 safe, options, continue_on_error,
--> 546 self.uuid_subtype, client)
547
548 if return_one:
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in _send_message(self, message, with_last_error, command)
1270 response = self.__receive_message_on_socket(1, request_id,
1271 sock_info)
-> 1272 rv = self.__check_response_to_last_error(response, command)
1273
1274 return rv
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in __check_response_to_last_error(self, response, is_command)
1214 if code in (11000, 11001, 12582):
1215 raise DuplicateKeyError(details["err"], code, result)
-> 1216 raise OperationFailure(details["err"], code, result)
1217
1218 def __check_bson_size(self, message):
OperationFailure: add index fails, too many indexes for filestore.datum key:{ _id: -1 }
We can get the data after three tries.
You should check that your underlying file system is not corrupted.
@tacaswell I think it's the same mongo error we had before. Could you try repairing it? I'm on vacation today.
This thing happened again!
As part of the roll-out for the next cycle, we will be doing significant maintenance on the mongo setup. This should include updating the version of mongo CHX is running to 3.+. We will also do a dump -> reload cycle. You should also get a new version of FS, which completely removes mongoengine, which is a complicating factor here.
We have seen this on no other beamline nor in any of our testing, suggesting that this is something specific to CHX.
When we use the function 'get_events' in Jupyter to open an Eiger HDF5 file, we can only open the data after three tries.
The error message is:
We encountered this problem before; I thought @tacaswell had already fixed it.