NSLS-II / Bug-Reports

Unified issue-tracker for bugs in the data acquisition, management, and analysis software at NSLS-II
BSD 3-Clause "New" or "Revised" License
2 stars 5 forks source link

filestore collections have 64 _id indexes #57

Closed yugangzhang closed 8 years ago

yugangzhang commented 9 years ago

When we use the function `get_events` in Jupyter to open an Eiger HDF5 file, we can only open the data after three tries.

ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)

The error message is:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    240     try:
--> 241         datum = _DATUM_CACHE[eid]
    242     except KeyError:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/boltons/cacheutils.py in __getitem__(self, key)
    157             try:
--> 158                 link = self.link_map[key]
    159             except KeyError:

KeyError: '64babf82-c99d-4378-bc28-eb39f8521898'

During handling of the above exception, another exception occurred:

OperationFailure                          Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
   1481                                     read_preference=ReadPreference.PRIMARY,
-> 1482                                     indexes=[index])
   1483         except OperationFailure as exc:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
    534                              uuid_subtype, compile_re, read_preference,
--> 535                              codec_options, **kwargs)[0]
    536 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in _command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
    439             helpers._check_command_response(result, self.connection.disconnect,
--> 440                                             msg, allowable_errors)
    441 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/helpers.py in _check_command_response(response, reset, msg, allowable_errors)
    212             msg = msg or "%s"
--> 213             raise OperationFailure(msg % errmsg, code, response)
    214 

OperationFailure: command SON([('createIndexes', 'datum'), ('indexes', [{'background': False, 'name': '_id_-1', 'key': SON([('_id', -1)]), 'dropDups': False}])]) on namespace filestore.$cmd failed: no such cmd: createIndexes

During handling of the above exception, another exception occurred:

OperationFailure                          Traceback (most recent call last)
<ipython-input-4-d419f1cc1c9b> in <module>()
----> 1 ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in get_events(headers, fields, fill)
    301                     del event.timestamps[field]
    302                 if fill:
--> 303                     fill_event(event)
    304                 yield event
    305 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in fill_event(event)
    213         if is_external[data_key]:
    214             # Retrieve a numpy array from filestore
--> 215             event.data[data_key] = fs.retrieve(value)
    216 
    217 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in inner(*args, **kwargs)
     19         port = int(conf.connection_config['port'])
     20         db_connect(database=database, host=host, port=port)
---> 21         return func(*args, **kwargs)
     22     return inner
     23 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in retrieve(eid)
    136         The requested data as a numpy array
    137     """
--> 138     return _get_data(eid)

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    242     except KeyError:
    243         keys = ['datum_kwargs', 'resource']
--> 244         d_objs = Datum._get_collection()
    245         # find the current document
    246         edoc = d_objs.find_one({'datum_id': eid})

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in _get_collection(cls)
    175                 cls._collection = db[collection_name]
    176             if cls._meta.get('auto_create_index', True):
--> 177                 cls.ensure_indexes()
    178         return cls._collection
    179 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in ensure_indexes(cls)
    571                 opts.update(spec)
    572                 collection.ensure_index(fields, background=background,
--> 573                                         drop_dups=drop_dups, **opts)
    574 
    575         # If _cls is being used (for polymorphism), it needs an index,

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in ensure_index(self, key_or_list, cache_for, **kwargs)
   1597         if not self.__database.connection._cached(self.__database.name,
   1598                                                   self.__name, name):
-> 1599             return self.create_index(key_or_list, cache_for, **kwargs)
   1600         return None
   1601 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
   1486                 self.__database.system.indexes.insert(index, manipulate=False,
   1487                                                       check_keys=False,
-> 1488                                                       **self._get_wc_override())
   1489             else:
   1490                 raise

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in insert(self, doc_or_docs, manipulate, safe, check_keys, continue_on_error, **kwargs)
    544             message._do_batched_insert(self.__full_name, gen(), check_keys,
    545                                        safe, options, continue_on_error,
--> 546                                        self.uuid_subtype, client)
    547 
    548         if return_one:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in _send_message(self, message, with_last_error, command)
   1270                     response = self.__receive_message_on_socket(1, request_id,
   1271                                                                 sock_info)
-> 1272                     rv = self.__check_response_to_last_error(response, command)
   1273 
   1274                 return rv

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in __check_response_to_last_error(self, response, is_command)
   1214         if code in (11000, 11001, 12582):
   1215             raise DuplicateKeyError(details["err"], code, result)
-> 1216         raise OperationFailure(details["err"], code, result)
   1217 
   1218     def __check_bson_size(self, message):

OperationFailure: add index fails, too many indexes for filestore.datum key:{ _id: -1 }

We encountered this problem before; I thought @tacaswell had already fixed it.

danielballan commented 9 years ago

Indeed this is the same issue showing up again. Thanks for your patience while we try to figure this out.

attn @tacaswell @arkilic, see also https://github.com/NSLS-II/Bug-Reports/issues/47

ghost commented 9 years ago

@tacaswell Did reIndex() work last time?

yugangzhang commented 9 years ago

Is someone doing testing now? The situation has become even worse: I can't load the data even after ten tries.

ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)

I got

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    240     try:
--> 241         datum = _DATUM_CACHE[eid]
    242     except KeyError:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/boltons/cacheutils.py in __getitem__(self, key)
    157             try:
--> 158                 link = self.link_map[key]
    159             except KeyError:

KeyError: 'e623242f-4ce0-476d-b1d1-0d0181b176b3'

During handling of the above exception, another exception occurred:

OperationFailure                          Traceback (most recent call last)
<ipython-input-45-d419f1cc1c9b> in <module>()
----> 1 ev, = get_events(hdr, ['eiger_4M_cam_img_image_lightfield'], fill = True)

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in get_events(headers, fields, fill)
    301                     del event.timestamps[field]
    302                 if fill:
--> 303                     fill_event(event)
    304                 yield event
    305 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in fill_event(event)
    213         if is_external[data_key]:
    214             # Retrieve a numpy array from filestore
--> 215             event.data[data_key] = fs.retrieve(value)
    216 
    217 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in inner(*args, **kwargs)
     19         port = int(conf.connection_config['port'])
     20         db_connect(database=database, host=host, port=port)
---> 21         return func(*args, **kwargs)
     22     return inner
     23 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in retrieve(eid)
    136         The requested data as a numpy array
    137     """
--> 138     return _get_data(eid)

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    244         d_objs = Datum._get_collection()
    245         # find the current document
--> 246         edoc = d_objs.find_one({'datum_id': eid})
    247         if edoc is None:
    248             raise Datum.DoesNotExist()

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in find_one(self, spec_or_id, *args, **kwargs)
   1082                            *args, **kwargs).max_time_ms(max_time_ms)
   1083 
-> 1084         for result in cursor.limit(-1):
   1085             return result
   1086         return None

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in __next__(self)
   1147             raise StopIteration
   1148         db = self.__collection.database
-> 1149         if len(self.__data) or self._refresh():
   1150             if self.__manipulate:
   1151                 return db._fix_outgoing(self.__data.popleft(),

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in _refresh(self)
   1079                               self.__skip, ntoreturn,
   1080                               self.__query_spec(), self.__fields,
-> 1081                               self.__codec_options.uuid_representation))
   1082             if not self.__id:
   1083                 self.__killed = True

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/cursor.py in __send_message(self, message)
   1021                 self.__codec_options.tz_aware,
   1022                 self.__codec_options.uuid_representation,
-> 1023                 self.__compile_re)
   1024         except OperationFailure:
   1025             self.__killed = True

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/helpers.py in _unpack_response(response, cursor_id, as_class, tz_aware, uuid_subtype, compile_re)
    142                                error_object.get("$err"),
    143                                error_object.get("code"),
--> 144                                error_object)
    145 
    146     result = {}

OperationFailure: database error: getFile(): bad file number value (corrupt db?): run repair

ghost commented 9 years ago

@yugangzhang To my knowledge, nobody is working on this right now. Has there been an unclean shutdown or something of that nature at CHX since @tacaswell fixed the issue? This sort of error appears only if the server is shut down abruptly.

ghost commented 9 years ago

It seems the server was shut down incorrectly. It is working again after the fix.

yugangzhang commented 8 years ago

This problem has occurred again!

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    240     try:
--> 241         datum = _DATUM_CACHE[eid]
    242     except KeyError:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/boltons/cacheutils.py in __getitem__(self, key)
    157             try:
--> 158                 link = self.link_map[key]
    159             except KeyError:

KeyError: '269bf22b-b15c-4359-9324-c473ed6fd8a6'

During handling of the above exception, another exception occurred:

OperationFailure                          Traceback (most recent call last)
/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
   1481                                     read_preference=ReadPreference.PRIMARY,
-> 1482                                     indexes=[index])
   1483         except OperationFailure as exc:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
    534                              uuid_subtype, compile_re, read_preference,
--> 535                              codec_options, **kwargs)[0]
    536 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/database.py in _command(self, command, value, check, allowable_errors, uuid_subtype, compile_re, read_preference, codec_options, **kwargs)
    439             helpers._check_command_response(result, self.connection.disconnect,
--> 440                                             msg, allowable_errors)
    441 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/helpers.py in _check_command_response(response, reset, msg, allowable_errors)
    212             msg = msg or "%s"
--> 213             raise OperationFailure(msg % errmsg, code, response)
    214 

OperationFailure: command SON([('createIndexes', 'datum'), ('indexes', [{'name': '_id_-1', 'background': False, 'dropDups': False, 'key': SON([('_id', -1)])}])]) on namespace filestore.$cmd failed: no such cmd: createIndexes

During handling of the above exception, another exception occurred:

OperationFailure                          Traceback (most recent call last)
<ipython-input-9-6ed165265c77> in <module>()
      1 if BlueScan:
      2     hdr = db[uid]
----> 3     ev, = get_events(  hdr, [detector] )
      4     imgs = ev['data'][detector]
      5 else:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in get_events(headers, fields, fill)
    301                     del event.timestamps[field]
    302                 if fill:
--> 303                     fill_event(event)
    304                 yield event
    305 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/databroker-0.3.0-py3.4.egg/databroker/databroker.py in fill_event(event)
    213         if is_external[data_key]:
    214             # Retrieve a numpy array from filestore
--> 215             event.data[data_key] = fs.retrieve(value)
    216 
    217 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in inner(*args, **kwargs)
     19         port = int(conf.connection_config['port'])
     20         db_connect(database=database, host=host, port=port)
---> 21         return func(*args, **kwargs)
     22     return inner
     23 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/commands.py in retrieve(eid)
    136         The requested data as a numpy array
    137     """
--> 138     return _get_data(eid)

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/filestore/retrieve.py in get_data(eid, handle_registry)
    242     except KeyError:
    243         keys = ['datum_kwargs', 'resource']
--> 244         d_objs = Datum._get_collection()
    245         # find the current document
    246         edoc = d_objs.find_one({'datum_id': eid})

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in _get_collection(cls)
    175                 cls._collection = db[collection_name]
    176             if cls._meta.get('auto_create_index', True):
--> 177                 cls.ensure_indexes()
    178         return cls._collection
    179 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/mongoengine-0.8.7-py3.4.egg/mongoengine/document.py in ensure_indexes(cls)
    571                 opts.update(spec)
    572                 collection.ensure_index(fields, background=background,
--> 573                                         drop_dups=drop_dups, **opts)
    574 
    575         # If _cls is being used (for polymorphism), it needs an index,

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in ensure_index(self, key_or_list, cache_for, **kwargs)
   1597         if not self.__database.connection._cached(self.__database.name,
   1598                                                   self.__name, name):
-> 1599             return self.create_index(key_or_list, cache_for, **kwargs)
   1600         return None
   1601 

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in create_index(self, key_or_list, cache_for, **kwargs)
   1486                 self.__database.system.indexes.insert(index, manipulate=False,
   1487                                                       check_keys=False,
-> 1488                                                       **self._get_wc_override())
   1489             else:
   1490                 raise

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/collection.py in insert(self, doc_or_docs, manipulate, safe, check_keys, continue_on_error, **kwargs)
    544             message._do_batched_insert(self.__full_name, gen(), check_keys,
    545                                        safe, options, continue_on_error,
--> 546                                        self.uuid_subtype, client)
    547 
    548         if return_one:

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in _send_message(self, message, with_last_error, command)
   1270                     response = self.__receive_message_on_socket(1, request_id,
   1271                                                                 sock_info)
-> 1272                     rv = self.__check_response_to_last_error(response, command)
   1273 
   1274                 return rv

/home/yuzhang/.conda/envs/user_analysis/lib/python3.4/site-packages/pymongo/mongo_client.py in __check_response_to_last_error(self, response, is_command)
   1214         if code in (11000, 11001, 12582):
   1215             raise DuplicateKeyError(details["err"], code, result)
-> 1216         raise OperationFailure(details["err"], code, result)
   1217 
   1218     def __check_bson_size(self, message):

OperationFailure: add index fails, too many indexes for filestore.datum key:{ _id: -1 }

We can get the data only after three tries.

tacaswell commented 8 years ago

You should check that your underlying file system is not corrupted.

ghost commented 8 years ago

@tacaswell I think it's the same mongo error we had before. Could you try repairing it? I'm on vacation today.

yugangzhang commented 8 years ago

This thing happened again!

tacaswell commented 8 years ago

As part of the roll-out for the next cycle, we will be doing significant maintenance on the mongo setup. This should include updating the version of mongo that CHX is running to 3.x or later. We will also do a dump -> reload cycle. You should also get a new version of FS (filestore) that completely removes mongoengine, which is a complicating factor here.

We have not seen this on any other beamline, nor in any of our testing, which suggests that this is something specific to CHX.