CodyKochmann / graphdb

sqlite based graph database for storing native python objects and their relationships to each other
MIT License
190 stars 10 forks source link

see if dbm is a faster backend for graphdb than sqlite #8

Open CodyKochmann opened 6 years ago

CodyKochmann commented 6 years ago

I have all the respect in the world for sqlite, but since graph databases are recursive, we should probably be open to other engines, especially since dbm comes with python's stdlib as well.

CodyKochmann commented 6 years ago

its rough, but heres the proof of concept to show how graphdb would work in a key-value dbm based setting

In [43]: from base64 import b64encode as b64e

In [44]: b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple()], protocol=dill.HIGHEST_PROTOCOL))
Out[44]: b'gASVHAAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKWUu'

In [45]: s = b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple()], protocol=dill.HIGHEST_PROTOCOL))

In [46]: s
Out[46]: b'gASVHAAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKWUu'

In [47]: from base64 import b64decode as b64d

In [48]: dill.loads(b64d(s))
Out[48]: ['hello', 55, None, ({6: 'l'},), ()]

In [49]: s = b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], protocol=dill.HIGHEST_PROTOCOL))

In [50]: dill.loads(b64d(s))
Out[50]: ['hello', 55, None, ({6: 'l'},), (), <function __main__.<lambda>>]

In [51]: dill.loads(b64d(s))[-1](4) # should return 8
Out[51]: 8

In [52]: import hashlib

In [53]: hashlib.algorithms_available
Out[53]:
{'DSA',
 'DSA-SHA',
 'MD4',
 'MD5',
 'RIPEMD160',
 'SHA',
 'SHA1',
 'SHA224',
 'SHA256',
 'SHA384',
 'SHA512',
 'dsaEncryption',
 'dsaWithSHA',
 'ecdsa-with-SHA1',
 'md4',
 'md5',
 'ripemd160',
 'sha',
 'sha1',
 'sha224',
 'sha256',
 'sha384',
 'sha512',
 'whirlpool'}

In [54]: hashlib.algorithms_guaranteed
Out[54]: {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}

In [55]: sha256 = lambda stream, hasher=hashlib.sha256:hasher(stream).hexdigest()

In [56]: s
Out[56]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='

In [57]: sha256(s)
Out[57]: '9c8d283ad8123f256717522971de06f4de4512de136bdecb879ff8568da231f2'

In [58]: sha256(s)
Out[58]: '9c8d283ad8123f256717522971de06f4de4512de136bdecb879ff8568da231f2'

In [59]: import dbm

In [60]: db = dbm.open('db.dbm', 'c')

In [61]: db
Out[61]: <_gdbm.gdbm at 0x7f013e976af0>

In [62]: db[sha256(s)] = s

In [63]: db[sha256(s)]
Out[63]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='

In [64]: db[sha256(s+'neighbor')] = s
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-64-3f31ca9c414a> in <module>()
----> 1 db[sha256(s+'neighbor')] = s

TypeError: can't concat bytes to str

In [65]: '{}{}'.format(s,'neighbor')
Out[65]: "b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='neighbor"

In [66]: b'{}{}'.format(s,'neighbor')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-66-67a0d9dcfec1> in <module>()
----> 1 b'{}{}'.format(s,'neighbor')

AttributeError: 'bytes' object has no attribute 'format'

In [67]: b'{}{}'.format(s,bytes('neighbor'))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-67-86a9eb41599b> in <module>()
----> 1 b'{}{}'.format(s,bytes('neighbor'))

AttributeError: 'bytes' object has no attribute 'format'

In [68]: s+bytes('neighbor')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-68-f7f80a189415> in <module>()
----> 1 s+bytes('neighbor')

TypeError: string argument without an encoding

In [69]: s+bytes('neighbor'.encode())
Out[69]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4=neighbor'

In [70]: s+'neighbor'.encode()
Out[70]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4=neighbor'

In [71]: db[sha256(s+'neighbor'.encode())] = b64e(dill.dumps({'name':'billy'}))

In [72]: dill.loads(b64d(db[sha256(s+'neighbor'.encode())]))
Out[72]: {'name': 'billy'}

In [73]: sha256('billy')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-73-2860b7aa153c> in <module>()
----> 1 sha256('billy')

<ipython-input-55-a8ccd624649d> in <lambda>(stream, hasher)
----> 1 sha256 = lambda stream, hasher=hashlib.sha256:hasher(stream).hexdigest()

TypeError: Unicode-objects must be encoded before hashing

In [74]: sha256('billy'.encode())
Out[74]: '47798d12ae31ce5a6d9c9dddec7bc9f2a27fffa424b7f2a154fa0e26690972de'

In [75]: def sha256(*args):
    ...:     hasher = hashlib.sha256()
    ...:     for a in args:
    ...:         if type(a) is bytes:
    ...:             hasher.update(a)
    ...:         elif type(a) is str:
    ...:             hasher.update(a.encode())
    ...:         else:
    ...:             hasher.update(b64e(dill.dumps({'name':'billy'})))
    ...:     return hasher.hexdigest()
    ...:

In [76]: def sha256(*args):
    ...:     hasher = hashlib.sha256()
    ...:     for a in args:
    ...:         if type(a) is bytes:
    ...:             hasher.update(a)
    ...:         elif type(a) is str:
    ...:             hasher.update(a.encode())
    ...:         else:
    ...:             hasher.update(b64e(dill.dumps(a)))
    ...:     return hasher.hexdigest()
    ...:

In [77]: def sha256(*args):
    ...:     hasher = hashlib.sha256()
    ...:     for a in args:
    ...:         hasher.update(a if type(a) is bytes else a.encode() if type(a) is str else b64e(dill.dumps(a, protocol=dill.HIGHEST_PROTOCOL)))
    ...:     return hasher.hexdigest()
    ...:

In [78]: sha256(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], 'knows', {'name':'billy'})
Out[78]: '871192a90da086a5d05ad150fb89201976c3a650b2ee80a7dac6320379431f5a'

In [79]: sha256(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], 'knows')
Out[79]: 'eee7f530f936bbf1fd94c789be338cfa9d5330f2bb091fc182d32aaad8517710'

In [80]: sha256({'name':'billy'})
Out[80]: '775233a9313e29615f2cbecbad065799c6f2ebb3ef585a7e5e399f19f0f0a810'

In [81]: class Relationships(type):
    ...:     pass
    ...:

In [82]: sha256(Relationships)
Out[82]: '483636a3c3491ecbe171dadba89f20fc2c0b3fcad508d4c8b4cafdba8aeb7a9a'

In [83]: obj_1=['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4]

In [84]: serialize = lambda o:b64e(dill.dumps(o, protocol=dill.HIGHEST_PROTOCOL))

In [85]: serialize = lambda o:dill.loads(b64d(o))

In [86]: serialize = lambda o:b64e(dill.dumps(o, protocol=dill.HIGHEST_PROTOCOL))

In [87]: deserialize = lambda o:dill.loads(b64d(o))

In [88]: db[sha256(obj_1)] = serialize(obj_1)

In [89]: db[sha256(obj_1,Relationships)] = serialize([])

In [90]: obj_2 = {'name':'billy'}

In [91]: db[sha256(obj_1,Relationships)] = serialize(set())

In [92]: db[sha256(obj_2)] = serialize(obj_2)

In [93]: db[sha256(obj_2,Relationships)] = serialize(set())

In [94]: ob1_relationships = deserialize(db[sha256(obj_1,Relationships)])

In [95]: ob1_relationships
Out[95]: set()

In [96]: from collections import defaultdict

In [97]: defaultdict(set)
Out[97]: defaultdict(set, {})

In [98]: db[sha256(obj_1,Relationships)] = serialize(defaultdict(set))

In [99]: db[sha256(obj_2,Relationships)] = serialize(defaultdict(set))

In [100]: ob1_relationships = deserialize(db[sha256(obj_1,Relationships)])

In [101]: ob1_relationships
Out[101]: defaultdict(set, {})

In [102]: ob1_relationships['knows'].add(sha256(obj_2))

In [103]: ob1_relationships
Out[103]:
defaultdict(set,
            {'knows': {'775233a9313e29615f2cbecbad065799c6f2ebb3ef585a7e5e399f19f0f0a810'}})

In [104]: db[sha256(obj_1,Relationships)] = ob1_relationships
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-104-c45cd6a2072a> in <module>()
----> 1 db[sha256(obj_1,Relationships)] = ob1_relationships

TypeError: gdbm mappings have byte or string elements only

In [105]: db[sha256(obj_1,Relationships)] = serialize(ob1_relationships)

In [106]: obj_1
Out[106]: ['hello', 55, None, ({6: 'l'},), (), <function __main__.<lambda>>]

In [107]: obj_2
Out[107]: {'name': 'billy'}

In [108]: # now query the db to see if you can get obj_2 from obj_1.knows

In [109]: for relative_hash in deserialize(db[sha256(obj_1,Relationships)])['knows']:
     ...:     print(deserialize(db[relative_hash]))
     ...:
{'name': 'billy'}
CodyKochmann commented 6 years ago

and heres the proof of concept that we would actually be getting a speedup from using this model

In [111]: deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))])
Out[111]: {'name': 'billy'}

In [112]: grdb = graphdb('tmp.graph.db')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-112-949c0887e715> in <module>()
----> 1 grdb = graphdb('tmp.graph.db')

TypeError: 'module' object is not callable

In [113]: grdb = graphdb.GraphDB('tmp.graph.db')

In [114]: grdb[obj_1].knows = obj_2
---------------------------------------------------------------------------
PicklingError                             Traceback (most recent call last)
<ipython-input-114-66e825998ca9> in <module>()
----> 1 grdb[obj_1].knows = obj_2

~/py3/lib/python3.4/site-packages/graphdb/__init__.py in __getitem__(self, key)
    253     def __getitem__(self, key):
    254         if self._autostore:
--> 255             self.store_item(key)
    256         return VList([V(self, key)])
    257

~/py3/lib/python3.4/site-packages/graphdb/__init__.py in store_item(self, item)
     89         if self._id_of(item) is None:
     90             #print('storing item', item)
---> 91             blob = self.serialize(item)
     92             self._execute(
     93                 'INSERT into objects (code) values (?);',

~/py3/lib/python3.4/site-packages/graphdb/__init__.py in serialize(item)
     78         return b64e(pickle.dumps(
     79             item,
---> 80             protocol=pickle.HIGHEST_PROTOCOL
     81         ))
     82

PicklingError: Can't pickle <function <lambda> at 0x7f013e8d6bf8>: attribute lookup <lambda> on __main__ failed

In [115]: grdb.serialize = lambda s, o:serialize(o)

In [116]: grdb.deserialize = lambda s, o:deserialize(o)

In [117]: grdb[obj_1].knows = obj_2
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-117-66e825998ca9> in <module>()
----> 1 grdb[obj_1].knows = obj_2

~/py3/lib/python3.4/site-packages/graphdb/__init__.py in __getitem__(self, key)
    253     def __getitem__(self, key):
    254         if self._autostore:
--> 255             self.store_item(key)
    256         return VList([V(self, key)])
    257

~/py3/lib/python3.4/site-packages/graphdb/__init__.py in store_item(self, item)
     89         if self._id_of(item) is None:
     90             #print('storing item', item)
---> 91             blob = self.serialize(item)
     92             self._execute(
     93                 'INSERT into objects (code) values (?);',

TypeError: <lambda>() missing 1 required positional argument: 'o'

In [118]: grdb.serialize = serialize

In [119]: grdb.deserialize = deserialize

In [120]: grdb[obj_1].knows = obj_2

In [121]: next(grdb[obj_1].knows())
Out[121]: {'name': 'billy'}

In [122]: deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))])
Out[122]: {'name': 'billy'}

In [123]: rps(lambda:next(grdb[obj_1].knows()))
Out[123]: 929

In [124]: rps(lambda:deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))]))
Out[124]: 1446
CodyKochmann commented 6 years ago

which is actually kind of surprising since in the proof of concept Im deserializing the entire relationship dict before I even begin looking for the actual object to load.

when checking the speed with of this implementation against the current version in terms of how speed is impacted by deeper and deeper levels of recursion, this key-value version slows down FAR less than the current sql based backend.