Open CodyKochmann opened 6 years ago
its rough, but heres the proof of concept to show how graphdb would work in a key-value dbm based setting
In [43]: from base64 import b64encode as b64e
In [44]: b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple()], protocol=dill.HIGHEST_PROTOCOL))
Out[44]: b'gASVHAAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKWUu'
In [45]: s = b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple()], protocol=dill.HIGHEST_PROTOCOL))
In [46]: s
Out[46]: b'gASVHAAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKWUu'
In [47]: from base64 import b64decode as b64d
In [48]: dill.loads(b64d(s))
Out[48]: ['hello', 55, None, ({6: 'l'},), ()]
In [49]: s = b64e(dill.dumps(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], protocol=dill.HIGHEST_PROTOCOL))
In [50]: dill.loads(b64d(s))
Out[50]: ['hello', 55, None, ({6: 'l'},), (), <function __main__.<lambda>>]
In [51]: dill.loads(b64d(s))[-1](4) # should return 8
Out[51]: 8
In [52]: import hashlib
In [53]: hashlib.algorithms_available
Out[53]:
{'DSA',
'DSA-SHA',
'MD4',
'MD5',
'RIPEMD160',
'SHA',
'SHA1',
'SHA224',
'SHA256',
'SHA384',
'SHA512',
'dsaEncryption',
'dsaWithSHA',
'ecdsa-with-SHA1',
'md4',
'md5',
'ripemd160',
'sha',
'sha1',
'sha224',
'sha256',
'sha384',
'sha512',
'whirlpool'}
In [54]: hashlib.algorithms_guaranteed
Out[54]: {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}
In [55]: sha256 = lambda stream, hasher=hashlib.sha256:hasher(stream).hexdigest()
In [56]: s
Out[56]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='
In [57]: sha256(s)
Out[57]: '9c8d283ad8123f256717522971de06f4de4512de136bdecb879ff8568da231f2'
In [58]: sha256(s)
Out[58]: '9c8d283ad8123f256717522971de06f4de4512de136bdecb879ff8568da231f2'
In [59]: import dbm
In [60]: db = dbm.open('db.dbm', 'c')
In [61]: db
Out[61]: <_gdbm.gdbm at 0x7f013e976af0>
In [62]: db[sha256(s)] = s
In [63]: db[sha256(s)]
Out[63]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='
In [64]: db[sha256(s+'neighbor')] = s
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-64-3f31ca9c414a> in <module>()
----> 1 db[sha256(s+'neighbor')] = s
TypeError: can't concat bytes to str
In [65]: '{}{}'.format(s,'neighbor')
Out[65]: "b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4='neighbor"
In [66]: b'{}{}'.format(s,'neighbor')
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-66-67a0d9dcfec1> in <module>()
----> 1 b'{}{}'.format(s,'neighbor')
AttributeError: 'bytes' object has no attribute 'format'
In [67]: b'{}{}'.format(s,bytes('neighbor'))
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-67-86a9eb41599b> in <module>()
----> 1 b'{}{}'.format(s,bytes('neighbor'))
AttributeError: 'bytes' object has no attribute 'format'
In [68]: s+bytes('neighbor')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-68-f7f80a189415> in <module>()
----> 1 s+bytes('neighbor')
TypeError: string argument without an encoding
In [69]: s+bytes('neighbor'.encode())
Out[69]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4=neighbor'
In [70]: s+'neighbor'.encode()
Out[70]: b'gASV2AAAAAAAAABdlCiMBWhlbGxvlEs3Tn2USwaMAWyUc4WUKYwJZGlsbC5kaWxslIwQX2NyZWF0ZV9mdW5jdGlvbpSTlChoBYwKX2xvYWRfdHlwZZSTlIwIQ29kZVR5cGWUhZRSlChLAUsASwFLAktDQwh8AABkAQAXU5ROSwSGlCmMAXiUhZSMHzxpcHl0aG9uLWlucHV0LTQ5LWQ1ZTY0NjRkMGIxMj6UjAg8bGFtYmRhPpRLAUMAlCkpdJRSlGNfX2J1aWx0aW5fXwpfX21haW5fXwpoEk5OfZR0lFKUZS4=neighbor'
In [71]: db[sha256(s+'neighbor'.encode())] = b64e(dill.dumps({'name':'billy'}))
In [72]: dill.loads(b64d(db[sha256(s+'neighbor'.encode())]))
Out[72]: {'name': 'billy'}
In [73]: sha256('billy')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-73-2860b7aa153c> in <module>()
----> 1 sha256('billy')
<ipython-input-55-a8ccd624649d> in <lambda>(stream, hasher)
----> 1 sha256 = lambda stream, hasher=hashlib.sha256:hasher(stream).hexdigest()
TypeError: Unicode-objects must be encoded before hashing
In [74]: sha256('billy'.encode())
Out[74]: '47798d12ae31ce5a6d9c9dddec7bc9f2a27fffa424b7f2a154fa0e26690972de'
In [75]: def sha256(*args):
...: hasher = hashlib.sha256()
...: for a in args:
...: if type(a) is bytes:
...: hasher.update(a)
...: elif type(a) is str:
...: hasher.update(a.encode())
...: else:
...: hasher.update(b64e(dill.dumps({'name':'billy'})))
...: return hasher.hexdigest()
...:
In [76]: def sha256(*args):
...: hasher = hashlib.sha256()
...: for a in args:
...: if type(a) is bytes:
...: hasher.update(a)
...: elif type(a) is str:
...: hasher.update(a.encode())
...: else:
...: hasher.update(b64e(dill.dumps(a)))
...: return hasher.hexdigest()
...:
In [77]: def sha256(*args):
...: hasher = hashlib.sha256()
...: for a in args:
...: hasher.update(a if type(a) is bytes else a.encode() if type(a) is str else b64e(dill.dumps(a, protocol=dill.HIGHEST_PROTOCOL)))
...: return hasher.hexdigest()
...:
In [78]: sha256(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], 'knows', {'name':'billy'})
Out[78]: '871192a90da086a5d05ad150fb89201976c3a650b2ee80a7dac6320379431f5a'
In [79]: sha256(['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4], 'knows')
Out[79]: 'eee7f530f936bbf1fd94c789be338cfa9d5330f2bb091fc182d32aaad8517710'
In [80]: sha256({'name':'billy'})
Out[80]: '775233a9313e29615f2cbecbad065799c6f2ebb3ef585a7e5e399f19f0f0a810'
In [81]: class Relationships(type):
...: pass
...:
In [82]: sha256(Relationships)
Out[82]: '483636a3c3491ecbe171dadba89f20fc2c0b3fcad508d4c8b4cafdba8aeb7a9a'
In [83]: obj_1=['hello', 55, None, ({6:'l'},), tuple(), lambda x:x+4]
In [84]: serialize = lambda o:b64e(dill.dumps(o, protocol=dill.HIGHEST_PROTOCOL))
In [85]: serialize = lambda o:dill.loads(b64d(o))
In [86]: serialize = lambda o:b64e(dill.dumps(o, protocol=dill.HIGHEST_PROTOCOL))
In [87]: deserialize = lambda o:dill.loads(b64d(o))
In [88]: db[sha256(obj_1)] = serialize(obj_1)
In [89]: db[sha256(obj_1,Relationships)] = serialize([])
In [90]: obj_2 = {'name':'billy'}
In [91]: db[sha256(obj_1,Relationships)] = serialize(set())
In [92]: db[sha256(obj_2)] = serialize(obj_2)
In [93]: db[sha256(obj_2,Relationships)] = serialize(set())
In [94]: ob1_relationships = deserialize(db[sha256(obj_1,Relationships)])
In [95]: ob1_relationships
Out[95]: set()
In [96]: from collections import defaultdict
In [97]: defaultdict(set)
Out[97]: defaultdict(set, {})
In [98]: db[sha256(obj_1,Relationships)] = serialize(defaultdict(set))
In [99]: db[sha256(obj_2,Relationships)] = serialize(defaultdict(set))
In [100]: ob1_relationships = deserialize(db[sha256(obj_1,Relationships)])
In [101]: ob1_relationships
Out[101]: defaultdict(set, {})
In [102]: ob1_relationships['knows'].add(sha256(obj_2))
In [103]: ob1_relationships
Out[103]:
defaultdict(set,
{'knows': {'775233a9313e29615f2cbecbad065799c6f2ebb3ef585a7e5e399f19f0f0a810'}})
In [104]: db[sha256(obj_1,Relationships)] = ob1_relationships
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-104-c45cd6a2072a> in <module>()
----> 1 db[sha256(obj_1,Relationships)] = ob1_relationships
TypeError: gdbm mappings have byte or string elements only
In [105]: db[sha256(obj_1,Relationships)] = serialize(ob1_relationships)
In [106]: obj_1
Out[106]: ['hello', 55, None, ({6: 'l'},), (), <function __main__.<lambda>>]
In [107]: obj_2
Out[107]: {'name': 'billy'}
In [108]: # now query the db to see if you can get obj_2 from obj_1.knows
In [109]: for relative_hash in deserialize(db[sha256(obj_1,Relationships)])['knows']:
...: print(deserialize(db[relative_hash]))
...:
{'name': 'billy'}
and heres the proof of concept that we would actually be getting a speedup from using this model
In [111]: deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))])
Out[111]: {'name': 'billy'}
In [112]: grdb = graphdb('tmp.graph.db')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-112-949c0887e715> in <module>()
----> 1 grdb = graphdb('tmp.graph.db')
TypeError: 'module' object is not callable
In [113]: grdb = graphdb.GraphDB('tmp.graph.db')
In [114]: grdb[obj_1].knows = obj_2
---------------------------------------------------------------------------
PicklingError Traceback (most recent call last)
<ipython-input-114-66e825998ca9> in <module>()
----> 1 grdb[obj_1].knows = obj_2
~/py3/lib/python3.4/site-packages/graphdb/__init__.py in __getitem__(self, key)
253 def __getitem__(self, key):
254 if self._autostore:
--> 255 self.store_item(key)
256 return VList([V(self, key)])
257
~/py3/lib/python3.4/site-packages/graphdb/__init__.py in store_item(self, item)
89 if self._id_of(item) is None:
90 #print('storing item', item)
---> 91 blob = self.serialize(item)
92 self._execute(
93 'INSERT into objects (code) values (?);',
~/py3/lib/python3.4/site-packages/graphdb/__init__.py in serialize(item)
78 return b64e(pickle.dumps(
79 item,
---> 80 protocol=pickle.HIGHEST_PROTOCOL
81 ))
82
PicklingError: Can't pickle <function <lambda> at 0x7f013e8d6bf8>: attribute lookup <lambda> on __main__ failed
In [115]: grdb.serialize = lambda s, o:serialize(o)
In [116]: grdb.deserialize = lambda s, o:deserialize(o)
In [117]: grdb[obj_1].knows = obj_2
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-117-66e825998ca9> in <module>()
----> 1 grdb[obj_1].knows = obj_2
~/py3/lib/python3.4/site-packages/graphdb/__init__.py in __getitem__(self, key)
253 def __getitem__(self, key):
254 if self._autostore:
--> 255 self.store_item(key)
256 return VList([V(self, key)])
257
~/py3/lib/python3.4/site-packages/graphdb/__init__.py in store_item(self, item)
89 if self._id_of(item) is None:
90 #print('storing item', item)
---> 91 blob = self.serialize(item)
92 self._execute(
93 'INSERT into objects (code) values (?);',
TypeError: <lambda>() missing 1 required positional argument: 'o'
In [118]: grdb.serialize = serialize
In [119]: grdb.deserialize = deserialize
In [120]: grdb[obj_1].knows = obj_2
In [121]: next(grdb[obj_1].knows())
Out[121]: {'name': 'billy'}
In [122]: deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))])
Out[122]: {'name': 'billy'}
In [123]: rps(lambda:next(grdb[obj_1].knows()))
Out[123]: 929
In [124]: rps(lambda:deserialize(db[next(iter( deserialize(db[sha256(obj_1,Relationships)])['knows'] ))]))
Out[124]: 1446
which is actually kind of surprising since in the proof of concept Im deserializing the entire relationship dict before I even begin looking for the actual object to load.
when checking the speed with of this implementation against the current version in terms of how speed is impacted by deeper and deeper levels of recursion, this key-value version slows down FAR less than the current sql based backend.
I have all the respect in the world for sqlite, but since graph databases are recursive, we should probably be open to other engines, especially since dbm comes with python's stdlib as well.