ArangoDB-Community / pyArango

Python Driver for ArangoDB with built-in validation
https://pyarango.readthedocs.io/en/latest/
Apache License 2.0
238 stars 90 forks source link

While running the sample code for performance analysis, memory usage soared to 100% within a few minutes. #133

Open kaylee000 opened 5 years ago

kaylee000 commented 5 years ago

My Environment

ArangoDB Version: 3.4.0 · Storage Engine: RocksDB · Deployment Mode: Single Server · pyArango Version: 1.3.2 · Operating System: Ubuntu 16.04 · Hard disk capacity: 20 GB · RAM: 2 GB

Sample Code

#!/usr/bin/python
import json as json_mod
import sys
import random
import statsd
import pyArango
from datetime import datetime
from pyArango.connection import *
from pyArango.collection import *

import requests
from gevent import monkey
from gevent import GreenletExit

monkey.patch_all()
import gevent
import grequests

# Placeholder binding only; the name is rebound to a real statsd client
# further down in this script.
statsdc = {}
# Cached JWT; stays None until get_auth_token() has obtained one.
auth_token = None
# Base URLs of the ArangoDB endpoints that get_auth_token() tries in order.
connection_urls = ["http://127.0.0.1:8529"]

class JWTAuth(requests.auth.AuthBase):
    """Requests auth hook that sends a JWT as a bearer token."""

    def __init__(self, token):
        # Value placed after "Bearer " in the Authorization header.
        self.token = token

    def __call__(self, r):
        """Set the Authorization header on request *r* and return *r*."""
        bearer_value = 'Bearer %s' % self.token
        r.headers['Authorization'] = bearer_value
        return r

def get_auth_token():
    """Return a JWT for the ``root`` user, requesting it on first use.

    Each URL in ``connection_urls`` is tried in order by POSTing the
    credentials to ``<url>/_open/auth``.  The first successful response that
    has a body ends the search.  The token is cached in the module-level
    ``auth_token`` so later calls return it without another HTTP request.

    Returns:
        The value of the ``jwt`` field of the server's reply, or ``None`` if
        no endpoint produced a usable response.
    """
    global auth_token

    if auth_token:
        return auth_token

    # Serialise with the JSON module instead of string formatting so the
    # credentials are always escaped correctly.
    payload = json_mod.dumps({"username": "root", "password": ""})
    for connection_url in connection_urls:
        try:
            response = requests.post('%s/_open/auth' % connection_url, data=payload)
        except requests.exceptions.RequestException:
            # Endpoint unreachable: move on to the next configured URL
            # instead of aborting the whole lookup.
            continue
        if not response.ok:
            continue
        json_data = response.content
        if json_data:
            data_dict = json_mod.loads(json_data.decode('utf-8'))
            auth_token = data_dict.get('jwt')
            break
    return auth_token

class AikidoSession(object):
    """Session object that issues every HTTP request through grequests.

    Each call is sent as a single-element ``grequests.map`` batch and the one
    result of that batch is returned.  Connection creation and teardown are
    counted on the module-level statsd client under the ``conn`` key.
    """

    def __init__(self, *args, **kwargs):
        """Create a session.

        Only the first two positional arguments (username, password) are
        used; any further positional or keyword arguments are accepted and
        ignored.  When a username is given, the password is treated as a JWT
        and sent as a bearer token; otherwise no authentication is used.
        """
        statsdc.incr('conn')
        session_username, session_password = args[:2]
        self.auth = JWTAuth(session_password) if session_username else None

    def _send(self, verb, url, kwargs):
        """Send one request with the grequests helper named *verb*.

        *kwargs* is the keyword-argument dict for the request; the session's
        auth object is added to it before the request is built.
        """
        kwargs['auth'] = self.auth
        make_request = getattr(grequests, verb)
        return grequests.map([make_request(url, **kwargs)])[0]

    def post(self, url, data=None, json=None, **kwargs):
        """POST to *url*; *data* and *json* are forwarded only when given."""
        if data is not None:
            kwargs['data'] = data
        if json is not None:
            kwargs['json'] = json
        return self._send('post', url, kwargs)

    def get(self, url, **kwargs):
        """GET *url*."""
        return self._send('get', url, kwargs)

    def put(self, url, data=None, **kwargs):
        """PUT to *url*; *data* is forwarded only when given."""
        if data is not None:
            kwargs['data'] = data
        return self._send('put', url, kwargs)

    def head(self, url, **kwargs):
        """Send a HEAD request to *url*."""
        # This previously issued a PUT; HEAD is the verb the method name
        # promises.
        return self._send('head', url, kwargs)

    def options(self, url, **kwargs):
        """Send an OPTIONS request to *url*."""
        return self._send('options', url, kwargs)

    def patch(self, url, data=None, **kwargs):
        """PATCH *url*; *data* is forwarded only when given."""
        if data is not None:
            kwargs['data'] = data
        return self._send('patch', url, kwargs)

    def delete(self, url, **kwargs):
        """Send a DELETE request to *url*."""
        return self._send('delete', url, kwargs)

    def disconnect(self):
        """Record the end of the session in statsd."""
        statsdc.decr('conn')

# Monkey patch the connection object: code that looks up
# pyArango.connection.AikidoSession from here on gets the grequests-based
# class defined in this script instead of the library's own.
pyArango.connection.AikidoSession = AikidoSession

def microsecs_to_millisec_string(microsecs):
    """Format a duration in microseconds as ``"<millis>.<micros>ms"``.

    The fractional part is always three digits wide, so 1005 microseconds
    is rendered as ``"1.005ms"`` (not ``"1.5ms"``).  Fractions of a
    microsecond are truncated.

    Args:
        microsecs: Duration in microseconds, as an int or float.

    Returns:
        The formatted string, e.g. ``"12.034ms"``.
    """
    total = int(microsecs)
    # Split the magnitude and re-attach the sign afterwards so negative
    # durations are not distorted by floor division.
    sign = '-' if total < 0 else ''
    millis, micros = divmod(abs(total), 1000)
    return '%s%d.%03dms' % (sign, millis, micros)

def get_time_since(start_time, idstr):
    """Report the time elapsed since *start_time* and return it as text.

    Args:
        start_time: ``datetime`` marking the start of the measured interval.
        idstr: statsd key under which the elapsed time is reported.

    Returns:
        The elapsed time formatted by ``microsecs_to_millisec_string``.
    """
    diff = datetime.now() - start_time
    # total_seconds() already includes the sub-second part; adding
    # diff.microseconds on top would count it twice.
    microsecs = diff.total_seconds() * 1000 * 1000
    # NOTE(review): the value sent to statsd is in microseconds; confirm that
    # this is the unit the consumer of these timers expects.
    statsdc.timing(idstr, int(microsecs))
    return microsecs_to_millisec_string(microsecs)

# Replace the placeholder with a real statsd client for 127.0.0.1, port 8125.
statsdc = statsd.StatsClient('127.0.0.1', '8125')
# The JWT is handed over as the password; presumably the connection passes it
# on to the patched AikidoSession, which sends it as a bearer token.
conn = Connection(username="root", password=get_auth_token(), statsdClient=statsdc)
# Database handle used by worker() for queries and transactions.
db = conn["_system"]

# JavaScript source passed as the `action` of db.transaction() in worker().
# It runs three queries in sequence (update one user, update groups reached
# through the user's outbound edges, and a trivial `RETURN 1`) and returns
# the elapsed time of each plus the total.
# NOTE(review): the first query filters on `user._key` although the loop
# variable is `oneUser`, and the names used here (`userToGroups`, `groups`,
# edge attribute `counter`) differ from the revised version later in this
# thread (`userToGroup`, `group`, `groupRelationNo`).
transaction = '''
    function(params) {
      var db = require('@arangodb').db;
      var startOne = Date.now();
      var q1 = db._query(
        `FOR oneUser IN user
            FILTER user._key == @userid
            UPDATE {
              _key: oneUser._key,
              lastseen: @timestamp,
              counter: oneUser.counter + 1
            } IN user`,
         {
          userid: 'user_' + params.i,
          timestamp: params.timestamp
        });
      var startTwo = Date.now();
      var q2 = db._query(`FOR v, e IN 1..1 OUTBOUND @user userToGroups
                            FILTER e.counter == @i
                          UPDATE {
                             _key: v._key,
                             counter: v.counter + 1
                          } IN groups`,
                         {
                           user: 'user/user_' + params.i,
                           i: params.i % 10
                         });
      var startThree = Date.now();
      var q3 = db._query(`RETURN 1`);
      var end = Date.now();
      return {
        tq1: startTwo - startOne,
        tq2: startThree - startTwo,
        tq3: end - startThree,
        all: end - startOne
      };
    }
'''

def worker(i):
    """Simulate one client for user number *i* and report metrics to statsd.

    After a short random delay the worker looks up ``user_<i>`` with an AQL
    query, runs the server-side ``transaction`` for that user and reports
    every timing the transaction returns.  Any ``Exception`` is counted
    under ``errors`` and printed instead of being propagated.

    The ``clients`` counter is incremented while the worker is active and is
    decremented again on every exit path.

    Raises:
        GreenletExit: always, once the work is done, to end the greenlet.
    """
    # add a bit of a variance to the startup
    gevent.sleep(0.1 * random.random())

    statsdc.incr('clients')
    start_time = datetime.now()
    try:
        aql = '''
        FOR user IN user FILTER user._key == @username RETURN user
        '''
        db.AQLQuery(aql, rawResults=True, batchSize=1, count=True, bindVars={'username': 'user_%d' % i})

        times = db.transaction(action=transaction,
                               collections={"read": ['userToGroups'], "write": ['user', 'groups']},
                               params={'i': i, 'timestamp': start_time.isoformat()})['result']
        for which in times:
            statsdc.timing(which, times[which])
    except Exception as e:
        statsdc.incr('errors')
        print('Error in worker %d: error: %s in %s' % (i, str(e), get_time_since(start_time, 'errw')))
    finally:
        # Runs even when something that is not an Exception subclass
        # propagates, so the counter cannot drift upwards.
        statsdc.decr('clients')
    raise GreenletExit

# Command line: <userrange_start> <userrange_end> <req_per_sec>
# One worker is spawned for every user number in
# (userrange_start, userrange_end], at a rate of req_per_sec workers per second.
print(sys.argv)
if len(sys.argv) < 4:
    sys.exit('usage: %s <userrange_start> <userrange_end> <req_per_sec>' % sys.argv[0])
userrange_start = int(sys.argv[1])
userrange_end = int(sys.argv[2])
req_per_sec = float(sys.argv[3])
if req_per_sec <= 0:
    sys.exit('req_per_sec must be greater than zero')

spawn_interval = 1.0 / req_per_sec
while userrange_start < userrange_end:
    userrange_start += 1
    gevent.spawn(worker, userrange_start)
    gevent.sleep(spawn_interval)

# Without this the script would end right after the last spawn and abandon
# the workers that are still in flight.
gevent.wait()

This is a modified version of the code, which solved the high memory usage problem.

P.S.: Only the parts that have changed are excerpted below.

Initializing the data

#! /usr/bin/env python
from pyArango.connection import Connection

conn = Connection(username='root', password='')

db = conn['_system']


def _fresh_collection(name, class_name):
    """Return collection *name* empty: create it if missing, else truncate it."""
    if not db.hasCollection(name):
        return db.createCollection(className=class_name, name=name)
    collection = db.collections[name]
    collection.truncate()
    return collection


userCol = _fresh_collection('user', 'Collection')
groupCol = _fresh_collection('group', 'Collection')
userToGroupCol = _fresh_collection('userToGroup', 'Edges')

# Users are numbered 1..noUsers and stored under the keys user_1..user_<noUsers>.
noUsers = 100000
for i in range(1, noUsers + 1):
    userCol.createDocument({
        '_key': ('user_%d' % i),
        'foo': 'bar',
        'count': i,
        'counter': i,
        'visits': 0,
        'name': ("i am user no %d" % i),
        'somePayLoad': 'lorem Ipsem' * 10
    }).save()

userCol.ensureHashIndex(['count'], sparse=False)

# We have one group each 10 users.  Floor division keeps the count an
# integer regardless of the Python version.
noGroups = noUsers // 10
for i in range(1, noGroups + 1):
    groupCol.createDocument({
        '_key': 'group_%d' % i,
        'counter': i,
        'name': "i am group no %d" % i
    }).save()

# User i gets one edge to each of group_1 .. group_<i % 10>; users whose
# number is a multiple of 10 get none.
for i in range(1, noUsers + 1):
    for j in range(1, i % 10 + 1):
        userToGroupCol.createDocument({
            '_from': 'user/user_%d' % i,
            '_to': 'group/group_%d' % j,
            'groupRelationNo': j,
            'foo': 'bar',
            'name': "i am making user %d a member of group no %d" % (i, j)
        }).save()

The actual test code

# Revised JavaScript transaction source.  Compared with the first version in
# this thread, the first query filters on the loop variable (`oneUser._key`),
# and the second query uses the `userToGroup` / `group` collections and the
# `groupRelationNo` edge attribute.
transaction = '''
function(params) {
var db = require('@arangodb').db;
var startOne = Date.now();
var q1 = db._query(
`FOR oneUser IN user
FILTER oneUser._key == @userid
UPDATE {
_key: oneUser._key,
lastseen: @timestamp,
counter: oneUser.counter + 1
} IN user`,
{
userid: 'user_' + params.i,
timestamp: params.timestamp
});
var startTwo = Date.now();
var q2 = db._query(`FOR v, e IN 1..1 OUTBOUND @user userToGroup
FILTER e.groupRelationNo == @i
UPDATE {
_key: v._key,
counter: v.counter + 1
} IN group`,
{
user: 'user/user_' + params.i,
i: params.i % 10
});
var startThree = Date.now();
var q3 = db._query(`RETURN 1`);
var end = Date.now();
return {
tq1: startTwo - startOne,
tq2: startThree - startTwo,
tq3: end - startThree,
all: end - startOne
};
}
'''

So, is this the author's original intention? Or is there anything wrong with my code?

tariqdaouda commented 5 years ago

Hi @kaylee000,

Thanks for this. I am not sure I understand everything: is there a reason why you reimplement parts of pyArango in your sample?

kaylee000 commented 5 years ago

Hi @tariqdaouda, the sample code is from this website: https://www.arangodb.com/2017/10/performance-analysis-pyarango-usage-scenarios/. I made these changes just to get it to run correctly.

q1

document user/user_101

image

query result

image

q2

one document from userToGroups collection image

query result

image

As shown, neither query returns any results. However, the results changed after I changed the code.

q1 query result

image

q2 query result image