aliyun / aliyun-odps-python-sdk

ODPS Python SDK and data analysis framework
http://pyodps.readthedocs.io
Apache License 2.0
434 stars 97 forks source link

pyodps 中 replace的用法报错 #47

Closed sangaj closed 6 years ago

sangaj commented 6 years ago

用 replace 函数时报错:tuple index out of range。DataFrame 名为 list,列名为 p,其中有些行为空,有些有数,有些含有 +86。代码为 list.p.replace('+86', '')

我尝试用另外一个数据源的数据,也有相同问题, 我下载了iris的数据,然后传至公共服务器,然后用replace,还是报错相同问题, 是不是本身这个function 有问题,或者能否在文档中举例说明如何使用replace?

qinxuye commented 6 years ago

把错误栈贴出来看下。

sangaj commented 6 years ago

iristest = DataFrame(o.get_table('pyodps_iris'))
iristest.name.replace('Iris',' ')

IndexError Traceback (most recent call last) /home/hjh/anaconda3/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj) 670 type_pprinters=self.type_printers, 671 deferred_pprinters=self.deferred_printers) --> 672 printer.pretty(obj) 673 printer.flush() 674 return stream.getvalue()

/home/hjh/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj) 381 if callable(meth): 382 return meth(obj, self, cycle) --> 383 return _default_pprint(obj, self, cycle) 384 finally: 385 self.end_group()

/home/hjh/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _default_pprint(obj, p, cycle) 501 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs: 502 # A user-provided repr. Find newlines and replace them with p.break_() --> 503 _repr_pprint(obj, p, cycle) 504 return 505 p.begin_group(1, '<')

/home/hjh/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle) 699 """A pprint that just redirects to the normal repr function.""" 700 # Find newlines and replace them with p.break_() --> 701 output = repr(obj) 702 for idx,output_line in enumerate(output.splitlines()): 703 if idx:

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/expr/expressions.py in __repr__(self) 112 if self.__execution is None: 113 try: --> 114 self.__execution = self.execute() 115 except Exception as e: 116 self.__execution = e

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/expr/expressions.py in execute(self, **kwargs) 185 return result 186 --> 187 return self._handle_delay_call('execute', self, wrapper=wrapper, **kwargs) 188 189 def compile(self):

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/expr/expressions.py in _handle_delay_call(self, method, *args, **kwargs) 144 145 wrapper = kwargs.pop('wrapper', None) --> 146 result = getattr(engine, method)(*args, **kwargs) 147 if wrapper is None: 148 return result

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in execute(self, *exprs_args_kwargs, **kwargs) 722 kwargs['ui'] = self._create_ui(**kwargs) 723 kwargs['action'] = '_execute' --> 724 return self._action(*exprs_args_kwargs, **kwargs) 725 726 def persist(self, *exprs_args_kwargs, **kwargs):

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in _action(self, *exprs_args_kwargs, **kwargs) 547 try: 548 res = self._execute_dag(dag, ui=ui, async=async, n_parallel=n_parallel, --> 549 timeout=timeout, progress_proportion=progress_proportion) 550 except KeyboardInterrupt: 551 self.stop()

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in _execute_dag(cls, dag, ui, async, n_parallel, timeout, close_and_notify, progress_proportion) 786 progress_proportion=1.0): 787 return dag.execute(ui=ui, async=async, n_parallel=n_parallel, timeout=timeout, --> 788 close_and_notify=close_and_notify, progress_proportion=progress_proportion) 789 790 def _get_libraries(self, libraries):

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in execute(self, ui, async, n_parallel, timeout, close_and_notify, progress_proportion) 329 try: 330 if n_parallel <= 1: --> 331 results = self._run(ui, progress_proportion) 332 else: 333 results = self._run_in_parallel(ui, n_parallel, progress_proportion=progress_proportion)

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in _run(self, ui, progress_proportion) 205 result_idx = dict() 206 for i, call in enumerate(calls): --> 207 res = call(ui=ui, progress_proportion=progress_proportion / len(calls)) 208 results[i] = res 209 if call.result_index is not None:

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in __call__(self, ui, progress_proportion) 184 185 def __call__(self, ui=None, progress_proportion=None): --> 186 res = self.run(ui=ui, progress_proportion=progress_proportion) 187 if self.callback: 188 self.callback(res)

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/core.py in run(s, **execute_kw) 580 if 'ui' in kw: 581 kw['ui'].add_keys(group_key) --> 582 result = engine._do_execute(expr_dag, expr, **kw) 583 if 'ui' in kw: 584 kw['ui'].remove_keys(group_key)

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/odpssql/engine.py in _do_execute(self, expr_dag, expr, ui, progress_proportion, lifecycle, head, tail, hints, priority, **kw) 323 return result 324 --> 325 sql = self._compile(expr, libraries=libraries) 326 327 cache_data = None

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/odpssql/engine.py in _compile(self, expr, prettify, libraries) 190 191 libraries = self._ctx.prepare_resources(self._get_libraries(libraries)) --> 192 self._ctx.register_udfs(*gen_udf(expr, UDF_CLASS_NAME, libraries=libraries)) 193 194 return backend.compile(expr)

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/odpssql/codegen.py in gen_udf(expr, func_cls_name, libraries) 671 if isinstance(node, MappedExpr): 672 _gen_map_udf(node, func_cls_name, libraries, func, resources, --> 673 func_to_udfs, func_to_resources, func_params) 674 elif isinstance(node, RowAppliedCollectionExpr): 675 _gen_apply_udf(node, func_cls_name, libraries, func, resources,

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/df/backends/odpssql/codegen.py in _gen_map_udf(node, func_cls_name, libraries, func, resources, func_to_udfs, func_to_resources, func_params) 515 'to_type': to_type, 516 'func_cls_name': func_cls_name, --> 517 'func_str': to_str(base64.b64encode(cloudpickle.dumps(func, dump_code=options.df.dump_udf))), 518 'func_args_str': func_args_str, 519 'func_kwargs_str': func_kwargs_str,

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in dumps(obj, protocol, dump_code) 672 673 cp = CloudPickler(file, protocol, dump_code) --> 674 cp.dump(obj) 675 676 return file.getvalue()

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in dump(self, obj) 185 self.inject_addons() 186 try: --> 187 return Pickler.dump(self, obj) 188 except RuntimeError as e: 189 if 'recursion' in e.args[0]:

/home/hjh/anaconda3/lib/python3.6/pickle.py in dump(self, obj) 407 if self.proto >= 4: 408 self.framer.start_framing() --> 409 self.save(obj) 410 self.write(STOP) 411 self.framer.end_framing()

/home/hjh/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id) 474 f = self.dispatch.get(t) 475 if f is not None: --> 476 f(self, obj) # Call unbound method with explicit self 477 return 478

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in save_function(self, obj, name) 268 klass = getattr(themodule, name, None) 269 if klass is None or klass is not obj: --> 270 self.save_function_tuple(obj) 271 return 272

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in save_function_tuple(self, func) 298 write = self.write 299 --> 300 code, f_globals, defaults, closure, dct, base_globals = self.extract_func_data(func) 301 302 save(_fill_function) # skeleton function updater

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in extract_func_data(self, func) 359 360 # extract all global ref's --> 361 func_global_refs = self.extract_code_globals(code) 362 363 # process all variables referenced by global environment

/home/hjh/anaconda3/lib/python3.6/site-packages/odps/lib/cloudpickle.py in extract_code_globals(co) 341 extended_arg = oparg*65536 342 if op in GLOBAL_OPS: --> 343 out_names.add(names[oparg]) 344 345 # see if nested function have any global refs

IndexError: tuple index out of range

qinxuye commented 6 years ago

这个是因为 replace 的内部实现使用了 Python 函数,这个函数要被 pickle 到 MaxCompute 执行,而你的环境是 3.6,而 MaxCompute 内部是 2.7,我们对 3.6 的字节码改写还没有支持。

解决这个问题是使用小于等于 3.5 的版本,最好是用 Python 2.7,这样和 MaxCompute 这边兼容会较好。

wjsi commented 6 years ago

Support for Python 3.6 code has already been added.