salesforce / CodeT5

Home of CodeT5: Open Code LLMs for Code Understanding and Generation
https://arxiv.org/abs/2305.07922
BSD 3-Clause "New" or "Revised" License

Reading data for code generation. #27

Closed BakingBrains closed 2 years ago

BakingBrains commented 2 years ago

For the code generation task, should I use the data reading method used for Concode?

import json

def read_concode_examples(filename, data_num):
    """Read (nl, code) examples from a Concode-style JSONL file."""
    examples = []

    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            x = json.loads(line)
            examples.append(
                Example(
                    idx=idx,
                    source=x["nl"].strip(),
                    target=x["code"].strip()
                )
            )
            # enumerate already advances idx; stop once data_num examples are read
            if idx + 1 == data_num:
                break
    return examples
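For reference, both readers assume an `Example` container holding the (source, target) pair; in the CodeT5 repo it is defined in `_utils.py`. A minimal stand-in (a sketch for running these snippets in isolation, not the repo's exact class) would be:

```python
from dataclasses import dataclass

@dataclass
class Example:
    """One training pair: an index, a natural-language source, and a code target."""
    idx: int
    source: str
    target: str
```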

Or should I use the data reading method for code summarization (here replacing source with the docstring_tokens and target with code_tokens)?

import json

def read_summarize_examples(filename, data_num):
    """Read examples from filename."""
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            js = json.loads(line)
            if 'idx' not in js:
                js['idx'] = idx
            code = ' '.join(js['code_tokens']).replace('\n', ' ')
            code = ' '.join(code.strip().split())
            nl = ' '.join(js['docstring_tokens']).replace('\n', '')
            nl = ' '.join(nl.strip().split())
            examples.append(
                Example(
                    idx=idx,
                    source=nl,
                    target=code,
                )
            )
            if idx + 1 == data_num:
                break
    return examples

Any suggestions?

Also, do I need to change args.max_source_length = 256 and args.max_target_length = 128 for the code generation task?

yuewang-cuhk commented 2 years ago

Hi, the data reading functions can certainly be customized to your needs. If you want to fine-tune on the Concode code generation task, you can employ the former, read_concode_examples. If you want to reverse the CodeSearchNet summarization task into a text-to-code generation task, you can modify read_summarize_examples.
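To make the second option concrete, here is a small self-contained round trip (the record contents and temp file are made up for illustration): write one CodeSearchNet-style JSONL record, then read it back so the docstring becomes the source and the code becomes the target, in the text-to-code direction.

```python
import json
import tempfile

# One made-up CodeSearchNet-style record with code_tokens / docstring_tokens fields
record = {
    "code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
    "docstring_tokens": ["Add", "two", "numbers", "."],
}

with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(record) + "\n")
    path = f.name

pairs = []
with open(path, encoding="utf-8") as f:
    for idx, line in enumerate(f):
        js = json.loads(line)
        # text-to-code direction: docstring -> source, code -> target
        source = " ".join(" ".join(js["docstring_tokens"]).split())
        target = " ".join(" ".join(js["code_tokens"]).split())
        pairs.append((idx, source, target))

print(pairs[0][1])  # "Add two numbers ."
print(pairs[0][2])  # "def add ( a , b ) : return a + b"
```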

For the maximum source/target lengths, these are usually determined by the tokenized lengths of your (source, target) pairs and, in some cases, by GPU memory limits. You can tune these hyper-parameters as well.
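One rough way to pick those lengths from your data is to look at a high percentile of the token-length distribution over your pairs. A sketch (tokenize here is a whitespace stand-in; with CodeT5 you would tokenize with the model's pretrained tokenizer instead, so the numbers below are only illustrative):

```python
def percentile_length(texts, tokenize, pct=95):
    """Return the pct-th percentile token length over texts (nearest-rank)."""
    lengths = sorted(len(tokenize(t)) for t in texts)
    # nearest-rank index into the sorted lengths
    k = min(len(lengths) - 1, round(pct / 100 * (len(lengths) - 1)))
    return lengths[k]

# Hypothetical pairs; in practice, run this over your whole training set
sources = ["concat two strings", "sort a list of integers in descending order"]
targets = ["return a + b", "return sorted ( xs , reverse = True )"]
print(percentile_length(sources, str.split))  # candidate for max_source_length
print(percentile_length(targets, str.split))  # candidate for max_target_length
```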