patham9 / NarsGPT

A NARS implemented as a GPT model prompted to invoke reasoning steps, with NARS-based memory and control machinery implemented in Python.
MIT License
40 stars 15 forks source link

integrate :duck: #2

Closed PtrMan closed 1 year ago

PtrMan commented 1 year ago

:duck:

This is D.U.C.K. - Digital Utility Console Kit

It screams, it is adorable, it is stupid, it is semi-working, it can soon be useful for doing STUFF.

(it uses the Vicuna 7B model from the https://github.com/lm-sys/FastChat project for inference)

Here is the code of duck:

#from inference import *

# path of the pretrained checkpoint, handed to from_pretrained() for both tokenizer and model
modelName = '/notebooks/github_vicuna7b/vic7b'
# NOTE(review): only referenced by the commented-out load_model()/generate_stream() calls further down - confirm it is still needed
device = 'cpu'

# file to which the extracted commands are appended (it is opened in append mode, never truncated)
outFilepath = 'out1_3.txt'

useLm = True # use LM or bypass for fast prototyping?

# supergoal as text, it is decomposed into subgoals and also embedded into the exec prompts
supergoal = 'google information about theory of parallel universes' # good results

##############
## LangChain

from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel

# Define a data model
class LangChainA(BaseModel):
    """Schema of one JSON answer of the LM when it is asked for the next sub-goal.

    Used as the pydantic_object of the PydanticOutputParser in the
    'jsonLangchain' output mode.
    """
    # text of the next sub-goal, this is what gets appended to the list of subgoals
    nextsubgoal: str
    # reason given by the LM for the sub-goal, parsed but not used further in this script
    justification: str

#############
## UTILS

# tries to parse a string which starts with a number, a dot and one whitespace, ex: "1. Example text"
# /return the text after the numbering, None if parsing failed
def tryParseNumbered(s):
    import re
    numbered = re.match(r'^\d+\.\s(.+)', s)
    return numbered.group(1) if numbered else None

def tryParseFunctioncall(s):
    """Split a call with one double-quoted argument, ex: 'a("blah")', into (name, argument).

    Both parts are matched greedily. Returns None if s doesn't have this shape.
    """
    import re
    found = re.match(r'^(.*)\(\"(.*)\"\)$', s)
    if found is None:
        return None
    calleeName, argumentText = found.groups()
    return (calleeName, argumentText)

# given a text from the LM, extract the response of the assistant
# returns None if it failed
def extractLmResponse(text):
    assistantMarker = '### Assistant: '
    humanMarker = '### Human: '

    _, foundMarker, answer = text.partition(assistantMarker)
    if not foundMarker:
        return None # the LM output doesn't contain an assistant turn

    # cut at the next human turn, if there is none the whole remaining text is the answer
    # (it is still valid, just truncated which probably isn't too bad)
    return answer.partition(humanMarker)[0]

# helper to classify a line of the response of subgoal2exec
# /return tuple (type, payload) where type is 'codeblock', 'enum' or 'default'
def classifyLineA(line):
    fence = '```'
    if line.startswith(fence):
        # payload is whatever follows the fence on the same line
        return ('codeblock', line[len(fence):])

    # is it a valid enumerated command? ex: "2. BLAH"
    numberedText = tryParseNumbered(line)
    if numberedText is None:
        return ('default', line)

    return ('enum', numberedText) # return with 'payload' of numbered

# specialized parser which classifies every line of the text
# /return list with one (type, payload) tuple per line
def classifyLinesOfLmResponse(text):
    return [classifyLineA(iLine) for iLine in text.split('\n')]

# group classified lines into a list of ('enum', [codelines]) entries
# codelines are the 'default' lines between an opening and a closing 'codeblock' line,
# an 'enum' line closes the group which was collected so far
def groupCodes(list0):
    groups = []
    pendingCode = [] # codelines which were collected since the last flush
    insideFence = False # is the text in a codesection?

    for kind, content in list0:
        if kind == 'codeblock':
            insideFence = not insideFence # every fence line switches the state
        elif kind == 'enum':
            if pendingCode:
                groups.append(('enum', pendingCode))
                pendingCode = []
        elif kind == 'default' and insideFence:
            pendingCode.append(content)
        # everything else is ignored

    # flush the codes which follow the last enum
    if pendingCode:
        groups.append(('enum', pendingCode))

    return groups

#
#model, tokenizer = load_model(modelName, device, 0)
#
#stream0 = generate_stream(model, tokenizer, {}, device, context_len=2048, stream_interval=2)
#
#for iv in stream0:
#    print(iv)
#
#
#exit(0)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, AutoModel

def runPrompt(prompt0, model, tokenizer):
    """Sample a continuation of prompt0 from the model and return the decoded text.

    Only the first sequence of the decoded batch is returned. The prompt is
    printed before the generation and a marker is printed after it.
    """
    # see https://huggingface.co/docs/transformers/tasks/language_modeling
    print(f'<run>>{prompt0}')
    encoded = tokenizer(prompt0, return_tensors="pt")
    generated = model.generate(
        encoded.input_ids,
        max_new_tokens=512,
        do_sample=True,
        top_k=20,
        top_p=0.95,
    )

    print(' ... done')

    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

# generate prompt
def genPrompt(goalToReformulate, partName):
    """Build the prompt text for one stage of the goal pipeline.

    goalToReformulate: text of the goal which is embedded into the prompt.
    partName: selects the prompt template, one of 'goalToExecA', 'goalToExecB',
        'goalToExecC', 'goalToSubgoalsD', 'goalToSubgoalsE', 'goalToSubgoalsF'.
        The 'goalToExecB' and 'goalToExecC' templates also embed the global supergoal.

    Returns the prompt as string, without the '### Human: ' framing.
    Raises ValueError if partName is not one of the known templates
    (before it failed with an UnboundLocalError on the return).
    """
    # all prompts were written for vicuna 7B

    if partName == 'goalToExecA': # OUTDATED
        return f'Write visit(url) to visit the url - ex: visit("http://y.com/"), write google(query) to google for query, write exit() when done. Your goal is to {goalToReformulate}. Give sequence of commands as bullet points!'

    if partName == 'goalToExecB':
        # takes supergoal into account
        # (an earlier variant without shell(command) was overwritten right after it was built, it is removed)
        return f'Write visit(url) to visit the url - ex: visit("http://y.com/"), write google(query) to google for query, write shell(command) to execute <command>, write exit() when done. Supergoal is to {supergoal}. Your current goal is to {goalToReformulate}. Give sequence of commands as bullet points!'

    if partName == 'goalToExecC':
        return f'You execute shell commands like "visit" to visit a website or "google" to google. You also can use any other ubuntu shell command! Your supergoal is to {supergoal}. Your sub-goal is to {goalToReformulate}!'

    if partName == 'goalToSubgoalsD': # DEPRECATED
        return f'As a SOTA AI, your goal is to {goalToReformulate}. Write hierachical sub-goals of your goal as bullet points!'

    if partName == 'goalToSubgoalsE':
        return f'As a SOTA AI, your goal is to {goalToReformulate}. Write hierachical sub-goals of your goal as bullet points (*) at the top level!'

    if partName == 'goalToSubgoalsF':
        # outputs JSON
        # ex: Your goal is to Research about ethical theories and principles. What is the next sub-goal? What is the justification for the sub-goal? Write the answer as JSON with 'nextsubgoal' and 'justification' as keys.
        # NOTE(review): the prompt itself starts with 'ex: ', this looks like it was copied from the example above - confirm
        return f'ex: Your goal is to {goalToReformulate}. What is the next sub-goal? What is the justification for the sub-goal? Write the answer as JSON with \'nextsubgoal\' and \'justification\' as keys.'

    raise ValueError(f'unknown prompt part: {partName!r}')

# load tokenizer and model from the checkpoint at modelName
# NOTE(review): when useLm is False neither `tokenizer` nor `model` gets defined here,
#               while the code further down hands both to runPrompt() - confirm the bypass still works
if useLm:
    tokenizer = AutoTokenizer.from_pretrained(modelName, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(modelName, low_cpu_mem_usage=True) # **kwargs

subgoals = []


# helper to clean up text of subgoal
# (hoisted out of the parsing loop, it was redefined for every bullet line)
def subgoalCleanup(txt):
    # subgoal can end with '.', remove if present
    # NOTE< does subgoal always end with '.'? >
    if txt.endswith('.'):
        return txt[:-1] # remove dot
    return txt


# get subgoals of goal
# the LM output is sampled, so a second attempt is made if the first one gave nothing usable
for iAttempt in range(2): # do multiple attempts

    for z in range(5):
        print('')

    print(f'convert goal to subgoals... prompt=\'{supergoal}\'')

    prompt0 = genPrompt(supergoal, 'goalToSubgoalsD') # goal to execution
    prompt0 = f'### Human: {prompt0}\n'

    # NOTE(review): 'True or' forces the LM path, the bypass below is unreachable and
    #               prototypingResponse isn't defined anywhere in the visible code
    if True or useLm:
        x1 = runPrompt(prompt0, model, tokenizer)
    else:
        x1 = prototypingResponse # HACK< inject fake response for prototyping >

    print(f'{x1}')

    x2 = extractLmResponse(x1)
    if x2 is None:
        # no '### Assistant: ' marker in the output - parsing None would crash, try again instead
        print('warn: no response of the assistant found in LM output, retry!')
        continue

    lmOutType = 'listOfItems' #'jsonLangchain'
    # parse output
    if lmOutType == 'jsonLangchain': # try to parse answer as if it were structured JSON

        # Create a parser for the LangChainA data model
        parser = PydanticOutputParser(pydantic_object=LangChainA)

        lmResult0 = x2

        # HACK< fake response from LM >
        #lmResult0 = 'AA {"nextsubgoal":"A B C"\n,  "justification":"BA BA"} \n{"nextsubgoal":"A B C"\n,  "justification":"BA KFKFKF"}   kfkfkfkf'

        for iSegment0 in lmResult0.split('}'): # split sequence of JSON into parts
            if '{' not in iSegment0:
                # text after the last '}' (or between objects) holds no JSON object, parsing it would raise
                continue

            iSegment0 = iSegment0+'}' # add back end

            a = parser.parse(iSegment0)

            # add to subgoals
            subgoals.append(a.nextsubgoal)

    else: # else just handle it as if it were a list of items
        # convert text of response from LM to text of goals
        lines0 = x2.split('\n')
        for iLine in lines0:
            if iLine.startswith('* '):
                goalText0 = iLine[2:] # remove '* ' at beginning

                # remove text after first ':' because the LM continues with an unnecessary explanation
                idx1 = goalText0.find(':')
                if idx1 != -1:
                    goalText0 = goalText0[:idx1]

                goalText1 = subgoalCleanup(goalText0)
                subgoals.append(goalText1)

                continue

            # sometimes the list is numbered
            parsingResult = tryParseNumbered(iLine)
            if parsingResult is not None:
                subgoals.append(parsingResult)

                continue

            # neither a bullet nor a numbered item
            # (this warning was attached to an 'if True:' before and could never be printed)
            if iLine.strip():
                print(f'warn: line didn\'t start with *, ignore!')

    print('subgoals:')
    print(f'{str(subgoals)}')

    if len(subgoals) > 0:
        break

if len(subgoals) == 0:
    print('warn: no subgoals were found! give up!')
    raise SystemExit(1) # same effect as exit(1), but doesn't depend on the site module

#exit(1) # fail because we are developing

# for testing
#subgoals.append('obtain an overview of the universe, including its size, structure, and composition')
#subgoals.append('find the best car dealer')

import json

# ask the LM for the commands of every subgoal and append the extracted code to outFilepath
subgoalIdx = 0
for iSubgoal in subgoals:

    for z in range(5):
        print('')
    print(f'examine subgoal={iSubgoal}')

    prompt0 = genPrompt(iSubgoal, 'goalToExecC') # goal to execution
    prompt0 = f'### Human: {prompt0}\n'

    x1 = runPrompt(prompt0, model, tokenizer)

    print(f'{x1}')

    x2 = extractLmResponse(x1)

    if x2 is None:
        # no '### Assistant: ' marker in the output - classifying None would crash, skip this subgoal
        print('warn: no response of the assistant found in LM output, skip subgoal!')
    else:
        groupCodes0 = groupCodes(classifyLinesOfLmResponse(x2))
        print(groupCodes0)

        # write commands to file for exec, one JSON object per line
        print(f'write to {outFilepath}')
        with open(outFilepath, "a") as f:
            f.write('\n\n')

            for iGroupCodeType, iGroupCodePayload in groupCodes0:
                iGroupCodePayload1 = str(iGroupCodePayload) # convert python to str

                # json.dumps escapes quotes and backslashes of the commands,
                # the handwritten f-string produced invalid JSON for them
                out0 = json.dumps({"type": "enum", "payload": iGroupCodePayload1})
                f.write(out0+'\n')

    subgoalIdx+=1

# DONE< implement subgoal output parser which looks at ``` when code begins
#       it also has to be able to handle step by step things (???) ! >
#       DONE< use it to extract actionable code! >

# TODO< add code executor >