microsoft / autogen

A programming framework for agentic AI. Discord: https://aka.ms/autogen-dc. Roadmap: https://aka.ms/autogen-roadmap
https://microsoft.github.io/autogen/

[Feature Request]: Scraping GitHub code for retriever ragproxy #2255

Open Repomano opened 3 months ago

Repomano commented 3 months ago

Is your feature request related to a problem? Please describe.

Following #708, I developed some scraping code that downloads .py/.ipynb files from GitHub for the ragproxy agent. I would like to find a way to use an agent in a group chat instead of a direct OpenAI call, with an agent that can decide which project to source from GitHub.
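For reference, a minimal sketch of how the downloaded `codes/` folder could then be handed to the rag proxy agent. This assumes the pyautogen 0.2-style `RetrieveUserProxyAgent`; the exact `initiate_chat` signature differs between versions, and `OAI_CONFIG_LIST` is a placeholder for your own model configuration:

```python
import autogen
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Placeholder config; adjust to your own OAI_CONFIG_LIST / model settings.
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")

assistant = autogen.AssistantAgent(
    name="assistant",
    llm_config={"config_list": config_list},
)

# Point the rag proxy at the files downloaded by the scraper below.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "code",
        "docs_path": "codes",       # folder written by Scraper.add_file
        "chunk_token_size": 2000,
    },
)

# Depending on the pyautogen version, `problem=` alone may be enough
# instead of passing `message=ragproxyagent.message_generator`.
ragproxyagent.initiate_chat(
    assistant,
    message=ragproxyagent.message_generator,
    problem="How do I forecast airport passenger traffic with PyTorch?",
)
```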

Describe the solution you'd like

```python
import os
import re
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import requests
import tiktoken
import torch
import torch.nn as nn
from bs4 import BeautifulSoup
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from transformers import AutoModel, AutoTokenizer

# `DAG` is a small helper class from my own project (not included in this snippet);
# it records the crawl graph and can be converted to a networkx graph.
from dag_helper import DAG  # placeholder import for the helper class

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


class Scraper:

    def __init__(self):
        # Headless Chrome for crawling GitHub / Google pages.
        self.chrome_options = Options()
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--headless=new")  # already enables headless mode
        self.output_chat_list = ['dead-page-title']
        # Sentence-embedding model used to rank README files against the objective.
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.dag = DAG()
        self.visited_pages = []
        os.makedirs("codes", exist_ok=True)  # downloaded files are written here

    def limit_tokens_from_string(self, string: str, model: str, limit: int) -> str:
        """Trim `string` to at most `limit` tokens for the given model."""
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            encoding = tiktoken.get_encoding("gpt2")  # fallback for unknown models
        encoded = encoding.encode(string)
        return encoding.decode(encoded[:limit])

    def openai_call(
        self,
        prompt: str,
        model: str = 'gpt-3.5-turbo',
        temperature: float = 0.2,
        max_tokens: int = 100,
    ):
        client = OpenAI(api_key=OPENAI_API_KEY)
        if not model.lower().startswith("gpt-"):
            # Legacy (non-chat) completion models.
            response = client.completions.create(
                model=model,
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )
            return response.choices[0].text.strip()

        # Chat models: trim the prompt so that prompt + completion fit in the context window.
        trimmed_prompt = self.limit_tokens_from_string(prompt, model, 6000 - max_tokens)
        messages = [{"role": "system", "content": trimmed_prompt}]
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            n=1,
            stop=None,
        )
        return response.choices[0].message.content.strip()

    def generate_embedding(self, sentence):
        # Mean-pool the last hidden state to get a single sentence embedding.
        input_ids = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)['input_ids']
        with torch.no_grad():
            outputs = self.model(input_ids)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach()
        return embeddings

    def compute_similarity(self, target_sentence, sentences):
        """Return cosine similarities of `sentences` to `target_sentence` and the index of the best match."""
        target_embedding = self.generate_embedding(target_sentence)
        embeddings = [self.generate_embedding(sentence) for sentence in sentences]
        cos = nn.CosineSimilarity(dim=0, eps=1e-6)
        similarity_scores = [abs(cos(target_embedding, embedding)) for embedding in embeddings]
        numpy_array = np.array([tensor.item() for tensor in similarity_scores])
        max_index = np.argmax(numpy_array)
        return similarity_scores, max_index

    def prompt_for_chat(self, objective):
        # Ask the model for a Google search URL that targets GitHub projects for the objective.
        input_chat = f"""Consider the following objective: '{objective}'.
                                \nGenerate a Github topic title that contains: 'pytorch forecast'.
                                \nFor example: 'How to forecast with PyTorch'. or 'Forecasting with PyTorch'.
                                \nThe output should be in a format like: https://www.google.com/search?q=forecasting+github+airport
                                \nAnswer: https://www.google.com/search?q="""

        output_chat = self.openai_call(input_chat.replace("    ", ""), max_tokens=4000)
        output_chat = output_chat.replace(" ", "+").replace('"', "").replace('\n', "")
        output_chat = "https://www.google.com/search?q=GitHub+" + output_chat
        return output_chat

    def chose_the_appropriate_project(self, repo_links, context_of_the_objective):
        # Download the README of each candidate repo and pick the one most similar to the objective.
        readmes = []
        for i, repo_link in enumerate(repo_links):
            response = requests.get(repo_link)
            html_content = response.text
            if "README.md" in html_content:
                readme_path = repo_link + "/blob/main/README.md"
                readmes.append(self.add_file(readme_path, i, 'md'))

        scores_similarity, max_readme_similarity_index = self.compute_similarity(context_of_the_objective, readmes)

        return repo_links[max_readme_similarity_index], readmes[max_readme_similarity_index]

    def add_file(self, page_link, counter, type='py'):
        # Fetch the raw file from raw.githubusercontent.com and save it under codes/.
        if type == 'py':
            path = "codes/" + page_link.split("/")[-1][:-3] + ".py"
        elif type == 'md':
            path = "codes/readme_" + str(counter) + ".md"
        else:
            path = "codes/" + page_link.split("/")[-1][:-6] + ".ipynb"
        response = requests.get(page_link.replace("github", "raw.githubusercontent").replace("/blob", "")).text
        with open(path, "w") as f:
            f.write(response)
        print("Downloaded: ", page_link)
        return response

    def scrape(self, driver, page_link, num_downloads):
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, features="html.parser")
        repo_links = []
        if '.py' in page_source or '.ipynb' in page_source:
            # Download every linked .py file.
            pattern = r'(?<=href=")[^"]+\.py'
            py_files = re.findall(pattern, page_source)

            for x in list(set(py_files)):
                if "https://github.com/" in x:
                    web_page = x
                else:
                    web_page = "https://github.com" + x

                print(web_page)
                if "comhttps" not in web_page:
                    self.add_file(web_page, num_downloads, 'py')
                time.sleep(5)
                num_downloads += 1

            # Download every linked .ipynb file.
            pattern = r'(?<=href=")[^"]+\.ipynb'
            notebook_files = re.findall(pattern, page_source)

            for x in list(set(notebook_files)):
                if "https://github.com" in x:
                    web_page = x
                else:
                    web_page = "https://github.com" + x

                print(web_page)
                if "comhttps" not in web_page:
                    self.add_file(web_page, num_downloads, 'ipynb')
                    time.sleep(5)
                num_downloads += 1

        if 'topics' in page_link:
            # GitHub topic page: take the first repository listed.
            repo_elements = soup.find_all('h3', class_='f3', limit=1)
            for elem in repo_elements:
                link = elem.find('a', class_='Link text-bold wb-break-word').get('href')
                repo_links.append('https://github.com' + link)
        elif "google" in page_link:
            # Google search results: collect GitHub repo links, then keep the best-matching project.
            for a_tag in soup.find_all('a', href=True):
                if "https://github.com/" in a_tag['href'] and "=h" not in a_tag['href']:
                    repo_links.append(a_tag['href'])
                    print(a_tag['href'])
            # `objective` is the module-level objective string defined at the bottom of this snippet.
            link_project, readme_chosen_project = self.chose_the_appropriate_project(repo_links, objective)
            with open("codes/README.md", "w") as f:
                f.write(link_project + "\n" + readme_chosen_project)
            repo_links = [link_project]
        else:
            # Repository page: follow links into sub-directories of the same repo.
            for a_tag in soup.find_all('a', href=True):
                tag = 'https://github.com' + a_tag['href']
                if 'tree/' in tag and tag not in self.visited_pages and page_link in tag and '/.' not in tag:
                    repo_links.append(tag)

        return repo_links

    def get_visit_pages(self):
        # Plot the crawl graph and save it as an image.
        self.dag.populate_parents()
        nx_graph = self.dag.to_networkx()
        pos = nx.spring_layout(nx_graph)
        nx.draw(nx_graph, pos, with_labels=True, node_size=30, node_color="skyblue", font_size=3, font_color="black")
        plt.savefig("dag_image.png")
        plt.show()

    def crawl(self, page_link):
        thread_driver = webdriver.Chrome(options=self.chrome_options)
        try:
            if page_link not in self.visited_pages:
                self.visited_pages.append(page_link)
                if self.output_chat_list:
                    thread_driver.get(page_link)
                    local_links = self.scrape(thread_driver, page_link, len(self.visited_pages))

                    if local_links is not None:
                        # Drop links that were already visited.
                        local_links = set(local_links) - set(self.visited_pages)

                    if len(local_links) != 0:
                        for link in local_links:
                            if link not in self.visited_pages:
                                self.dag.add_edge(page_link, link)
                                if '.py' not in page_link and 'ipynb' not in page_link:
                                    self.crawl(link)
        finally:
            print("QUITTING....")
            thread_driver.quit()


# `objective` is assumed to be defined at module level, e.g.:
# objective = "Forecast airport passenger traffic with PyTorch"
scraper_airport = Scraper()
scraper_airport.crawl(scraper_airport.prompt_for_chat(objective))
```
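Roughly what I have in mind for the group chat part: instead of `openai_call` choosing the repository, a dedicated agent in a `GroupChat` could make that decision. A minimal sketch assuming the standard `GroupChat`/`GroupChatManager` API; the agent names, prompts, and candidate list are illustrative only:

```python
import autogen

config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")  # placeholder config
llm_config = {"config_list": config_list}

# Agent that decides which GitHub project to source, given the objective and candidate repos.
repo_selector = autogen.AssistantAgent(
    name="repo_selector",
    system_message=(
        "You are given an objective and a list of candidate GitHub repositories. "
        "Reply with the single repository URL that best matches the objective."
    ),
    llm_config=llm_config,
)

coder = autogen.AssistantAgent(name="coder", llm_config=llm_config)

user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    code_execution_config=False,
)

groupchat = autogen.GroupChat(
    agents=[user_proxy, repo_selector, coder],
    messages=[],
    max_round=6,
)
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

# candidate_repos would come from Scraper.scrape(); placeholders here for illustration only.
candidate_repos = ["https://github.com/<owner>/<repo-1>", "https://github.com/<owner>/<repo-2>"]
user_proxy.initiate_chat(
    manager,
    message=f"Objective: forecast airport traffic with PyTorch.\nCandidates: {candidate_repos}",
)
```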

Additional context

No response

thinkall commented 2 months ago

Hi @Repomano, thank you for raising the FR.

I don't quite understand "I would like to find a way to use an agent in a group chat instead of a direct OpenAI call, with an agent that can decide which project to source from GitHub."

Could you please rephrase it? Thanks.

Hk669 commented 2 months ago

@Repomano Instead of relying on web scraping, which may run into legal complications, consider making direct requests to the GitHub API endpoints with an access token to fetch the code.

Docs: GitHub REST API
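For example, something along these lines (a rough sketch using the search and contents endpoints with a personal access token; the query and paths are placeholders):

```python
import os
import requests

TOKEN = os.getenv("GITHUB_TOKEN")  # personal access token
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/vnd.github+json",
}

# 1) Find candidate repositories for the objective.
search = requests.get(
    "https://api.github.com/search/repositories",
    params={"q": "pytorch forecasting", "sort": "stars", "per_page": 5},
    headers=HEADERS,
).json()
repos = [item["full_name"] for item in search.get("items", [])]

# 2) Walk a repository's contents and download .py / .ipynb files.
def download_code(full_name, path=""):
    url = f"https://api.github.com/repos/{full_name}/contents/{path}"
    for entry in requests.get(url, headers=HEADERS).json():
        if entry["type"] == "dir":
            download_code(full_name, entry["path"])
        elif entry["name"].endswith((".py", ".ipynb")):
            raw = requests.get(entry["download_url"], headers=HEADERS).text
            os.makedirs("codes", exist_ok=True)
            with open(os.path.join("codes", entry["name"]), "w") as f:
                f.write(raw)

if repos:
    download_code(repos[0])
```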