run-llama / llama-hub

A library of data loaders for LLMs made by the community -- to be used with LlamaIndex and/or LangChain
https://llamahub.ai/
MIT License
3.46k stars 733 forks source link

Showing ConnectTimeout: HTTPSConnectionPool error #353

Closed abhijeetGithu closed 1 year ago

abhijeetGithu commented 1 year ago

Using llama-hub I created a YouTube video summarizer where I have used default imports like:

from llama_index import StorageContext, load_index_from_storage
from llama_index import VectorStoreIndex
import streamlit as st
from llama_index import download_loader
from llama_index import GPTVectorStoreIndex
from llama_index import LLMPredictor, GPTVectorStoreIndex, PromptHelper, ServiceContext
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json
import datetime
from html2image import Html2Image

The first time I ran the application it worked successfully, but afterwards it started showing an error for the URL: /emptycrown/llama-hub/main/llama_hub/file/requirements.txt

The whole error which started coming again and again:

ConnectTimeout: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /emptycrown/llama-hub/main/llama_hub/file/requirements.txt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000015BBC8D5DD0>, 'Connection to raw.githubusercontent.com timed out. (connect timeout=None)'))

I do not understand what error is coming from requirements.txt in llama_hub, or why it is showing the ConnectTimeoutError. Please suggest a solution and explain what exactly the problem is.

For any reference i am providing the whole code block of my project:

import os

# SECURITY FIX: the original code hard-coded a live OpenAI API key here, and
# that key was published in a public GitHub issue. A committed/published key
# must be treated as compromised and revoked immediately. Never embed secrets
# in source; supply the key via the environment instead
# (e.g. `export OPENAI_API_KEY=sk-...` before launching the app).
if not os.environ.get("OPENAI_API_KEY"):
    # Warn early rather than failing deep inside the first OpenAI call.
    print("WARNING: OPENAI_API_KEY is not set; OpenAI calls will fail.")

from llama_index import StorageContext, load_index_from_storage
from llama_index import VectorStoreIndex
import streamlit as st
from llama_index import download_loader
from llama_index import GPTVectorStoreIndex
from llama_index import LLMPredictor, GPTVectorStoreIndex, PromptHelper, ServiceContext
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json
import datetime
from html2image import Html2Image

# --- File locations and UI defaults --------------------------------------
doc_path = './data/'                                # directory scanned by the document loader
transcript_file = './data/JsonScriptfilename.json'  # cached transcript, written as JSON
index_file = 'storage'                              # persist directory for the vector index
youtube_img = 'Youtube.png'                         # sidebar thumbnail screenshot file
youtube_link = ''                                   # replaced by the text-input value below

# Extra code of llm.py

# Initialize the Streamlit session-state slot that carries the parsed video
# id across script reruns; the empty string means "nothing submitted yet".
if 'video_id' not in st.session_state:
    st.session_state.video_id = ''

def send_click():
    """Parse the entered YouTube link and store the 11-char video id in session state.

    Fix: the original ``youtube_link.split("v=")[1]`` raised ``IndexError``
    for any link without a ``v=`` query parameter (e.g. ``youtu.be/<id>``
    short links). Links containing ``v=`` behave exactly as before.
    """
    parts = youtube_link.split("v=")
    if len(parts) > 1:
        # Standard watch URL: take the 11 characters after "v=".
        st.session_state.video_id = parts[1][:11]
    else:
        # Fallback for short links: use the last path segment as the id.
        st.session_state.video_id = youtube_link.rstrip('/').rsplit('/', 1)[-1][:11]

# Build the static page chrome: title, a sidebar placeholder that is filled
# once a video is being processed, the link input, and the button whose
# callback parses the link into session state.
index = None
st.title("Abhijeeet's Video Summarizer")

sidebar_placeholder = st.sidebar.container()
youtube_link = st.text_input("Youtube link:")
st.button("Summarize!", on_click=send_click)

# Main pipeline: runs on every Streamlit rerun once a video id has been
# submitted. Fetches the transcript, builds a vector index over it, then
# summarizes the video overall and in ~5-minute sections.
if st.session_state.video_id != '':

    progress_bar = st.progress(5, text=f"Summarizing...")

    # Fetch the English transcript and persist it as JSON so the directory
    # loader below picks it up from ./data/.
    srt = YouTubeTranscriptApi.get_transcript(st.session_state.video_id, languages=['en'])
    formatter = JSONFormatter()
    json_formatted = formatter.format_transcript(srt)
    with open(transcript_file, 'w') as f: 
        f.write(json_formatted)

    # Screenshot the video page to display as a thumbnail in the sidebar.
    hti = Html2Image()
    hti.screenshot(url=f"https://www.youtube.com/watch?v={st.session_state.video_id}", save_as=youtube_img)

    # NOTE(review): download_loader fetches loader code and its
    # requirements.txt from raw.githubusercontent.com at runtime -- this is
    # the network call behind the reported ConnectTimeout. It runs on every
    # rerun; presumably caching the loader locally would avoid the repeated
    # fetch -- verify against the llama_index download_loader options.
    SimpleDirectoryReader = download_loader("SimpleDirectoryReader")

    loader = SimpleDirectoryReader(doc_path, recursive=True, exclude_hidden=True)
    documents = loader.load_data()

    sidebar_placeholder.header('Current Processing Video')
    sidebar_placeholder.image(youtube_img)
    # Show at most the first 10,000 characters of the loaded document text.
    sidebar_placeholder.write(documents[0].get_text()[:10000]+'...')

    # define LLM
    # llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=500))
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo",max_tokens=500))
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    index =GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    # index.save_to_disk(index_file)
    index.storage_context.persist()

    # Reload the just-persisted index from the 'storage' directory.
    storage_context = StorageContext.from_defaults(persist_dir=index_file)
    index = load_index_from_storage(storage_context)

    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

    ## this is the debugging code which i am using just for checking only

    # NOTE(review): the two steps below rebuild and then reload the index,
    # overwriting the `index` created above -- redundant work the author
    # kept in for debugging; only the final `index` is used.
    # when first building the index
    index = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    # when loading the index from disk
    index = load_index_from_storage(
        storage_context,service_context=service_context,   
    )

    query_engine = index.as_query_engine()
    response = query_engine.query("summarize this video only")
    print(response)

    # Walk the transcript in ~5-minute (300 s) windows, summarizing each.
    section_texts = ''
    section_start_s = 0

    with open(transcript_file, 'r') as f:
        transcript = json.load(f)    

    start_text = transcript[0]["text"]

    # One progress step per 300 s window (plus slack), and a rough
    # time-remaining estimate assuming ~20 s of LLM latency per window.
    progress_steps = int(transcript[-1]["start"]/300+2)
    progress_period = int(100/progress_steps)
    progress_timeleft = str(datetime.timedelta(seconds=20*progress_steps))
    percent_complete = 5
    progress_bar.progress(percent_complete, text=f"Summarizing...{progress_timeleft} left")

    section_response = ''

    for d in transcript:

        # Accumulate entries until the current 300 s window closes (or we
        # reach the final entry), then summarize the window just finished.
        if d["start"] <= (section_start_s + 300) and transcript.index(d) != len(transcript) - 1:
            section_texts += ' ' + d["text"]

        else:
            end_text = d["text"]

            prompt = f"summarize this article from \"{start_text}\" to \"{end_text}\", limited in 100 words, start with \"This section of video\""
            #print(prompt)
            response =  query_engine.query(prompt)
            start_time = str(datetime.timedelta(seconds=section_start_s))
            end_time = str(datetime.timedelta(seconds=int(d['start'])))

            # Advance to the next window.
            section_start_s += 300
            start_text = d["text"]
            section_texts = ''

            # Markdown-formatted per-section summary with its time range.
            section_response += f"**{start_time} - {end_time}:**\n\r{response}\n\r"      

            percent_complete += progress_period
            progress_steps -= 1
            progress_timeleft = str(datetime.timedelta(seconds=20*progress_steps))
            progress_bar.progress(percent_complete, text=f"Summarizing...{progress_timeleft} left")

    # Final pass: summarize the concatenation of all per-section summaries.
    prompt = "Summarize this article of a video, start with \"This Video\", the article is: " + section_response
    #print(prompt)
    response =  query_engine.query(prompt)

    progress_bar.progress(100, text="Completed!")
    st.subheader("Summary:")
    st.success(response, icon= "🤖")

    with st.expander("Section Details: "):
        st.write(section_response)

    # Reset the id so the next rerun waits for a new link, then halt this run.
    st.session_state.video_id = ''
    st.stop()
abhijeetGithu commented 1 year ago

Can anyone help me with this error, please?

EmanuelCampos commented 1 year ago

The issue doesn't seem to be on the llama_index or llama_hub side, but on the YouTube transcript API.

I would recommend you to use our audio loader https://llamahub.ai/l/file-audio

Closing this issue for now, since it doesn't seem to be a problem on our side.