DivijJaswal / LLM-Research


Error While loading Words #1

Closed OrionXV closed 3 weeks ago

OrionXV commented 1 month ago

The text splitter is working well; however, the parsed input is not entering the models correctly.
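One way to see what is actually reaching the model (a minimal sketch, assuming the facebook/bart-large-cnn tokenizer used below; `chunks` is a hypothetical stand-in for whatever the splitter returns):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
for i, chunk in enumerate(chunks):  # hypothetical: the splitter's output
    n_tokens = len(tok(chunk)["input_ids"])
    flag = "OVER LIMIT" if n_tokens > tok.model_max_length else "ok"
    print(f"chunk {i}: {n_tokens} tokens ({flag})")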

DivijJaswal commented 1 month ago

import torch
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import logging
from huggingface_hub import login
import evaluate
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import spacy
from sklearn.decomposition import TruncatedSVD
import nltk

nltk.download('wordnet', quiet=True)
logging.set_verbosity_error()

login(token="hf_...")  # token redacted; load it from the environment rather than hardcoding it
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

summarizer = pipeline("summarization", tokenizer=tokenizer, model="facebook/bart-large-cnn")

references = ["The shepherd boy raised two false alarms to trick the villagers that the wolf was grazing his sheep. When a real wolf appears, the villagers ignore his call, thinking it's another lie. The wolf attacks his sheep and the boy learns a lesson after raising those false alarms, that lying results in losing people’s trust."]

def shorten_text(text, chunk_size, method):
    """
    Shortens the text using the specified method.

    :param text: The original text to shorten.
    :param chunk_size: The size of each chunk if needed.
    :param method: Method to shorten the text.
    :return: Shortened text.
    """
    if method == "clipping":
        # Keep only the first chunk_size characters.
        return text[:chunk_size]

    elif method == "iterative":
        # Summarize fixed-size character chunks, then join the pieces.
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summarized_chunks = [summarizer(chunk, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for chunk in chunks]
        return " ".join(summarized_chunks)

    elif method == "random_removal":
        # Randomly delete words until at most chunk_size words remain.
        words = text.split()
        while len(words) > chunk_size:
            index_to_remove = random.randint(0, len(words) - 1)
            del words[index_to_remove]
        return " ".join(words)

    elif method == "sentence_ranking":
        return tfidf_sentence_ranking(text, chunk_size)

    elif method == "sliding_window":
        return sliding_window(text, chunk_size)

    elif method == "entity_filtering":
        return entity_filtering(text, chunk_size)

    elif method == "summarize_summary":
        return summarize_summary(text, chunk_size)

    elif method == "lsa":
        return lsa_text_summarization(text, chunk_size)

def tfidf_sentence_ranking(text, chunk_size):
    sentences = text.split('. ')
    vectorizer = TfidfVectorizer().fit_transform(sentences)
    vectors = vectorizer.toarray()
    sentence_scores = np.sum(vectors, axis=1)

    ranked_sentences = sorted(((score, i, s) for i, (score, s) in enumerate(zip(sentence_scores, sentences))), reverse=True)
    selected_sentences = [s for _, _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)

def sliding_window(text, chunk_size, overlap=100):
    words = text.split()
    windows = []
    for i in range(0, len(words), chunk_size - overlap):
        window = words[i:i + chunk_size]
        windows.append(" ".join(window))

    summarized_windows = [summarizer(w, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for w in windows]
    return " ".join(summarized_windows)

def entity_filtering(text, chunk_size):
    # Keep sentences mentioning key entity types until the length cap is reached.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    important_sentences = []
    for sent in doc.sents:
        if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"] for ent in sent.ents):
            important_sentences.append(sent.text)
        if len(important_sentences) >= chunk_size / 20:
            break
    return " ".join(important_sentences)

def summarize_summary(text, chunk_size):
    """
    Recursively summarize the text until it is under a desired length.
    """
    summarized_text = text
    iteration = 0
    while len(summarized_text) > chunk_size:
        print(f"Iteration {iteration + 1}: Text too long. Summarizing again.")
        summarized_text = shorten_text(summarized_text, 1024, "iterative")
        iteration += 1
    return summarized_text

def lsa_text_summarization(text, chunk_size):
    """
    Use Latent Semantic Analysis (LSA) to extract the most important
    concepts from the text and return the most relevant sentences.
    """
    sentences = text.split('. ')

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    lsa_model = TruncatedSVD(n_components=1, n_iter=100)
    lsa_model.fit(X)
    lsa_scores = lsa_model.transform(X)

    # Rank sentences by their relevance to the main topics
    ranked_sentences = sorted(((lsa_scores[i, 0], s) for i, s in enumerate(sentences)), reverse=True)

    # Select the top N sentences based on LSA scores
    selected_sentences = [s for _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)

def scores(predictions):
    bleu = evaluate.load('bleu')
    results = bleu.compute(predictions=predictions, references=references)
    print(results)

    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=predictions, references=references)
    print(results)

    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    print(results)

    meteor = evaluate.load('meteor')
    results = meteor.compute(predictions=predictions, references=references)
    print(results)

def summarize_text(input_file, num_beams):

    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    print("clipping")
    clipped_text = shorten_text(text, 1024, "clipping")
    # num_beams must be passed as a keyword; a bare positional argument is
    # consumed by the pipeline's __call__ and never reaches generate().
    short_summary = summarizer(clipped_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)
    predictions = [item['summary_text'] for item in short_summary]
    scores(predictions=predictions)

    print("iterative")
    short_summary = shorten_text(text, 1024, "iterative")
    print(short_summary)
    predictions = [short_summary]
    scores(predictions=predictions)

    # random removal     not complete
    removed_text = shorten_text(text, 1024, "random_removal")
    print(len(removed_text.split()))
    short_summary = summarizer(removed_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # sentence ranking   not working
    ranked_text = shorten_text(text, 1024, "sentence_ranking")
    short_summary = summarizer(ranked_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # sliding window     not working
    short_summary = shorten_text(text, 1024, "sliding_window")
    print(short_summary)

    # entity filtering   not working
    filtered_text = shorten_text(text, 1024, "entity_filtering")
    short_summary = summarizer(filtered_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # lsa                not working
    lsa_text = shorten_text(text, 1024, "lsa")
    short_summary = summarizer(lsa_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # print("summarize_summary")
    short_summary = shorten_text(text, 1024, "summarize_summary")
    print(short_summary)
    predictions = [short_summary]
    scores(predictions=predictions)

    # medium_summary = summarizer(text, num_beams=num_beams, min_length=100, max_length=150, do_sample=False)
    # large_summary = summarizer(text, num_beams=num_beams, min_length=150, max_length=200, do_sample=False)
    # print(medium_summary)
    # print(large_summary)
    # print(results)

def main():
    input_file = "./text1.txt"
    print(input_file)
    num_beams = 5
    summarize_text(input_file, num_beams)

if __name__ == '__main__':
    main()

OrionXV commented 1 month ago

Traceback (most recent call last):
  File "C:\Users\ellyt\Desktop\llms\LLM-Research\facebookBart.py", line 207, in <module>
    main()
  File "C:\Users\ellyt\Desktop\llms\LLM-Research\facebookBart.py", line 204, in main
    summarize_text(input_file, num_beams)
  File "C:\Users\ellyt\Desktop\llms\LLM-Research\facebookBart.py", line 163, in summarize_text
    short_summary = summarizer(removed_text, num_beams, min_length=50, max_length=100, do_sample=False)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\text2text_generation.py", line 269, in __call__
    return super().__call__(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\text2text_generation.py", line 167, in __call__
    result = super().__call__(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\base.py", line 1254, in __call__
    return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\base.py", line 1261, in run_single
    model_outputs = self.forward(model_inputs, **forward_params)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\base.py", line 1161, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\pipelines\text2text_generation.py", line 191, in _forward
    output_ids = self.model.generate(**model_inputs, **generate_kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\generation\utils.py", line 1696, in generate
    model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\generation\utils.py", line 539, in _prepare_encoder_decoder_kwargs_for_generation
    model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\models\bart\modeling_bart.py", line 1168, in forward
    embed_pos = self.embed_positions(input)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\models\bart\modeling_bart.py", line 130, in forward
    return super().forward(positions + self.offset)
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\sparse.py", line 163, in forward
    return F.embedding(
  File "C:\Users\ellyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\functional.py", line 2264, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self

OrionXV commented 3 weeks ago

Fixed the issue with loading.