OrionXV closed this 3 weeks ago
import random

import numpy as np
import spacy
import torch
import evaluate
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from transformers import pipeline, AutoTokenizer, logging
from huggingface_hub import login

nltk.download('wordnet', quiet=True)
logging.set_verbosity_error()
login(token = "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") tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
summarizer = pipeline("summarization",tokenizer=tokenizer, model="facebook/bart-large-cnn")
references = ["The shepherd boy raised two false alarms to trick the villagers that the wolf was grazing his sheep. When a real wolf appears, the villagers ignore his call, thinking it's another lie. The wolf attacks his sheep and the boy learns a lesson after raising those false alarms, that lying results in losing people’s trust."]
def shorten_text(text, chunk_size, method):
    """
    Shortens the text using the specified method.

    :param text: The original text to shorten.
    :param chunk_size: The size of each chunk if needed.
    :param method: Method to shorten the text.
    :return: Shortened text.
    """
    if method == "clipping":
        return text[:chunk_size]
    elif method == "iterative":
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summarized_chunks = [summarizer(chunk, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for chunk in chunks]
        return " ".join(summarized_chunks)
    elif method == "random_removal":
        words = text.split()
        while len(words) > chunk_size:
            index_to_remove = random.randint(0, len(words) - 1)
            del words[index_to_remove]
        return " ".join(words)
    elif method == "sentence_ranking":
        return tfidf_sentence_ranking(text, chunk_size)
    elif method == "sliding_window":
        return sliding_window(text, chunk_size)
    elif method == "entity_filtering":
        return entity_filtering(text, chunk_size)
    elif method == "summarize_summary":
        return summarize_summary(text, chunk_size)
    elif method == "lsa":
        return lsa_text_summarization(text, chunk_size)
def tfidf_sentence_ranking(text, chunk_size):
    sentences = text.split('. ')
    vectorizer = TfidfVectorizer().fit_transform(sentences)
    vectors = vectorizer.toarray()
    sentence_scores = np.sum(vectors, axis=1)
    ranked_sentences = sorted(((score, i, s) for i, (score, s) in enumerate(zip(sentence_scores, sentences))), reverse=True)
    selected_sentences = [s for _, _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)
def sliding_window(text, chunk_size, overlap=100):
    words = text.split()
    windows = []
    for i in range(0, len(words), chunk_size - overlap):
        window = words[i:i + chunk_size]
        windows.append(" ".join(window))
    summarized_windows = [summarizer(w, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for w in windows]
    return " ".join(summarized_windows)
def entity_filtering(text, chunk_size):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    important_sentences = []
    for sent in doc.sents:
        # spaCy exposes the entity type string as ent.label_, not ent.label
        if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"] for ent in sent.ents):
            important_sentences.append(sent.text)
        if len(important_sentences) >= chunk_size / 20:
            break
    return " ".join(important_sentences)
def summarize_summary(text, chunk_size):
    """
    Recursively summarize the text until it is under a desired length.
    """
    summarized_text = text
    iteration = 0
    while len(summarized_text) > chunk_size:
        print(f"Iteration {iteration + 1}: Text too long. Summarizing again.")
        summarized_text = shorten_text(summarized_text, 1024, "iterative")
        iteration += 1
    return summarized_text
def lsa_text_summarization(text, chunk_size):
    """
    Use Latent Semantic Analysis (LSA) to extract the most important concepts
    from the text and return the most relevant sentences.
    """
    sentences = text.split('. ')
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    lsa_model = TruncatedSVD(n_components=1, n_iter=100)
    lsa_model.fit(X)
    lsa_scores = lsa_model.transform(X)
    # Rank sentences by their relevance to the main topics
    ranked_sentences = sorted(((lsa_scores[i, 0], s) for i, s in enumerate(sentences)), reverse=True)
    # Select the top N sentences based on LSA scores
    selected_sentences = [s for _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)
def scores(predictions):
    bleu = evaluate.load('bleu')
    results = bleu.compute(predictions=predictions, references=references)
    print(results)
    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=predictions, references=references)
    print(results)
    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    print(results)
    meteor = evaluate.load('meteor')
    results = meteor.compute(predictions=predictions, references=references)
    print(results)
def summarize_text(input_file, num_beams):
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    print("clipping")
    clipped_text = shorten_text(text, 1024, "clipping")
    # num_beams must be passed as a keyword argument, not positionally
    short_summary = summarizer(clipped_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)
    predictions = [item['summary_text'] for item in short_summary]
    scores(predictions=predictions)

    print("iterative")
    short_summary = shorten_text(text, 1024, "iterative")
    print(short_summary)
    predictions = [short_summary]
    scores(predictions=predictions)

    # random removal not complete
    removed_text = shorten_text(text, 1024, "random_removal")
    print(len(removed_text.split()))
    short_summary = summarizer(removed_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # sentence ranking not working
    ranked_text = shorten_text(text, 1024, "sentence_ranking")
    short_summary = summarizer(ranked_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # sliding window not working
    short_summary = shorten_text(text, 1024, "sliding_window")
    print(short_summary)

    # entity filtering not working
    filtered_text = shorten_text(text, 1024, "entity_filtering")
    short_summary = summarizer(filtered_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # lsa not working
    lsa_text = shorten_text(text, 1024, "lsa")
    short_summary = summarizer(lsa_text, num_beams=num_beams, min_length=50, max_length=100, do_sample=False)
    print(short_summary)

    # print("summarize_summary")
    short_summary = shorten_text(text, 1024, "summarize_summary")
    print(short_summary)
    predictions = [short_summary]
    scores(predictions=predictions)

    # medium_summary = summarizer(text, num_beams=num_beams, min_length=100, max_length=150, do_sample=False)
    # large_summary = summarizer(text, num_beams=num_beams, min_length=150, max_length=200, do_sample=False)
    # print(medium_summary)
    # print(large_summary)
    # print(results)
def main():
    input_file = "./text1.txt"
    print(input_file)
    num_beams = 5
    summarize_text(input_file, num_beams)


if __name__ == '__main__':
    main()
Traceback (most recent call last):
File "C:\Users\ellyt\Desktop\llms\LLM-Research\facebookBart.py", line 207, in
Fixed the issue with loading.
The text splitter is working well; however, the parsed input is not entering the models correctly.
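One possible cause (an assumption, not confirmed from the truncated traceback) is that shorten_text clips by characters while facebook/bart-large-cnn accepts at most 1024 tokens, and that num_beams was being passed positionally instead of as a keyword. A minimal sketch of a token-aware truncation step, where truncate_to_model_limit is a hypothetical helper that is not part of the script above:

def truncate_to_model_limit(text, tokenizer, max_tokens=1024):
    # Truncate by tokens rather than characters so the input fits the model's window
    ids = tokenizer(text, truncation=True, max_length=max_tokens)["input_ids"]
    return tokenizer.decode(ids, skip_special_tokens=True)

clipped = truncate_to_model_limit(text, tokenizer)
# Pass num_beams as a keyword so the pipeline forwards it to generate()
summary = summarizer(clipped, num_beams=5, min_length=50, max_length=100, do_sample=False)
print(summary[0]["summary_text"])

Re-tokenizing the decoded string can shift the count by a few tokens, so leaving a small margin below 1024 is safer.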