Open · flower1430 opened this issue 7 months ago
Procedural points:
In the Colab notebook, the following code is used to extract keywords from the literature (rake and yake are working, while gensim is not).
"it isn't working" is much too imprecise:
docanalysis
or is it yours?Giving a simple runnable example is highly desirable:
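For instance, a minimal, self-contained reproduction (a hypothetical sketch, assuming only that summa is installed, as the script quoted below imports it) would run the keyword extractor on a short in-memory string so that the exact error or traceback can be posted:

```python
# Hypothetical minimal reproduction: exercise the keyword extractor on a short
# string, independent of the HTML pipeline, and report the full output/traceback.
from summa import keywords  # same import as in keyword.py

text = (
    "Climate change mitigation requires rapid reductions in greenhouse gas emissions. "
    "Agriculture, energy and land use all contribute to these emissions."
)

# summa exposes the extractor as a function inside the keywords module
print(keywords.keywords(text, scores=True))
```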
On Wed, Mar 27, 2024 at 7:21 AM flower1430 wrote:
In the Colab notebook, the following code is used to extract keywords from the literature (rake and yake are working, while gensim is not):
!python /content/semanticClimate/keyword_extraction/code/keyword.py --html_path /content/remote_agri/eupmc_result.html --saving_path /content/ --method 'rake'
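Presumably the failing case is the same command with the gensim method selected, e.g. (hypothetical, same paths as above, outcome unverified):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py --html_path /content/remote_agri/eupmc_result.html --saving_path /content/ --method 'gensim'
```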
keyword.py
"""Untitled56.ipynb
Automatically generated by Colaboratory.
Original file is located at https://colab.research.google.com/drive/1aCn-dAMP2zpfjFRA2b_Ts5G58rJVbWKr """
from bs4 import BeautifulSoup from keybert import KeyBERT from multi_rake import Rake from summa import keywords import yake from IPython.display import HTML import pandas as pd import requests import os import argparse import spacy nlp = spacy.load("en_core_web_lg")
class keyword_extraction(): def init(self,html_path, saving_path, method): self.html_path = html_path self.saving_path = saving_path self.method = method self.text = '' self.span_list = []
def extract_span_list(self): with open(self.html_path, 'r') as f: html = f.read() soup = BeautifulSoup(html, features="html.parser") with open('/content/html_ex.html','w', encoding="utf-8")as file: file.write(soup.prettify())
kill all script and style elements
soup_elem = soup.find_all("span") for span_elem in soup_elem: #print(span_elem) span_elem.extract() span_text = span_elem.get_text().strip() lines = (line.strip() for line in span_text.splitlines()) # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ") if len(phrase.strip())>9 ) # drop blank lines #text_write = '\n'.join(chunk for chunk in chunks if chunk) span_text = ' '.join(chunk for chunk in chunks if chunk) if len(span_text)>9 and 'http' not in span_text and 'doi' not in span_text and 'Chapter' not in span_text: # print(span_text) # print('-'*50) self.span_list.append(span_text) return self.span_list
def tf_idf(self):
def clean(self,df): def tagger(x): return nlp(x)[0].pos_
def lemma(x): #print(nlp(x)[0].lemma_) return nlp(x)[0].lemma_ df['POS']= df['keyword/phrase'].apply(lambda x: tagger(x)) df['Lemma']= df['keyword/phrase'].apply(lambda x: lemma(x)) df= df[df['keyword/phrase'] == df['Lemma'] ] df = df.drop_duplicates(subset=['score'], keep='last') df= df[df.POS.isin(['NOUN', 'PROPN', 'ADJ', 'ADV'])] df= df[~df['keyword/phrase'].apply(lambda x: lemma(x)).isin(['http','https', 'publication','Chapter'])] df = df.drop(columns = ['Lemma'], axis = 0) return df
def extract_text_fom_html(self):
with open(self.html_path, 'r', encoding="utf-8") as f: html = f.read() soup = BeautifulSoup(html, features="html.parser") for script in soup(["script", "style"]): script.extract() # rip it out # get text text = soup.get_text() #print(text) # break into lines and remove leading and trailing space on each lines = (line.strip() for line in text.splitlines()) # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ") if len(phrase.strip())>9 ) # drop blank lines #text_write = '\n'.join(chunk for chunk in chunks if chunk) text = '\n '.join(chunk for chunk in chunks if chunk) self.text = text #print(text) # TEXT_ = f'Chapter06_text.txt' # saving_path = '/content/' with open('text.txt', 'w', encoding="utf-8") as file: file.write(text) return self.text
def extract_keywords_rake(self): rake = Rake() self.extract_text_fom_html() keywords_Rake = rake.apply(self.text) df_Rake =pd.DataFrame(keywords_Rake) df_Rake.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True) df_Rake = self.clean(df_Rake) df_Rake.to_csv(self.saving_path +'Rake_keywords.csv',index=None)
def extract_keywords_gensim(self): self.extract_text_fom_html() keywords_gensim= keywords(self.text,words = 100,scores = True, pos_filter =('NN','ADJ'),lemmatize = False, deacc =False) # run over all parameters df_gensim =pd.DataFrame(keywords_gensim) df_gensim.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True) df_gensim = self.clean(df_gensim) df_gensim.to_csv(self.saving_path +'gensim_keywords.csv',index=None)
def extract_keywords_yake(self): self.extract_text_fom_html() kw_extractor = yake.KeywordExtractor(top=100, stopwords=None) keywords_yake = kw_extractor.extract_keywords(self.text) df_yake =pd.DataFrame(keywords_yake) df_yake.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True) df_yake = self.clean(df_yake) df_yake.to_csv(self.saving_path +'yake_keywords.csv',index=None)
for kw, v in keywords_yake:
# print("Keyphrase: ",kw, ": score", v)
def extract_keywords_textrank(self): self.extract_text_fom_html() keywords_textrank = keywords.keywords(self.text, scores=True) df_textrank = pd.DataFrame(keywords_textrank) df_textrank.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True) df_textrank = self.clean(df_textrank) df_textrank.to_csv(self.saving_path +'textrank_keywords.csv',index=None)
def extract_keywords_keyBERT(self): kw_model = KeyBERT(model='all-mpnet-base-v2') keywords_keyBERT = kw_model.extract_keywords(self.text, keyphrase_ngram_range=(1, 2), stop_words='english', highlight=True, top_n=10)
def main(self): if method == 'rake': self.extract_keywords_rake() elif method == 'yake': self.extract_keywords_yake() elif method == 'gensim': self.extract_keywords_gensim() elif method == 'textrank': self.extract_keywords_textrank() elif method == 'keyBERT': self.extract_keywords_keyBERT()
if name == "main": parser = argparse.ArgumentParser() parser.add_argument('--html_path', required=True, help='give the path where your html lives: /...') parser.add_argument('--saving_path', required=True, help='path of the folder where you want to save the files : /...' ) parser.add_argument('--method', required=True, choices=['rake','yake','gensim','keyBERT','textrank'], help='which method you want to us to extact keywords /...')
args = parser.parse_args() html_path = args.html_path #'/content/semanticClimate/ipcc/ar6/wg3/Chapter06/fulltext.flow.html' saving_path = args.saving_path #'/content/' method = args.method keyword_extractions = keyword_extraction(html_path,saving_path,method) keyword_extractions.main()```
--
Peter Murray-Rust
Founder, ContentMine.org, and Reader Emeritus in Molecular Informatics
Dept. of Chemistry, University of Cambridge, CB2 1EW, UK
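A note on the reported gensim failure (a guess from reading the quoted script, not a confirmed diagnosis): `from summa import keywords` binds the summa keywords module, not a function, so the call `keywords(self.text, words=100, scores=True, pos_filter=('NN','ADJ'), lemmatize=False, deacc=False)` in `extract_keywords_gensim` would raise `TypeError: 'module' object is not callable`; the `pos_filter`/`lemmatize`/`deacc` arguments belong to the old `gensim.summarization.keywords`, which was removed in gensim 4.0. A possible adjustment, sketched and untested (the helper name `extract_keywords_summa` is hypothetical):

```python
# Untested sketch: call summa's keyword function explicitly and drop the
# gensim-only arguments (pos_filter/lemmatize/deacc are not part of summa's API).
import pandas as pd
from summa import keywords as summa_keywords

def extract_keywords_summa(text, saving_path):
    # summa returns (keyword, score) pairs when scores=True
    pairs = summa_keywords.keywords(text, words=100, scores=True)
    df = pd.DataFrame(pairs, columns=['keyword/phrase', 'score'])
    df.to_csv(saving_path + 'gensim_keywords.csv', index=False)
```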