petermr / docanalysis

Semantic analysis of text documents including sentence and paragraph splitting
Apache License 2.0

Gensim method is not working in keyword.py while doing keyword extraction from literature with docanalysis #40

Open flower1430 opened 7 months ago

flower1430 commented 7 months ago

In the Colab notebook, the following command runs keyword.py to extract keywords from the literature (rake and yake work, while gensim does not):

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'rake'
```
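The same invocation with `--method 'gensim'` is what fails:

```
!python /content/semanticClimate/keyword_extraction/code/keyword.py \
  --html_path /content/remote_agri/eupmc_result.html \
  --saving_path /content/ \
  --method 'gensim'
```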

keyword.py


"""Untitled56.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1aCn-dAMP2zpfjFRA2b_Ts5G58rJVbWKr
"""

from bs4 import BeautifulSoup
from keybert import KeyBERT
from multi_rake import Rake
from summa import keywords
import yake
from IPython.display import HTML
import pandas as pd
import requests
import os
import argparse
import spacy
nlp = spacy.load("en_core_web_lg")

class keyword_extraction():
  def __init__(self,html_path, saving_path, method):
    self.html_path = html_path
    self.saving_path = saving_path
    self.method = method
    self.text = ''
    self.span_list = []

  def extract_span_list(self):
    with open(self.html_path, 'r') as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")
      with open('/content/html_ex.html', 'w', encoding="utf-8") as file:
        file.write(soup.prettify())
      # kill all script and style elements

      soup_elem = soup.find_all("span")
      for span_elem in soup_elem:
        #print(span_elem)
        span_elem.extract() 
        span_text = span_elem.get_text().strip()
        lines = (line.strip() for line in span_text.splitlines())
        # break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  ") if len(phrase.strip())>9 )
        # drop blank lines
        #text_write = '\n'.join(chunk for chunk in chunks if chunk)
        span_text = ' '.join(chunk for chunk in chunks if chunk)
        if len(span_text)>9 and 'http' not in span_text and 'doi' not in span_text and 'Chapter' not in span_text:
          # print(span_text)
          # print('-'*50)
          self.span_list.append(span_text)
    return self.span_list      
  #def tf_idf(self):

  def clean(self,df):
      def tagger(x):
         return nlp(x)[0].pos_

      def lemma(x):
        #print(nlp(x)[0].lemma_)  
        return nlp(x)[0].lemma_ 

      df['POS']= df['keyword/phrase'].apply(lambda x: tagger(x))
      df['Lemma']= df['keyword/phrase'].apply(lambda x: lemma(x))
      df= df[df['keyword/phrase'] == df['Lemma'] ]
      df = df.drop_duplicates(subset=['score'], keep='last')
      df= df[df.POS.isin(['NOUN', 'PROPN', 'ADJ', 'ADV'])]
      df= df[~df['keyword/phrase'].apply(lambda x: lemma(x)).isin(['http','https', 'publication','Chapter'])]
      df = df.drop(columns = ['Lemma'], axis = 0)
      return df

  def extract_text_fom_html(self):

    with open(self.html_path, 'r', encoding="utf-8") as f:
      html = f.read()
      soup = BeautifulSoup(html, features="html.parser")

      for script in soup(["script", "style"]):
          script.extract()    # rip it out

      # get text
      text = soup.get_text()
      #print(text)
      # break into lines and remove leading and trailing space on each
      lines = (line.strip() for line in text.splitlines())
      # break multi-headlines into a line each
      chunks = (phrase.strip() for line in lines for phrase in line.split("      ") if len(phrase.strip())>9 )
      # drop blank lines
      #text_write = '\n'.join(chunk for chunk in chunks if chunk)
      text = '\n '.join(chunk for chunk in chunks if chunk)
      self.text = text
      #print(text)
      # TEXT_ = f'Chapter06_text.txt'
      # saving_path = '/content/'     
      with open('text.txt', 'w', encoding="utf-8") as file:
          file.write(text)
      return self.text
  def extract_keywords_rake(self):
    rake = Rake()
    self.extract_text_fom_html()
    keywords_Rake = rake.apply(self.text)
    df_Rake =pd.DataFrame(keywords_Rake)
    df_Rake.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True)
    df_Rake = self.clean(df_Rake)
    df_Rake.to_csv(self.saving_path +'Rake_keywords.csv',index=None)

  def extract_keywords_gensim(self):
    self.extract_text_fom_html()
    keywords_gensim= keywords(self.text,words = 100,scores = True, pos_filter =('NN','ADJ'),lemmatize = False, deacc =False) # run over all parameters 
    df_gensim =pd.DataFrame(keywords_gensim)
    df_gensim.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True)
    df_gensim = self.clean(df_gensim)
    df_gensim.to_csv(self.saving_path +'gensim_keywords.csv',index=None)  

  def extract_keywords_yake(self):
    self.extract_text_fom_html()
    kw_extractor = yake.KeywordExtractor(top=100, stopwords=None)
    keywords_yake = kw_extractor.extract_keywords(self.text)
    df_yake =pd.DataFrame(keywords_yake)
    df_yake.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True)
    df_yake = self.clean(df_yake)
    df_yake.to_csv(self.saving_path +'yake_keywords.csv',index=None) 
    # for kw, v in keywords_yake:
    #   print("Keyphrase: ",kw, ": score", v)  

  def extract_keywords_textrank(self):
    self.extract_text_fom_html()
    keywords_textrank = keywords.keywords(self.text, scores=True)
    df_textrank = pd.DataFrame(keywords_textrank)
    df_textrank.rename(columns = {0:'keyword/phrase',1:'score'}, inplace = True)
    df_textrank = self.clean(df_textrank)
    df_textrank.to_csv(self.saving_path +'textrank_keywords.csv',index=None)    

  def extract_keywords_keyBERT(self):
    kw_model = KeyBERT(model='all-mpnet-base-v2')
    keywords_keyBERT = kw_model.extract_keywords(self.text, 
                                     keyphrase_ngram_range=(1, 2), 
                                     stop_words='english', 
                                     highlight=True,
                                     top_n=10)  

  def main(self):
    if method == 'rake':
      self.extract_keywords_rake()
    elif method == 'yake':  
      self.extract_keywords_yake()
    elif method == 'gensim':  
      self.extract_keywords_gensim()
    elif method == 'textrank':  
      self.extract_keywords_textrank() 
    elif method == 'keyBERT':  
      self.extract_keywords_keyBERT() 

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--html_path',
                      required=True,
                      help='give the path where your html lives: /...')
    parser.add_argument('--saving_path',
                      required=True,
                      help='path of the folder where you want to save the files : /...'
                      )
    parser.add_argument('--method',
                      required=True,  choices=['rake','yake','gensim','keyBERT','textrank'],
                      help='which method you want to use to extract keywords /...')

    args = parser.parse_args()

    html_path = args.html_path #'/content/semanticClimate/ipcc/ar6/wg3/Chapter06/fulltext.flow.html'
    saving_path = args.saving_path  #'/content/'
    method = args.method

    keyword_extractions = keyword_extraction(html_path,saving_path,method)
    keyword_extractions.main()
```
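The traceback is not shown above, but a likely cause is that `from summa import keywords` binds the summa `keywords` module (which is why the textrank branch calls `keywords.keywords(...)`), so `extract_keywords_gensim` calls a module object and passes it parameters (`pos_filter`, `lemmatize`, `deacc`) that belong to the old `gensim.summarization.keywords`, which was removed in gensim 4.x. A minimal sketch of a possible workaround, assuming summa is intended as the stand-in for gensim; the standalone helper `extract_keywords_summa` is illustrative and not part of keyword.py:

```python
import pandas as pd
from summa import keywords as summa_keywords  # summa's TextRank keyword module


def extract_keywords_summa(text, saving_path):
    """Sketch of a summa-based replacement for extract_keywords_gensim.

    summa's keywords() does not take pos_filter/lemmatize/deacc,
    so only words= and scores= are passed here.
    """
    kw_pairs = summa_keywords.keywords(text, words=100, scores=True)
    df = pd.DataFrame(kw_pairs, columns=['keyword/phrase', 'score'])
    df.to_csv(saving_path + 'gensim_keywords.csv', index=False)
    return df
```

The other route would be to pin gensim below 4.0 and import `keywords` from `gensim.summarization`, which is where the `pos_filter`/`lemmatize` arguments in the current call come from.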
petermr commented 7 months ago

Procedural points:

Giving a simple, runnable example is highly desirable.
