brunoamaral / gregory-ai

Artificial Intelligence and Machine Learning to help find scientific research and filter relevant content
https://gregory-ai.com/
Other
47 stars 7 forks source link

clean up the abstracts from html and other strange tags #255

Closed brunoamaral closed 2 years ago

brunoamaral commented 2 years ago

This was resolved outside of git versioning. Keeping my notes here in case they are useful in the future.

from crossref.restful import Works, Etiquette
from gregory.models import Articles
from bs4 import BeautifulSoup
import html
import os

# Crossref "etiquette": identify ourselves politely to the REST API so that
# requests are routed to the polite pool.
CLIENT_EMAIL = "bruno@gregory-ms.com"

# os.environ.get returns None when DOMAIN_NAME is unset, which would make the
# string concatenation below raise an opaque TypeError — fail fast with a
# clear message instead.
_domain_name = os.environ.get('DOMAIN_NAME')
if not _domain_name:
    raise RuntimeError('DOMAIN_NAME environment variable is not set')
CLIENT_WEBSITE = 'https://' + _domain_name + '/'

my_etiquette = Etiquette('Gregory MS', 'v8', CLIENT_WEBSITE, CLIENT_EMAIL)
works = Works(etiquette=my_etiquette)

# Articles whose summary still starts with a raw HTML tag and that carry a DOI
# we can use to query Crossref.
articles = Articles.objects.filter(summary__startswith='<', doi__isnull=False)

## GET ABSTRACTS FROM CROSSREF
### tested locally with about 4928 articles; abstracts were missing for 2873 of them
## from these, only 9 didn't belong to pubmed

# For each candidate article, look up its DOI on Crossref and, when an
# abstract is present in the record, overwrite the stored summary with it.
for article in articles:
    w = works.doi(article.doi)
    # works.doi() may return None (unknown DOI) or a record without an
    # 'abstract' key; treat both as "no abstract found" and move on.
    try:
        abstract = w['abstract']
    except (TypeError, KeyError):
        print(article.article_id, 'nope')
        continue
    print(article.article_id, abstract)
    try:
        article.summary = abstract
        article.save()
    except Exception as e:
        # Log and keep going so one bad record doesn't abort the whole batch.
        print('error', article.article_id, e)

## collect articles that have weird characters and link starts with pubmed:
articles = (
    Articles.objects
    .filter(
        summary__startswith='<',
        doi__isnull=False,
        link__startswith='https://pubmed.ncbi.nlm.nih.gov',
    )
    .order_by('-article_id')
)

# Function to remove tags, losing structure
# def remove_tags(html):

#       # parse html content
#       soup = BeautifulSoup(html, "html.parser")

#       for data in soup(['style', 'script']):
#               # Remove tags
#               data.decompose()

#       # return data by retrieving the tag content
#       return ' '.join(soup.stripped_strings)

## remove the inline styles keeping the tags
# def greg_remove_style(html_abstract):
#   soup = BeautifulSoup(html_abstract,'html.parser')
#   for tag in soup():
#       for attribute in ["class", "id", "name", "style"]:
#               del tag[attribute]
#   return soup

# Strip presentational attributes (class/id/name/style) from each remaining
# pubmed summary while keeping the structural HTML tags themselves.
for article in articles:
    print(article.article_id)
    # Undo HTML entity escaping (e.g. &lt;p&gt; -> <p>) before parsing.
    abstract = html.unescape(article.summary)
    print(abstract)
    soup = BeautifulSoup(abstract, 'html.parser')
    # Tag.__delitem__ is a no-op for attributes a tag doesn't have,
    # so no existence check is needed.
    for tag in soup():
        for attribute in ("class", "id", "name", "style"):
            del tag[attribute]
    try:
        article.summary = str(soup)
        article.save()
    except Exception as e:
        # Log and continue; one failing save shouldn't stop the batch.
        print(article.article_id, e)