This is the code in the feedreader.py file that processes the data:
def update_articles_from_feeds(self):
    """Import science papers from every matching RSS source into ``Articles``.

    Iterates over all ``Sources`` with ``method='rss'`` and
    ``source_for='science paper'``, fetches each feed and, for every entry:

    * picks the best available summary (``summary`` < ``summary_detail`` <
      PubMed ``content``),
    * extracts a DOI for PubMed (``dc_identifier``) and FASEB (``prism_doi``)
      feeds,
    * when a DOI is available, refreshes a ``SciencePaper`` for it and prefers
      its title/abstract over the feed's, then links the authors it reports,
    * creates the article or updates the existing one, and links it to the
      source and to the source's team and subject.

    Entries without any publication date are skipped with a message instead
    of aborting the whole run.
    """
    sources = Sources.objects.filter(method='rss', source_for='science paper')
    for source in sources:
        feed = self.fetch_feed(source.link, source.ignore_ssl)
        # The feed URL does not change per entry, so test it once per source.
        is_pubmed = 'pubmed' in source.link
        is_faseb = 'faseb' in source.link

        for entry in feed['entries']:
            title = entry['title']
            self.stdout.write(f"Processing {title}")

            summary = entry.get('summary', '')
            if hasattr(entry, 'summary_detail'):
                summary = entry['summary_detail']['value']
            if is_pubmed and hasattr(entry, 'content'):
                summary = entry['content'][0]['value']

            raw_date = entry.get('published') or entry.get('prism_coverdate')
            if not raw_date:
                # parse(None) would raise and stop the processing of every
                # remaining entry and feed.
                self.stdout.write(f"No publication date for {title}, skipping this entry.")
                continue
            published_date = parse(raw_date, tzinfos=self.tzinfos).astimezone(pytz.utc)
            link = greg.remove_utm(entry['link'])

            doi = None
            if is_pubmed and entry.get('dc_identifier', '').startswith('doi:'):
                # Drop only the leading "doi:" marker.
                doi = entry['dc_identifier'][len('doi:'):]
            elif is_faseb:
                doi = entry.get('prism_doi', '')

            if doi:
                crossref_paper = SciencePaper(doi=doi)
                crossref_paper.refresh()
                # Prefer Crossref metadata, but keep what was already taken
                # from the feed (including summary_detail / PubMed content)
                # when Crossref has nothing to offer.
                title = crossref_paper.title or title
                summary = crossref_paper.abstract or summary

                # Check if an article with the same DOI or title exists
                science_paper = self._upsert_article(
                    source,
                    Q(doi=doi) | Q(title=title),
                    title,
                    summary,
                    link,
                    published_date,
                    {
                        'doi': doi,
                        'container_title': crossref_paper.journal,
                        'publisher': crossref_paper.publisher,
                        'access': crossref_paper.access,
                        'crossref_check': timezone.now(),
                    },
                )
                self._link_crossref_authors(science_paper, crossref_paper)
            else:
                self.stdout.write('no DOI, trying to create article')
                self._upsert_article(
                    source,
                    Q(title=title),
                    title,
                    summary,
                    link,
                    published_date,
                    {
                        # NOTE(review): the DOI branch only uses the
                        # many-to-many ``sources``; confirm that ``Articles``
                        # still accepts a ``source`` keyword.
                        'source': source,
                        'crossref_check': None,
                    },
                )


def _upsert_article(self, source, lookup, title, summary, link, published_date, extra_fields):
    """Create the article matching ``lookup`` or update the existing one.

    Args:
        source: The ``Sources`` row the entry came from; the article is linked
            to it and to its ``team`` and ``subject``.
        lookup: ``Q`` expression used to find an already stored article.
        title, summary, link, published_date: Values taken from the feed
            entry (or Crossref) for this article.
        extra_fields: Additional keyword arguments passed to
            ``Articles.objects.create`` only when a new article is created.

    Returns:
        The created or already existing ``Articles`` instance.
    """
    science_paper = Articles.objects.filter(lookup).first()
    if science_paper is None:
        science_paper = Articles.objects.create(
            title=title,
            summary=summary,
            link=link,
            published_date=published_date,
            **extra_fields,
        )
        needs_save = True
    else:
        cleaned_summary = SciencePaper.clean_abstract(abstract=summary)
        needs_save = (
            science_paper.title != title
            or science_paper.summary != cleaned_summary
            or science_paper.link != link
            or science_paper.published_date != published_date
        )
        if needs_save:
            science_paper.title = title
            science_paper.summary = cleaned_summary
            science_paper.link = link
            science_paper.published_date = published_date

    # Link unconditionally: a new article must be attached to its source, and
    # an article already known from another feed must also be attached to
    # this one. Adding an existing relation again is harmless.
    science_paper.teams.add(source.team)
    science_paper.subjects.add(source.subject)
    science_paper.sources.add(source)
    if needs_save:
        science_paper.save()
    return science_paper


def _link_crossref_authors(self, science_paper, crossref_paper):
    """Attach the authors reported by ``crossref_paper`` to ``science_paper``.

    Authors are looked up (or created) by ORCID when one is present, otherwise
    by given and family name. Authors that have neither an ORCID nor a full
    name are skipped with a message.
    """
    if crossref_paper is None or crossref_paper.authors is None:
        return

    for author_info in crossref_paper.authors:
        given_name = author_info.get('given')
        family_name = author_info.get('family')
        orcid = author_info.get('ORCID', None)

        if orcid:
            # If ORCID is present, use it as the primary key for author lookup/creation
            lookup = {'ORCID': orcid}
            defaults = {'given_name': given_name, 'family_name': family_name}
        elif given_name and family_name:
            # No ORCID: fall back to the name pair. ``orcid`` is None (or an
            # empty value) here and is stored as such on creation.
            lookup = {'given_name': given_name, 'family_name': family_name}
            defaults = {'ORCID': orcid}
        else:
            self.stdout.write(f"Missing given name or family name, skipping this author. {crossref_paper.doi}")
            continue

        try:
            author_obj, _ = Authors.objects.get_or_create(defaults=defaults, **lookup)
        except MultipleObjectsReturned:
            # Re-run the very lookup that was ambiguous, so the candidates are
            # the rows that actually caused the error.
            candidates = Authors.objects.filter(**lookup)
            self.stdout.write(f"Multiple authors found for {given_name} {family_name}:")
            for candidate in candidates:
                self.stdout.write(f"Author ID: {candidate.author_id}, ORCID: {candidate.ORCID}")
            # Use the first author with an ORCID, if available
            author_obj = next(
                (candidate for candidate in candidates if candidate.ORCID),
                candidates.first(),
            )

        if author_obj is None:
            # Nothing usable was found (e.g. rows removed in the meantime).
            continue

        # Link the author to the article if not already linked
        if not science_paper.authors.filter(pk=author_obj.pk).exists():
            science_paper.authors.add(author_obj)
RSS feed where the problem was found: https://pubmed.ncbi.nlm.nih.gov/rss/search/10guX6I3SqrbUeeLKSTD6FCRM44ewnrN2MKKTQLLPMHB4xNsZU/?limit=15&utm_campaign=pubmed-2&fc=20210216052009
This is the code in the feedreader.py file that processes the data: