brunoamaral / gregory

Artificial Intelligence and Machine Learning to help find scientific research and filter relevant content
https://gregory-ai.com/
Other
44 stars 6 forks source link

pubmed articles return a truncated abstract #413

Open brunoamaral opened 4 weeks ago

brunoamaral commented 4 weeks ago

RSS feed where the problem was found: https://pubmed.ncbi.nlm.nih.gov/rss/search/10guX6I3SqrbUeeLKSTD6FCRM44ewnrN2MKKTQLLPMHB4xNsZU/?limit=15&utm_campaign=pubmed-2&fc=20210216052009

This is the code in feedreader.py, the file that processes the feed data:

        def update_articles_from_feeds(self):
            """Create or update ``Articles`` from every science-paper RSS source.

            For each entry of each ``Sources`` row with ``method='rss'`` and
            ``source_for='science paper'``:

            * choose a summary from the feed: ``summary``, overridden by
              ``summary_detail`` when present, overridden by the first
              ``content`` item when the source link contains ``'pubmed'``;
            * read a DOI from ``dc_identifier`` (pubmed links) or ``prism_doi``
              (faseb links);
            * with a DOI: refresh a ``SciencePaper`` for it, prefer its title and
              abstract when it has them, create or update the article matched
              by DOI or title, then link the paper's authors;
            * without a DOI: create or update the article matched by title.

            Returns nothing; results are written to the database and progress
            is reported through ``self.stdout`` and ``print``.
            """
            sources = Sources.objects.filter(method='rss', source_for='science paper')
            for source in sources:
                is_pubmed = 'pubmed' in source.link
                feed = self.fetch_feed(source.link, source.ignore_ssl)
                for entry in feed['entries']:
                    title = entry['title']
                    self.stdout.write(f"Processing {title}")

                    # Later assignments win: each one is a fuller version of
                    # the abstract than the plain `summary` field.
                    summary = entry.get('summary', '')
                    if hasattr(entry, 'summary_detail'):
                        summary = entry['summary_detail']['value']
                    if is_pubmed and hasattr(entry, 'content'):
                        summary = entry['content'][0]['value']

                    # NOTE(review): an entry with neither `published` nor
                    # `prism_coverdate` passes None to parse() - confirm whether
                    # such entries should be skipped instead.
                    published_date = parse(
                        entry.get('published') or entry.get('prism_coverdate'),
                        tzinfos=self.tzinfos,
                    ).astimezone(pytz.utc)
                    link = greg.remove_utm(entry['link'])

                    doi = None
                    if is_pubmed and entry.get('dc_identifier', '').startswith('doi:'):
                        doi = entry['dc_identifier'].replace('doi:', '')
                    elif 'faseb' in source.link:
                        doi = entry.get('prism_doi', '')

                    if doi:
                        crossref_paper = SciencePaper(doi=doi)
                        crossref_paper.refresh()
                        if crossref_paper.title:
                            title = crossref_paper.title
                        # Keep the summary selected from the feed above when the
                        # refreshed paper has no abstract. Falling back to
                        # entry.get('summary') here threw away the fuller
                        # `content` / `summary_detail` text and stored a
                        # truncated abstract.
                        if crossref_paper.abstract:
                            summary = crossref_paper.abstract

                        # Check if an article with the same DOI or title exists
                        science_paper = Articles.objects.filter(Q(doi=doi) | Q(title=title)).first()
                        if not science_paper:
                            science_paper = Articles.objects.create(
                                doi=doi,
                                title=title,
                                summary=summary,
                                link=link,
                                published_date=published_date,
                                container_title=crossref_paper.journal,
                                publisher=crossref_paper.publisher,
                                access=crossref_paper.access,
                                crossref_check=timezone.now(),
                            )
                            science_paper.teams.add(source.team)
                            science_paper.subjects.add(source.subject)
                            science_paper.sources.add(source)
                            science_paper.save()
                        else:
                            clean_summary = SciencePaper.clean_abstract(abstract=summary)
                            changed = (
                                science_paper.title != title
                                or science_paper.summary != clean_summary
                                or science_paper.link != link
                                or science_paper.published_date != published_date
                            )
                            if changed:
                                science_paper.title = title
                                science_paper.summary = clean_summary
                                science_paper.link = link
                                science_paper.published_date = published_date
                                science_paper.sources.add(source)
                                science_paper.teams.add(source.team)
                                science_paper.subjects.add(source.subject)
                                science_paper.save()

                        # Process author information
                        for author_info in crossref_paper.authors or []:
                            given_name = author_info.get('given')
                            family_name = author_info.get('family')
                            orcid = author_info.get('ORCID', None)
                            try:
                                if orcid:
                                    # An ORCID, when present, is the lookup key;
                                    # names only fill in a newly created row.
                                    author_obj, _ = Authors.objects.get_or_create(
                                        ORCID=orcid,
                                        defaults={
                                            'given_name': given_name,
                                            'family_name': family_name,
                                        },
                                    )
                                elif not given_name or not family_name:
                                    self.stdout.write(f"Missing given name or family name, skipping this author. {crossref_paper.doi}")
                                    continue
                                else:
                                    # No ORCID: look up by full name. `orcid` is
                                    # falsy here (None when the key is absent).
                                    author_obj, _ = Authors.objects.get_or_create(
                                        given_name=given_name,
                                        family_name=family_name,
                                        defaults={'ORCID': orcid},
                                    )
                            except MultipleObjectsReturned:
                                # Handle the case where multiple authors are returned
                                authors = Authors.objects.filter(given_name=given_name, family_name=family_name)
                                print(f"Multiple authors found for {given_name} {family_name}:")
                                for author in authors:
                                    print(f"Author ID: {author.author_id}, ORCID: {author.ORCID}")
                                # Use the first author with an ORCID, if available
                                author_obj = next((author for author in authors if author.ORCID), authors.first())
                                if author_obj is None:
                                    # Happens when the ORCID lookup was the
                                    # ambiguous one and nobody matches by name;
                                    # without this guard `author_obj.pk` below
                                    # raises AttributeError.
                                    self.stdout.write(f"No author matches {given_name} {family_name}, skipping this author. {crossref_paper.doi}")
                                    continue

                            # Link the author to the article if not already linked
                            if not science_paper.authors.filter(pk=author_obj.pk).exists():
                                science_paper.authors.add(author_obj)
                    else:
                        print('no DOI, trying to create article')
                        science_paper = Articles.objects.filter(title=title).first()
                        if not science_paper:
                            # NOTE(review): this branch passes `source=` while
                            # the DOI branch uses `sources.add(...)`, and a
                            # newly created article gets no team/subject/source
                            # links here - confirm against the Articles model.
                            science_paper = Articles.objects.create(
                                title=title,
                                summary=summary,
                                link=link,
                                published_date=published_date,
                                source=source,
                                crossref_check=None,
                            )
                        else:
                            clean_summary = SciencePaper.clean_abstract(abstract=summary)
                            changed = (
                                science_paper.title != title
                                or science_paper.summary != clean_summary
                                or science_paper.link != link
                                or science_paper.published_date != published_date
                            )
                            if changed:
                                science_paper.title = title
                                science_paper.summary = clean_summary
                                science_paper.link = link
                                science_paper.published_date = published_date
                                science_paper.teams.add(source.team)
                                science_paper.subjects.add(source.subject)
                                science_paper.sources.add(source)
                                science_paper.save()